In [1]:
# A Pandas Series can be queried either by the index position or the index label. If you don't give an index to the series
# when quering the position and the label are effectively the same value
# To query by numeric location, starting at zero, use the iloc attribute. To query by the index label,
# we can use loc attribute.

# Example
import pandas as pd
students_classes = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'}
s = pd.Series(students_classes)
s


Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [2]:
# For this Series, if we want to see the fourth entry we would use the 'iloc' attribute with the parameter 3
s.iloc[3]

'History'

In [3]:
# If we want to see what class MOlly has, we would use 'loc' attribute with a parameter of Molly
s.loc['Molly']

'English'

In [4]:
# 'iloc' and 'loc' are not method, they are attributes. So, we don't use parentheses to query them but squared brackets instead,
# which is called the indexing operator.
# In Python this calls get or set for an item depending on the context of its use.

In [5]:
# Pandas tries to make our code a nit more readable and provides a sort  of smart syntax using the indexing operator directly
# on the  series itself. For instance, if we pass in an integer parameter the operator will bejave as if we want it to query 
# via the iloc attribute.
s[3]

'History'

In [6]:
# If we pass in an object, it will query as if we wanted to use the label based loc attribute
s['Molly']

'English'

In [7]:
# A common task is to want to consider all of the values inside of a series and do some sort of operation. This could be trying 
# to find a certain number or summarizing data or transforming the data in some way.

In [8]:
# A typical programmatic approach to this would be iterate over all the items in the series and invoke the operation.

grades = pd.Series([90, 80, 70, 60])
total = 0
for grade in grades:
    total+=grade
print(total/len(grades))

75.0


In [9]:
# Pandas and the underlying numpy libraries support a method of computation called vectorizatoin.
# Vectorizatoin works with most of the functions in the numpy library including the sum function

In [10]:
# here is how we would really write the code using the numpy sum method.

import numpy as np

# Then we just call np.sum and pass in an iterable item. In this case, our pandas series

total = np.sum(grades)
print(total/len(grades))

75.0


In [11]:
# Both of these methods create the same value

# Lets create a big series of random numbers. This is used to demonstrate techniques with Pandas

numbers = pd.Series(np.random.randint(0,1000,10000))

# Now lets look at the top five items in the series

numbers.head()

0    882
1    192
2    508
3    611
4    839
dtype: int32

In [12]:
# We can verify that length of the series is correct using len function
len(numbers)

10000

In [13]:
# The ipython interpreter has something called magic functions and it begins with a percentage sign.
# By typing this sign and hitting Tab key, we can see a list of the available magic functions.
# The function will run our code a few times to determine, on average, how long it takes
# lets run timeit with our original iterative sode. We can give timeit the number of loops that we would like to run.
# we are actually going to use what's called a cellular magic function, these starts with two percentage signs and wrap the
# code in the current Jupyter cell.

In [14]:
%%timeit -n 100
total = 0
for number in numbers:
    total+=number
    
total/len(numbers)

1.1 ms ± 29.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

77.6 µs ± 16.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# There is significant difference in the speed and demonstrate why one should be aware of parallel computing features and start
# thinking in functional programming terms. 
# Vectorizatoin is the ability for a computer to execute multiple instructions at once and with high performance chips
# especially graphics cards.

In [17]:
# A related feature in pandas and numpy is called broadcasting. With broadcasting, we can apply an operation to every value
# in the series, changing the series. For instancxe, if we wanted to increase every random variable by 2, we could do so quickly
# using the += operator directly on the series object

# lets look at the head of our series
numbers.head()

0    882
1    192
2    508
3    611
4    839
dtype: int32

In [18]:
# And now lets increase everything in the series by 2
numbers+=2
numbers.head()

0    884
1    194
2    510
3    613
4    841
dtype: int32

In [19]:
# The procedual way of doing this would be to iterate through all of items in the series and increase the values directly.
# Pandas does support iterating through a series much like a dictionary, allowing us to unpack values easily.

# We can use the iteritems() function which returns a label and value
for label, value in numbers.iteritems():
    # now for the item which is returned, lets call .at()
     numbers.at[label]
    # and we can chake rhe result as this computaion
numbers.head()

0    884
1    194
2    510
3    613
4    841
dtype: int32

In [20]:
# Lets look at some speed comparisons. First try five loops using the iterative approach

In [None]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2

In [22]:
# now try that using the broadcasting methods

In [23]:
%%timeit -n 10
# We need to recreate a series
s = pd.Series(np.random.randint(0,1000,10000))
# And we just broadcast with +=
s+2

257 µs ± 43.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Not only it is significantly faster, but it's more consice and even easier to read too.

In [25]:
# Here is an example using a Series of few numbers
s = pd.Series([1,2,3])

# We could add some new value, maybe a university course
s.loc['History'] = 102
s

0            1
1            2
2            3
History    102
dtype: int64

In [26]:
# We see that mixed types for data values or index labels are no problem for Pandas. Since, 'History' is not in the original
# list of indices, s.loc['History'] essentially creates a new element in the series.  

In [27]:
# Lets take an example where index values are not uinique and this makes pandas Series a little different conceptually then 
# for instance, a relational database

students_classes = pd.Series({'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'})
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [29]:
# Now create a Series just for some new studeent, which lists all of the cources taken and set the index to name and 
# the data to be the names of courses
matt_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index = ['Matt', 'Matt', 'Matt'])
matt_classes


Matt    Philosophy
Matt          Arts
Matt          Math
dtype: object