General Series Knowledge

In [14]:
import pandas as pd

In [15]:
students = ["Alice", "Jack", 'Molly']
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [16]:
#Series seems to return a cross between a list and a dict where each entry is labelled in increasing order starting from 0

In [17]:
students = ["Alice", "Ben", None]
pd.Series(students)

0    Alice
1      Ben
2     None
dtype: object

In [18]:
students = [10, 11]
pd.Series(students)

0    10
1    11
dtype: int64

In [19]:
students = [10, 11, None]
pd.Series(students)

0    10.0
1    11.0
2     NaN
dtype: float64

In [20]:
#None is converted to NaN when the rest of the dtypes in the series are number-like, and NaN is stored as a float, so it has turned the series of two integers from int64 to float64.

In [21]:
#In pandas, NaN is not the same as None.

In [22]:
import numpy as np
np.nan == None

False

In [23]:
np.nan == np.nan

False

In [24]:
np.isnan(np.nan)

True

In [25]:
# NaN is a numeric value and not the same as None.

In [27]:
#Series can be created directly from dictionary data.
students_scores = {'Pedro': "Math", "Juan":"Science","Ben":"Stats"}
s = pd.Series(students_scores)
s

Pedro       Math
Juan     Science
Ben        Stats
dtype: object

In [None]:
#Here, we can see that the indexes that we saw in the first few series have been turned to strings, corresponding to the key-value pairs found in the original dictionary.

In [28]:
s.index

Index(['Pedro', 'Juan', 'Ben'], dtype='object')

In [29]:
s['Pedro']

'Math'

In [None]:
#We can access these key-value pairs in the same way that we would a dictionary, and if this were converted from a list, you could indx and slice this in the exact same way that you would for list.

In [31]:
students = [("Alice","Red"),("Benicio","Green"),("Nate","Yellow")]
pd.Series(students)

0        (Alice, Red)
1    (Benicio, Green)
2      (Nate, Yellow)
dtype: object

In [None]:
# Here, each tuple is stores as a series object, with the dtype object

In [34]:
s = pd.Series(["Physics","Chemistry","English"], index = ["Alice","Ben", "Pepe"])
s

Alice      Physics
Ben      Chemistry
Pepe       English
dtype: object

In [37]:
# This only works if the indexes and values have the same amount of elements in them.

In [42]:
students_scores = {"Ben":"Chem", "Roy":"Y-Combinator", "Mino":"UPenn Wharton"}
s = pd.Series(students_scores, index = ["Ben", "Roy", "Neel"])
s

Ben             Chem
Roy     Y-Combinator
Neel             NaN
dtype: object

In [None]:
# Since we defined the indexes of the values to be Ben, Roy and Neel, and there is no key corresponding to Neel in the dictionary we fed for the series, pandas assigns it the value of NaN

Querying Series

In [None]:
# A pandas series can be queried by the index position and by the index label. When the index doesnt exist, querying by the position and label is the same.
# To query by numeric location, you can use iloc, which starts the query at 0. To query by label, you can use the loc attribute.

In [45]:
import pandas as pd
students_classes = {"Ben":"Math", "Jon":"Stats", "Pedro": "pandas","Jose":"Am i ever gonna get good at typing fast???"}
s = pd.Series(students_classes)
s

Ben                                            Math
Jon                                           Stats
Pedro                                        pandas
Jose     Am i ever gonna get good at typing fast???
dtype: object

In [46]:
s.iloc[3]


'Am i ever gonna get good at typing fast???'

In [47]:
s.loc["Jose"]

'Am i ever gonna get good at typing fast???'

In [None]:
#As we can see, these two refer to the same value.
# loc and iloc are not methods, they are attributes.
#Also, we dont need to use loc and iloc, if we index with an int in as the index, we will index like iloc, if we have a string, it will work like loc.

In [49]:
class_code = {1003:"Stats",3003: "Data Science", 4004:"Intro to Machine Learning and Artificial Intelligence"}
s = pd.Series(class_code)
s

1003                                                Stats
3003                                         Data Science
4004    Intro to Machine Learning and Artificial Intel...
dtype: object

In [50]:
s[0]

KeyError: 0

In [None]:
# That doesnt work because the keys in the dict are also integers, and so pandas cannot distinguish between it being an iloc search or it trying to pull up the label index, thats why its best practice to use iloc and loc.

Working with Data

In [54]:
grades = pd.Series([91,81,92,75,88])
# We wanna get the average of this dataset.

AttributeError: module 'numpy' has no attribute 'avg'

In [56]:
total = np.sum(grades)
print(total/len(grades))

85.4


In [59]:
numbers = pd.Series(np.random.randint(0, 10000,10000))
numbers

0       7069
1       4068
2       8384
3       8094
4       2433
        ... 
9995    1412
9996    2440
9997    4344
9998    3417
9999    6993
Length: 10000, dtype: int64

In [None]:
#So here we have created a series with 10000 entries that has random numbers ranging from 1 to 10000. Very cool.

In [60]:
numbers.head()

0    7069
1    4068
2    8384
3    8094
4    2433
dtype: int64

In [None]:
# head will return the top five items in the series.


In [61]:
%%timeit -n 100
total =0
for number in numbers:
    total += number
total/ len(numbers)

1.03 ms ± 29.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [63]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

54.6 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Damn thats a lot faster, this is using parallel computing, which makes it insanely faster than the normal non native function way.


In [64]:
numbers.head()

0    7069
1    4068
2    8384
3    8094
4    2433
dtype: int64

In [65]:
numbers += 2
numbers.head()

0    7071
1    4070
2    8386
3    8096
4    2435
dtype: int64

In [None]:
# Like with matrices, we can add scalars to the whole series in one go.

In [66]:
for label, value in numbers.iteritems():
    numbers.set_value(label,value+2)
    
numbers.head()
    

AttributeError: 'Series' object has no attribute 'set_value'

In [None]:
# set_value is dead unfortunately.
# Despite set_value being dead, the whole point of this is the fact that the integrated np and pd functions are always gonna be faster than the ones that you can try to make yourself.

In [None]:
# The .loc attribute lets us edit series data in place, and also add data to our datasets as well, if the value that we pass as an index doesnt exist ,then pandas adds new value. Indices can have mixed types.

In [None]:
# pandas will automatically change the underlying np types as appropriate.

In [67]:
s = pd.Series([1,2,3])
s.loc["Math"] =102
s

0         1
1         2
2         3
Math    102
dtype: int64

In [None]:
# here, pandas added the element in place.

In [68]:
# Lets create a Series where we have repeated index values.

students_classes = pd.Series({"Alice":"Physics", "Ben":"Math","Ben":"Why tf am i so slow at typing am i stupid??"})
students_classes

Alice                                        Physics
Ben      Why tf am i so slow at typing am i stupid??
dtype: object

In [None]:
# Here pandas fully killed Math, it absolutely ignored it, instead of creating a new index for Ben with it showing up right above thge one where im complaining about my typing.