# Week 4 - Numpy and Pandas

In [1]:
import pandas as pd
import numpy as np
import random

## Numpy
* NumPy gives Python fast, homogeneous n-dimensional arrays.
* NumPy arrays are distinct from Python's base `list`

In [2]:
# Heterogeneous `list`s are A-OK
# We can easily mix types with the built-in Python list system
python_list = [True, 0, 'my string', {1:2, 3:4}]

In [3]:
# NumPy arrays most performant when they're homogenous
numpy_arr = np.array([1, 2, 3, 4, 5])
print(numpy_arr)
print(numpy_arr.dtype)

[1 2 3 4 5]
int64


In [4]:
# If we pass more than one type to a NumPy array, everything becomes an 'object'
# :(
# (This is OK, but we use NumPy for fast things, not objects)
numpy_fail = np.array(python_list)
print(numpy_fail.dtype)

object


In [5]:
# NumPy allows you to do operations on entire arrays at once
my_arr = np.arange(100)
print("Just some integers: " + str(my_arr[:10]))

my_arr += 10
print("Integers plus 10: " + str(my_arr[:10]))

my_arr %= 3
print("Integers plus 10 mod 3: " + str(my_arr[:10]))

Just some integers: [0 1 2 3 4 5 6 7 8 9]
Integers plus 10: [10 11 12 13 14 15 16 17 18 19]
Integers plus 10 mod 3: [1 2 0 1 2 0 1 2 0 1]


In [6]:
# NumPy also gives you some helper functions
my_arr = np.array([random.randint(-1000, 1000) for _ in range(10)])
print(my_arr)
print(my_arr.max())
print(my_arr.min())
print(np.average(my_arr))
print(np.median(my_arr))
print(np.cumsum(my_arr))
print(np.cumprod(my_arr))


[ 650 -221  687 -801 -221 -398 -348 -192 -295 -411]
687
-801
-155.0
-258.0
[  650   429  1116   315    94  -304  -652  -844 -1139 -1550]
[                 650              -143650            -98687550
          79048727550      -17469768788550     6952967977842900
 -2419632856289329200  3400906564812416000 -7143256639346932736
  2846171051770647552]


In [7]:
# NumPy also supports multidimensional arrays
arr_2d = np.array([np.arange(10 * i, 10 * i + 10) for i in range(10)])
print(arr_2d)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47 48 49]
 [50 51 52 53 54 55 56 57 58 59]
 [60 61 62 63 64 65 66 67 68 69]
 [70 71 72 73 74 75 76 77 78 79]
 [80 81 82 83 84 85 86 87 88 89]
 [90 91 92 93 94 95 96 97 98 99]]


In [8]:
# Arrays can also be easily 'reshaped'
print(arr_2d.reshape(20, 5))
print(arr_2d.reshape(2, 50))

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]
 [50 51 52 53 54]
 [55 56 57 58 59]
 [60 61 62 63 64]
 [65 66 67 68 69]
 [70 71 72 73 74]
 [75 76 77 78 79]
 [80 81 82 83 84]
 [85 86 87 88 89]
 [90 91 92 93 94]
 [95 96 97 98 99]]
[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
  48 49]
 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
  74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
  98 99]]


In [9]:
# We can do slices on 1d and 2d numpy arrays
print(np.arange(10)[5:])
print(np.arange(100).reshape(10, 10)[5:,5:])

[5 6 7 8 9]
[[55 56 57 58 59]
 [65 66 67 68 69]
 [75 76 77 78 79]
 [85 86 87 88 89]
 [95 96 97 98 99]]


In [10]:
# We can also do slices on n-dimensional arrays.
# (This gets confusing, but is *extremely* useful for high-dimensioned
#  datasets)
arr_4d = np.arange(1000).reshape(20,5,10)
print(arr_4d[9:,3:,7:])

[[[487 488 489]
  [497 498 499]]

 [[537 538 539]
  [547 548 549]]

 [[587 588 589]
  [597 598 599]]

 [[637 638 639]
  [647 648 649]]

 [[687 688 689]
  [697 698 699]]

 [[737 738 739]
  [747 748 749]]

 [[787 788 789]
  [797 798 799]]

 [[837 838 839]
  [847 848 849]]

 [[887 888 889]
  [897 898 899]]

 [[937 938 939]
  [947 948 949]]

 [[987 988 989]
  [997 998 999]]]


In [11]:
# Deep copying of arrays can be important
a = np.arange(10)
b = np.copy(a)
a += 10
print(a)
print(b)

[10 11 12 13 14 15 16 17 18 19]
[0 1 2 3 4 5 6 7 8 9]


In [12]:
# Universal functions are pretty cool
a = np.arange(10)
b = (10 * a) + 5
print(b)

c = np.sin(a / np.pi)
print(c)

[ 5 15 25 35 45 55 65 75 85 95]
[ 0.          0.3129618   0.59448077  0.81627311  0.95605566  0.99978466
  0.94306673  0.79160024  0.5606028   0.2732824 ]


## Pandas
* Allows us to work on labeled data very quickly
* Mirrors a lot of the practices of core Python, but with a focus on data
* Has a very *fast* backend to perform operations much quicker than our naive approaches

In [13]:
# 'Series' in Python can be thought of kindof like NumPy arrays
s = pd.Series(np.arange(10, 20))
s
# Notice how 's' is indexed:

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [14]:
# We can do similar array-level operations as NumPy
s += 10
print(s)
s /= 5
print(s)

0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
dtype: int64
0    4.0
1    4.2
2    4.4
3    4.6
4    4.8
5    5.0
6    5.2
7    5.4
8    5.6
9    5.8
dtype: float64


In [15]:
# The core of Pandas is the DataFrame
# DataFrame can be thought of like interactive SQL tables.
# We can select by columns, index and reindex the data, do groupby operations,
# and many, many other cool things

my_df = pd.DataFrame({
        'name': ('Tyler', 'Ben'),
        'school': ('UIUC', 'UIUC'),
        'likes_data': (True, True)
    })
print(my_df)
print(my_df.dtypes) # Series have dtypes just like NumPy

   likes_data   name school
0        True  Tyler   UIUC
1        True    Ben   UIUC
likes_data      bool
name          object
school        object
dtype: object


In [16]:
other_df = pd.DataFrame({
        'student_id': [random.randint(0, 10000) for _ in range(100)],
        'gpa': [random.random() * 4 for _ in range(100)],
        'major': [random.choice(['CS', 'CS+Math', 'Stats']) for _ in range(100)],
        'in_state': [random.choice([True, False]) for _ in range(100)]
    })
other_df.head() 

Unnamed: 0,gpa,in_state,major,student_id
0,2.623097,False,Stats,3479
1,2.180005,False,Stats,537
2,2.463503,True,Stats,5915
3,1.802576,True,CS,694
4,0.421617,True,Stats,806


In [17]:
other_df.describe()

Unnamed: 0,gpa,student_id
count,100.0,100.0
mean,1.99869,5155.26
std,1.147481,2946.179586
min,0.034242,148.0
25%,1.01968,2394.0
50%,1.934489,5413.0
75%,2.890551,7590.0
max,3.982447,9998.0


In [18]:
# We can drill-down into columns easily
other_df.gpa
other_df['gpa']

0     2.623097
1     2.180005
2     2.463503
3     1.802576
4     0.421617
5     0.544013
6     3.485035
7     1.378355
8     2.416805
9     2.340469
10    1.714432
11    0.503847
12    3.398161
13    0.315430
14    3.891057
15    2.538732
16    3.896839
17    2.686425
18    1.118831
19    1.276159
20    0.968782
21    2.052955
22    2.589050
23    0.240767
24    2.824982
25    1.218358
26    1.711158
27    1.148392
28    1.005930
29    0.273764
        ...   
70    3.260490
71    3.121905
72    0.229393
73    3.000053
74    3.898802
75    0.149617
76    3.247611
77    3.982447
78    2.280402
79    3.491413
80    1.008194
81    3.136477
82    2.774910
83    0.050553
84    2.709089
85    3.602849
86    1.997771
87    1.090369
88    2.614596
89    1.714943
90    1.590635
91    1.857767
92    3.516696
93    1.117026
94    2.412539
95    1.736036
96    0.133303
97    1.923266
98    3.850907
99    3.627201
Name: gpa, Length: 100, dtype: float64

In [19]:
# We can select by column values easily too
print(other_df[other_df.major == 'CS'])
print(other_df[other_df.gpa > 3])

         gpa  in_state major  student_id
3   1.802576      True    CS         694
6   3.485035      True    CS        4934
9   2.340469     False    CS        9711
11  0.503847     False    CS        6953
13  0.315430      True    CS        3528
16  3.896839      True    CS        9513
21  2.052955     False    CS         339
25  1.218358     False    CS        4895
30  3.250536     False    CS        6372
33  2.778147      True    CS        8940
35  0.453179      True    CS        5758
38  0.176322      True    CS        6960
39  3.892429     False    CS        8463
40  3.588057     False    CS        5420
44  3.340625      True    CS        4858
45  2.317205     False    CS        9085
46  2.098023     False    CS        9613
48  0.526657     False    CS        8429
50  3.102048      True    CS        3788
51  3.684291     False    CS        5478
53  0.982049     False    CS        8383
56  1.652635     False    CS        4062
57  0.034242      True    CS        6044
58  0.157345    

In [20]:
# We can also combine this with a groupby to get more interesting stats
print(other_df.groupby('major').mean().gpa)

major
CS         1.946366
CS+Math    2.015003
Stats      2.037679
Name: gpa, dtype: float64


In [21]:
# We can combine selections too
other_df[(other_df.gpa > 3) & (other_df.major == 'Stats')]

Unnamed: 0,gpa,in_state,major,student_id
14,3.891057,True,Stats,1826
36,3.106897,True,Stats,8410
49,3.719439,True,Stats,2748
70,3.26049,False,Stats,5599
71,3.121905,True,Stats,2519
73,3.000053,False,Stats,3990
74,3.898802,False,Stats,9665


In [22]:
# It can sometimes be useful to alter the index of a dataframe
other_df.set_index('student_id')

Unnamed: 0_level_0,gpa,in_state,major
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3479,2.623097,False,Stats
537,2.180005,False,Stats
5915,2.463503,True,Stats
694,1.802576,True,CS
806,0.421617,True,Stats
2933,0.544013,False,CS+Math
4934,3.485035,True,CS
3652,1.378355,True,Stats
9685,2.416805,False,CS+Math
9711,2.340469,False,CS


## Challanges:

In [2]:
titanic_df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv')
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [45]:
titanic_df.groupby('class')['survived'].sum() / titanic_df.groupby('class').count()['survived']

class
First     0.629630
Second    0.472826
Third     0.242363
Name: survived, dtype: float64

In [46]:
titanic_df.groupby('class').count()

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alive,alone
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
First,216,216,216,186,216,216,216,214,216,216,175,214,216,216
Second,184,184,184,173,184,184,184,184,184,184,16,184,184,184
Third,491,491,491,355,491,491,491,491,491,491,12,491,491,491


1) Calculate the survival rates of passengers by class (First, Second, Third)


In [108]:
titanic_df[titanic_df.survived == 1].groupby('class')['survived'].sum()/titanic_df[titanic_df.survived == 1].groupby('class')['survived'].count()

class
First     1
Second    1
Third     1
Name: survived, dtype: int64

2) Calculate the average fare paid by those who survived compared to the fare paid by those who didn't


3) Plot the ages of the female survivors that embarked at Cherbourg