Introduction to Pandas
=====================

## Learning Objectives

## Acknowledgements
Some of the examples in this notebook are taken from:
[Python Crash Course - A Hands-on, Project-based, introduction to programming](https://www.amazon.co.uk/Python-Crash-Course-Hands-Project-Based/dp/1593276036), [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)
and https://chrisalbon.com/

Data taken from: [Hadley Wickham's Github Repository and R Data Package](https://github.com/hadley/nycflights13)

In [1]:
# Import Modules
import numpy as np
import pandas as pd

In [2]:
# Create Dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'age': [42, 52, 36, 24, 73], 
        'preTestScore': [4, 24, 31, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70]}

In [3]:
data

{'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
 'age': [42, 52, 36, 24, 73],
 'preTestScore': [4, 24, 31, 2, 3],
 'postTestScore': [25, 94, 57, 62, 70]}

In [4]:
df = pd.DataFrame(data, columns = ['name', 'age', 'preTestScore', 'postTestScore'])

In [5]:
# Convention - df stands for dataframe
df

Unnamed: 0,name,age,preTestScore,postTestScore
0,Jason,42,4,25
1,Molly,52,24,94
2,Tina,36,31,57
3,Jake,24,2,62
4,Amy,73,3,70


In [6]:
# The sum of all the ages
df['age'].sum()

227

In [7]:
# The Average PreTest score
# Note we are pointing to one column and then applying a function to that column.
df['preTestScore'].mean()

12.8

In [8]:
# Note this adds the cumulative score from the first row and heads down   
df['preTestScore'].cumsum()

0     4
1    28
2    59
3    61
4    64
Name: preTestScore, dtype: int64

In [9]:
# All summary statistics for a particular column
df['preTestScore'].describe()

count     5.000000
mean     12.800000
std      13.663821
min       2.000000
25%       3.000000
50%       4.000000
75%      24.000000
max      31.000000
Name: preTestScore, dtype: float64

In [10]:
# Count the number of non NA values
df['preTestScore'].count()

5

In [11]:
# Output the minimum value in this row
df['preTestScore'].min()

2

In [12]:
# Output the maximum value in this row
df['preTestScore'].max()

31

In [13]:
# Output the median value in this row
df['preTestScore'].median()

4.0

In [14]:
df['preTestScore'].var()

186.7

In [15]:
df['preTestScore'].std()

13.663820841916802

In [16]:
df['preTestScore'].skew()

0.7433452457326751

In [17]:
df['preTestScore'].kurt()

-2.4673543738411547

In [18]:
df.corr()

Unnamed: 0,age,preTestScore,postTestScore
age,1.0,-0.105651,0.328852
preTestScore,-0.105651,1.0,0.378039
postTestScore,0.328852,0.378039,1.0


In [19]:
df.cov()

Unnamed: 0,age,preTestScore,postTestScore
age,340.8,-26.65,151.2
preTestScore,-26.65,186.7,128.65
postTestScore,151.2,128.65,620.3


In [20]:
flights = pd.read_csv("flights.csv")

In [21]:
flights.shape

(336776, 20)

In [22]:
flights.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [23]:
flights.query("month == 1 & day == 1")

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,507,N516JB,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,79,N593JB,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00
9,10,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00


In [24]:
flights.iloc[:9]

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,507,N516JB,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,79,N593JB,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00


In [25]:
flights.sort_values(by=['year', 'month', 'day'])

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,507,N516JB,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,79,N593JB,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00
9,10,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00


In [26]:
flights.sort_values(by=['year', 'month', 'day'], ascending=False)

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
110520,110521,2013,12,31,13.0,2359,14.0,439.0,437,2.0,B6,839,N566JB,JFK,BQN,189.0,1576,23,59,2013-12-31 23:00:00
110521,110522,2013,12,31,18.0,2359,19.0,449.0,444,5.0,DL,412,N713TW,JFK,SJU,192.0,1598,23,59,2013-12-31 23:00:00
110522,110523,2013,12,31,26.0,2245,101.0,129.0,2353,96.0,B6,108,N374JB,JFK,PWM,50.0,273,22,45,2013-12-31 22:00:00
110523,110524,2013,12,31,459.0,500,-1.0,655.0,651,4.0,US,1895,N557UW,EWR,CLT,95.0,529,5,0,2013-12-31 05:00:00
110524,110525,2013,12,31,514.0,515,-1.0,814.0,812,2.0,UA,700,N470UA,EWR,IAH,223.0,1400,5,15,2013-12-31 05:00:00
110525,110526,2013,12,31,549.0,551,-2.0,925.0,900,25.0,UA,274,N577UA,EWR,LAX,346.0,2454,5,51,2013-12-31 05:00:00
110526,110527,2013,12,31,550.0,600,-10.0,725.0,745,-20.0,AA,301,N3CXAA,LGA,ORD,127.0,733,6,0,2013-12-31 06:00:00
110527,110528,2013,12,31,552.0,600,-8.0,811.0,826,-15.0,EV,3825,N14916,EWR,IND,118.0,645,6,0,2013-12-31 06:00:00
110528,110529,2013,12,31,553.0,600,-7.0,741.0,754,-13.0,DL,731,N333NB,LGA,DTW,86.0,502,6,0,2013-12-31 06:00:00
110529,110530,2013,12,31,554.0,550,4.0,1024.0,1027,-3.0,B6,939,N552JB,JFK,BQN,195.0,1576,5,50,2013-12-31 05:00:00


In [27]:
flights[['year', 'month', 'day']]

Unnamed: 0,year,month,day
0,2013,1,1
1,2013,1,1
2,2013,1,1
3,2013,1,1
4,2013,1,1
5,2013,1,1
6,2013,1,1
7,2013,1,1
8,2013,1,1
9,2013,1,1


In [28]:
flights.rename(columns={'tailnum': 'tail_num'})['tail_num']

0         N14228
1         N24211
2         N619AA
3         N804JB
4         N668DN
5         N39463
6         N516JB
7         N829AS
8         N593JB
9         N3ALAA
10        N793JB
11        N657JB
12        N29129
13        N53441
14        N3DUAA
15        N708JB
16        N76515
17        N595JB
18        N542MQ
19        N644JB
20        N971DL
21        N730MQ
22        N633AA
23        N3739P
24        N53442
25        N9EAMQ
26        N532UA
27        N635JB
28        N794JB
29        N326NB
           ...  
336746    N712EV
336747    N16546
336748    N807JB
336749    N751EV
336750    N807MQ
336751    N335AA
336752    N12957
336753    N633JB
336754    N627JB
336755    N813UA
336756    N10575
336757    N906XJ
336758    N722EV
336759    N532MQ
336760    N12145
336761    N193JB
336762    N578UA
336763    N804JB
336764    N318JB
336765    N354JB
336766    N281JB
336767    N346JB
336768    N565JB
336769    N516JB
336770    N740EV
336771       NaN
336772       NaN
336773    N535

In [29]:
flights.rename(columns={'tailnum': 'tail_num'})

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tail_num,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,507,N516JB,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,79,N593JB,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00
9,10,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00


In [30]:
flights.tailnum.unique()

array(['N14228', 'N24211', 'N619AA', ..., 'N776SK', 'N785SK', 'N557AS'],
      dtype=object)

In [31]:
flights[['origin', 'dest']].drop_duplicates()

Unnamed: 0,origin,dest
0,EWR,IAH
1,LGA,IAH
2,JFK,MIA
3,JFK,BQN
4,LGA,ATL
5,EWR,ORD
6,EWR,FLL
7,LGA,IAD
8,JFK,MCO
9,LGA,ORD


In [32]:
flights['gain'] = flights.arr_delay - flights.dep_delay
flights['gain_per_hour'] = flights.gain / (flights.air_time / 60)
flights['speed'] = flights.distance / flights.air_time * 60

In [33]:
flights

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,origin,dest,air_time,distance,hour,minute,time_hour,gain,gain_per_hour,speed
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,...,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00,9.0,2.378855,370.044053
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,...,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,16.0,4.229075,374.273128
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,...,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00,31.0,11.625000,408.375000
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,...,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00,-17.0,-5.573770,516.721311
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,...,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,-19.0,-9.827586,394.137931
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,...,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00,16.0,6.400000,287.600000
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,...,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00,24.0,9.113924,404.430380
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,...,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00,-11.0,-12.452830,259.245283
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,...,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00,-5.0,-2.142857,404.571429
9,10,2013,1,1,558.0,600,-2.0,753.0,745,8.0,...,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00,10.0,4.347826,318.695652


In [34]:
flights[['gain', 'gain_per_hour']]

Unnamed: 0,gain,gain_per_hour
0,9.0,2.378855
1,16.0,4.229075
2,31.0,11.625000
3,-17.0,-5.573770
4,-19.0,-9.827586
5,16.0,6.400000
6,24.0,9.113924
7,-11.0,-12.452830
8,-5.0,-2.142857
9,10.0,4.347826


In [35]:
flights.dep_delay.mean()

12.639070257304708

In [36]:
flights.loc[np.random.choice(flights.index, 10)]

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,origin,dest,air_time,distance,hour,minute,time_hour,gain,gain_per_hour,speed
240595,240596,2013,6,20,1517.0,1520,-3.0,1819.0,1820,-1.0,...,EWR,AUS,198.0,1504,15,20,2013-06-20 15:00:00,2.0,0.606061,455.757576
78674,78675,2013,11,25,1622.0,1610,12.0,1920.0,1925,-5.0,...,EWR,SEA,345.0,2402,16,10,2013-11-25 16:00:00,-17.0,-2.956522,417.73913
63156,63157,2013,11,8,1824.0,1830,-6.0,2158.0,2200,-2.0,...,JFK,SEA,346.0,2422,18,30,2013-11-08 18:00:00,4.0,0.693642,420.0
77213,77214,2013,11,24,654.0,700,-6.0,816.0,850,-34.0,...,LGA,ORD,120.0,733,7,0,2013-11-24 07:00:00,-28.0,-14.0,366.5
287926,287927,2013,8,9,1045.0,959,46.0,1304.0,1225,39.0,...,JFK,LAS,294.0,2248,9,59,2013-08-09 09:00:00,-7.0,-1.428571,458.77551
217858,217859,2013,5,27,1513.0,1516,-3.0,1808.0,1825,-17.0,...,EWR,SFO,336.0,2565,15,16,2013-05-27 15:00:00,-14.0,-2.5,458.035714
207809,207810,2013,5,16,1247.0,1235,12.0,1529.0,1520,9.0,...,EWR,DFW,199.0,1372,12,35,2013-05-16 12:00:00,-3.0,-0.904523,413.668342
328869,328870,2013,9,22,1524.0,1525,-1.0,1630.0,1642,-12.0,...,EWR,BUF,48.0,282,15,25,2013-09-22 15:00:00,-11.0,-13.75,352.5
316316,316317,2013,9,9,607.0,610,-3.0,844.0,855,-11.0,...,LGA,DFW,179.0,1389,6,10,2013-09-09 06:00:00,-8.0,-2.681564,465.586592
99597,99598,2013,12,18,1759.0,1629,90.0,2051.0,1932,79.0,...,LGA,PBI,139.0,1035,16,29,2013-12-18 16:00:00,-11.0,-4.748201,446.76259


flights.iloc[np.random.randint(0, len(flights), 
                               .1  * len(flights))]

In [37]:
planes_df = flights.groupby('tailnum')
delay = planes_df.agg({"year": "count",
                       "distance": "mean", 
                       "arr_delay": "mean"})
delay.query("year > 20 & distance < 2000")

Unnamed: 0_level_0,year,distance,arr_delay
tailnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N0EGMQ,371,676.188679,9.982955
N10156,153,757.947712,12.717241
N102UW,48,535.875000,2.937500
N103US,46,535.195652,-6.934783
N104UW,47,535.255319,1.804348
N10575,289,519.702422,20.691450
N105UW,45,524.844444,-0.266667
N107US,41,528.707317,-5.731707
N108UW,60,534.500000,-1.250000
N109UW,48,535.875000,-2.520833


In [38]:
destinations = flights.groupby("dest")
destinations.agg({
    'tailnum': lambda x: len(x.unique()),
    'year': 'count'
}).rename(columns={'tailnum': 'planes',
                  'year': 'flights'})

Unnamed: 0_level_0,planes,flights
dest,Unnamed: 1_level_1,Unnamed: 2_level_1
ABQ,108,254
ACK,58,265
ALB,172,439
ANC,6,8
ATL,1180,17215
AUS,993,2439
AVL,159,275
BDL,186,443
BGR,46,375
BHM,45,297


In [None]:
daily = flights.groupby(['year', 'month', 'day'])
per_day = daily['distance'].count()
per_day

In [None]:
per_month = per_day.groupby(level=['year', 'month']).sum()
per_month

In [None]:
per_year = per_month.sum()
per_year

In [None]:
(
flights.groupby(['year', 'month', 'day'])
    [['arr_delay', 'dep_delay']]
    .mean()
    .query('arr_delay > 30 | dep_delay > 30')
)

Missing values

In [39]:
flights.isnull()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,origin,dest,air_time,distance,hour,minute,time_hour,gain,gain_per_hour,speed
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
flights.dropna()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,origin,dest,air_time,distance,hour,minute,time_hour,gain,gain_per_hour,speed
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,...,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00,9.0,2.378855,370.044053
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,...,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,16.0,4.229075,374.273128
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,...,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00,31.0,11.625000,408.375000
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,...,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00,-17.0,-5.573770,516.721311
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,...,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,-19.0,-9.827586,394.137931
5,6,2013,1,1,554.0,558,-4.0,740.0,728,12.0,...,EWR,ORD,150.0,719,5,58,2013-01-01 05:00:00,16.0,6.400000,287.600000
6,7,2013,1,1,555.0,600,-5.0,913.0,854,19.0,...,EWR,FLL,158.0,1065,6,0,2013-01-01 06:00:00,24.0,9.113924,404.430380
7,8,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,...,LGA,IAD,53.0,229,6,0,2013-01-01 06:00:00,-11.0,-12.452830,259.245283
8,9,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,...,JFK,MCO,140.0,944,6,0,2013-01-01 06:00:00,-5.0,-2.142857,404.571429
9,10,2013,1,1,558.0,600,-2.0,753.0,745,8.0,...,LGA,ORD,138.0,733,6,0,2013-01-01 06:00:00,10.0,4.347826,318.695652


In [46]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [47]:
data.isna()

a    False
b     True
c    False
d     True
e    False
dtype: bool

In [48]:
data.dropna()

a    1.0
c    2.0
e    3.0
dtype: float64

In [49]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [50]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
# forward-fill
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [51]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
# forward-fill
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

Joins

In [55]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
display(df1, df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [54]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [58]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})


In [60]:
display(df3, df4, pd.merge(df3, df4))

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [61]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
display(df1, df5, pd.merge(df1, df5))

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


In [62]:
display(df1, df2, pd.merge(df1, df2, on='employee'))

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014
