In [1]:
%matplotlib inline
from IPython.core.display import HTML
from IPython.display import YouTubeVideo

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build the paths of the two local stylesheets (looked up in the current
# working directory) and inject their combined contents as one <style> tag.
path1 = os.path.join(os.getcwd(),'style-table.css')
path2 = os.path.join(os.getcwd(),'style-notebook.css')

# Read both files inside a `with` block so the handles are closed as soon as
# the contents are in memory instead of being left open.
with open(path1) as table_css, open(path2) as notebook_css:
    css = table_css.read() + notebook_css.read()

# Last expression of the cell: the HTML object is rendered by the notebook.
HTML('<style>{}</style>'.format(css))

In [2]:
# pandas can be described as numpy with labels
# attaches more general labels to the rows and columns
# significantly enhances numpy
# adds joins

### series

In [3]:
# build a pandas Series from a plain Python list; no index is passed,
# so the entries get the default integer labels
# NOTE(review): the Series is named 'Squares' but 2, 3, 5 and 6 are not
# perfect squares -- confirm the intended data

s = pd.Series(
    data=[0, 2, 3, 4, 5, 6],
    name='Squares',
)
s

0    0
1    2
2    3
3    4
4    5
5    6
Name: Squares, dtype: int64

In [4]:
# show values

s.values

array([0, 2, 3, 4, 5, 6], dtype=int64)

In [5]:
# show index values

s.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [6]:
# the index holds the integers 0..5, so these integer keys are looked up as labels
s[0],s[2]

(0, 3)

In [7]:
# integer slice: starts at position 2 and stops before position 4
s[2:4]

2    3
3    4
Name: Squares, dtype: int64

In [8]:
# 2014 values labelled by programming-language name: `data` supplies the
# numbers and `index` supplies one string label per number, in the same order
pop2014 = pd.Series(
    data=[100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8],
    index=[
        'Java', 'C', 'C++', 'Python', 'C#',
        'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab',
    ],
)

In [9]:
pop2014

Java          100.0
C              99.3
C++            95.5
Python         93.5
C#             92.4
PHP            84.8
JavaScript     84.5
Ruby           78.9
R              74.3
Matlab         72.8
dtype: float64

In [10]:
pop2014.index

Index(['Java', 'C', 'C++', 'Python', 'C#', 'PHP', 'JavaScript', 'Ruby', 'R',
       'Matlab'],
      dtype='object')

In [11]:
pop2014.values

array([ 100. ,   99.3,   95.5,   93.5,   92.4,   84.8,   84.5,   78.9,
         74.3,   72.8])

In [12]:
# first element by position; .iloc is used because a plain integer key on a
# string-labelled Series relies on a deprecated positional fallback
pop2014.iloc[0]

100.0

In [13]:
# integer slice on a string-labelled Series: selects positions 1 to 3
pop2014[1:4]

C         99.3
C++       95.5
Python    93.5
dtype: float64

In [14]:
# look up a single value by its index label
pop2014['Python']

93.5

In [15]:
# we can even slice using index values
# (unlike an integer slice, the end label 'Python' is included in the result)

pop2014['C':'Python']

C         99.3
C++       95.5
Python    93.5
dtype: float64

In [16]:
# use iloc for index number slicing
# (purely positional: here the first two entries)

pop2014.iloc[:2]

Java    100.0
C        99.3
dtype: float64

In [17]:
# use loc for index name slicing
# (label-based: the end label 'PHP' is included in the result)

pop2014.loc[:'PHP']

Java      100.0
C          99.3
C++        95.5
Python     93.5
C#         92.4
PHP        84.8
dtype: float64

In [18]:
# boolean mask: keep only the entries whose value is greater than 91
pop2014[pop2014 > 91]

Java      100.0
C          99.3
C++        95.5
Python     93.5
C#         92.4
dtype: float64

In [19]:
# 2015 values built from a dict: each key becomes an index label and each
# value the corresponding entry
pop2015 = pd.Series({
    'Java': 100,
    'C': 99.9,
    'C++': 99.4,
    'Python': 96.5,
    'C#': 91.3,
    'R': 84.8,
    'PHP': 84.5,
    'JavaScript': 83.0,
    'Ruby': 76.2,
    'Matlab': 72.4,
})

In [20]:
pop2015

C              99.9
C#             91.3
C++            99.4
Java          100.0
JavaScript     83.0
Matlab         72.4
PHP            84.5
Python         96.5
R              84.8
Ruby           76.2
dtype: float64

### dataframes

In [8]:
# same 2014 Series as built in the series section, defined again here
# ahead of the dataframe examples
pop2014 = pd.Series(
    data=[100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8],
    index=[
        'Java', 'C', 'C++', 'Python', 'C#',
        'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab',
    ],
)

In [19]:
# same 2015 Series as built in the series section, defined again here
# ahead of the dataframe examples
pop2015 = pd.Series({
    'Java': 100,
    'C': 99.9,
    'C++': 99.4,
    'Python': 96.5,
    'C#': 91.3,
    'R': 84.8,
    'PHP': 84.5,
    'JavaScript': 83.0,
    'Ruby': 76.2,
    'Matlab': 72.4,
})

In [26]:
# put both Series side by side in one DataFrame: every dict key becomes a
# column label, and the rows are matched up by language name
pop_combined = pd.DataFrame(
    data={
        '2014': pop2014,
        '2015': pop2015,
    }
)
pop_combined

Unnamed: 0,2014,2015
C,99.3,99.9
C#,92.4,91.3
C++,95.5,99.4
Java,100.0,100.0
JavaScript,84.5,83.0
Matlab,72.8,72.4
PHP,84.8,84.5
Python,93.5,96.5
R,74.3,84.8
Ruby,78.9,76.2


In [29]:
# reorder the rows so the largest 2015 value comes first
pop_combined = pop_combined.sort_values(by='2015', ascending=False)
pop_combined

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9
C++,95.5,99.4
Python,93.5,96.5
C#,92.4,91.3
R,74.3,84.8
PHP,84.8,84.5
JavaScript,84.5,83.0
Ruby,78.9,76.2
Matlab,72.8,72.4


In [34]:
# first two rows, selected by position
pop_combined.iloc[:2]

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9


In [35]:
# label-based row slice: every row up to and including the one labelled 'C'
pop_combined.loc[:'C']

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9


In [31]:
pop_combined.values

array([[ 100. ,  100. ],
       [  99.3,   99.9],
       [  95.5,   99.4],
       [  93.5,   96.5],
       [  92.4,   91.3],
       [  74.3,   84.8],
       [  84.8,   84.5],
       [  84.5,   83. ],
       [  78.9,   76.2],
       [  72.8,   72.4]])

In [32]:
pop_combined.index

Index(['Java', 'C', 'C++', 'Python', 'C#', 'R', 'PHP', 'JavaScript', 'Ruby',
       'Matlab'],
      dtype='object')

In [33]:
pop_combined.columns

Index(['2014', '2015'], dtype='object')

In [50]:
# Row-wise mean of the two year columns.
# The columns are named explicitly: a divisor derived from
# len(pop_combined.columns) changes once 'average' itself has been added, so
# the cell would give a different result on a fresh run than on a re-run.
pop_combined['average'] = pop_combined[['2014', '2015']].mean(axis=1)
pop_combined

Unnamed: 0,2014,2015,average
Java,100.0,100.0,100.0
C,99.3,99.9,99.6
C++,95.5,99.4,97.45
Python,93.5,96.5,95.0
C#,92.4,91.3,91.85
R,74.3,84.8,79.55
PHP,84.8,84.5,84.65
JavaScript,84.5,83.0,83.75
Ruby,78.9,76.2,77.55
Matlab,72.8,72.4,72.6


In [62]:
# build a DataFrame from a list of records: one dict per row, with the
# dict keys used as column labels
presidents = pd.DataFrame([
    {'name': 'Barack Obama', 'inauguration': 2009, 'birthyear': 1961},
    {'name': 'George W. Bush', 'inauguration': 2001, 'birthyear': 1946},
    {'name': 'Bill Clinton', 'inauguration': 1993, 'birthyear': 1946},
    {'name': 'George H. W. Bush', 'inauguration': 1989, 'birthyear': 1924},
])

In [63]:
presidents

Unnamed: 0,birthyear,inauguration,name
0,1961,2009,Barack Obama
1,1946,2001,George W. Bush
2,1946,1993,Bill Clinton
3,1924,1989,George H. W. Bush


In [70]:
# new frame whose row labels are the 'name' column instead of 0..3
presidents_indexes = presidents.set_index(keys='name')

In [71]:
presidents_indexes

Unnamed: 0_level_0,birthyear,inauguration
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,1961,2009
George W. Bush,1946,2001
Bill Clinton,1946,1993
George H. W. Bush,1924,1989


In [72]:
# select one row by its index label; the row comes back as a Series
presidents_indexes.loc['Bill Clinton']

birthyear       1946
inauguration    1993
Name: Bill Clinton, dtype: int64

In [73]:
# single cell lookup: row label and column label in one .loc call, rather
# than materialising the whole row first and then indexing into it
presidents_indexes.loc['Bill Clinton', 'inauguration']

1993

In [75]:
# column first, then row label: select the 'inauguration' column and look up 'Bill Clinton' in it
presidents_indexes['inauguration']['Bill Clinton']

1993

### merging dataframes

In [76]:
# second table to merge with `presidents`: one record per son/father pair
# (there is no record for Bill Clinton)
presidents_fathers = pd.DataFrame([
    {'son': 'Barack Obama', 'father': 'Barack Obama, Sr.'},
    {'son': 'George W. Bush', 'father': 'George H. W. Bush'},
    {'son': 'George H. W. Bush', 'father': 'Prescott Bush'},
])

In [77]:
# join the two tables where presidents.name equals presidents_fathers.son;
# with the default join type only names present in both tables are kept
presidents.merge(presidents_fathers, left_on='name', right_on='son')

Unnamed: 0,birthyear,inauguration,name,father,son
0,1961,2009,Barack Obama,"Barack Obama, Sr.",Barack Obama
1,1946,2001,George W. Bush,George H. W. Bush,George W. Bush
2,1924,1989,George H. W. Bush,Prescott Bush,George H. W. Bush


In [80]:
# same join, then discard the 'son' column, which only repeats 'name'
(
    presidents
    .merge(presidents_fathers, left_on='name', right_on='son')
    .drop('son', axis=1)
)

Unnamed: 0,birthyear,inauguration,name,father
0,1961,2009,Barack Obama,"Barack Obama, Sr."
1,1946,2001,George W. Bush,George H. W. Bush
2,1924,1989,George H. W. Bush,Prescott Bush


In [81]:
# left join: every row of `presidents` is kept, and rows without a match in
# presidents_fathers get a missing value in 'father'
(
    presidents
    .merge(presidents_fathers, left_on='name', right_on='son', how='left')
    .drop('son', axis=1)
)

Unnamed: 0,birthyear,inauguration,name,father
0,1961,2009,Barack Obama,"Barack Obama, Sr."
1,1946,2001,George W. Bush,George H. W. Bush
2,1946,1993,Bill Clinton,
3,1924,1989,George H. W. Bush,Prescott Bush


### multi-level dataframes

In [82]:
import seaborn as sns



In [127]:
# load the example dataset named 'flights' through seaborn
flights = sns.load_dataset('flights')

In [93]:
flights.shape

(144, 3)

In [91]:
flights.describe()

Unnamed: 0,year,passengers
count,144.0,144.0
mean,1954.5,280.298611
std,3.464102,119.966317
min,1949.0,104.0
25%,1951.75,180.0
50%,1954.5,265.5
75%,1957.25,360.5
max,1960.0,622.0


In [128]:
# two-level row index: 'year' is the outer level and 'month' the inner one
flights_indexed = flights.set_index(keys=['year', 'month'])

In [129]:
flights_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,January,112
1949,February,118
1949,March,132
1949,April,129
1949,May,121


In [130]:
# one row of the two-level index: the (year, month) key is passed as a tuple
# and the column axis is given explicitly, so 'May' cannot be read as a
# column label
flights_indexed.loc[(1950, 'May'), :]

passengers    125
Name: (1950, May), dtype: int64

In [131]:
# pick the 1950 rows first (leaving a frame indexed by month only), then
# slice the months by label; both end labels are included
flights_indexed.loc[1950].loc['January':'June']

Unnamed: 0_level_0,passengers
month,Unnamed: 1_level_1
January,115
February,126
March,141
April,135
May,125
June,149


In [132]:
# pivot the 'month' index level into the columns: one row per year, one
# passengers column per month
flights_unstacked = flights_indexed.unstack(level='month')
flights_unstacked

Unnamed: 0_level_0,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers
month,April,August,December,February,January,July,June,March,May,November,October,September
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1949,129,148,118,118,112,148,135,132,121,104,119,136
1950,135,170,140,126,115,170,149,141,125,114,133,158
1951,163,199,166,150,145,199,178,178,172,146,162,184
1952,181,242,194,180,171,230,218,193,183,172,191,209
1953,235,272,201,196,196,264,243,236,229,180,211,237
1954,227,293,229,188,204,302,264,235,234,203,229,259
1955,269,347,278,233,242,364,315,267,270,237,274,312
1956,313,405,306,277,284,413,374,317,318,271,306,355
1957,348,467,336,301,315,465,422,356,355,305,347,404
1958,348,505,337,318,340,491,435,362,363,310,359,404


In [133]:
# add up the columns of each row, giving one total per year
flights_unstacked.sum(axis=1)

year
1949    1520
1950    1676
1951    2042
1952    2364
1953    2700
1954    2867
1955    3408
1956    3939
1957    4421
1958    4572
1959    5140
1960    5714
dtype: int64

In [134]:
# Store the per-year total as an extra ('passengers', 'total') column.
# Only the month columns are summed: if a 'total' column already exists
# (the cell has been run before) it is left out of the sum, so re-running
# the cell does not fold the previous total into the new one.
flights_unstacked['passengers','total'] = (
    flights_unstacked['passengers']
    .drop('total', axis=1, errors='ignore')
    .sum(axis=1)
)
flights_unstacked

Unnamed: 0_level_0,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers
month,April,August,December,February,January,July,June,March,May,November,October,September,total
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1949,129,148,118,118,112,148,135,132,121,104,119,136,1520
1950,135,170,140,126,115,170,149,141,125,114,133,158,1676
1951,163,199,166,150,145,199,178,178,172,146,162,184,2042
1952,181,242,194,180,171,230,218,193,183,172,191,209,2364
1953,235,272,201,196,196,264,243,236,229,180,211,237,2700
1954,227,293,229,188,204,302,264,235,234,203,229,259,2867
1955,269,347,278,233,242,364,315,267,270,237,274,312,3408
1956,313,405,306,277,284,413,374,317,318,271,306,355,3939
1957,348,467,336,301,315,465,422,356,355,305,347,404,4421
1958,348,505,337,318,340,491,435,362,363,310,359,404,4572


In [135]:
# move the 'month' column level back into the row index
flights_restacked = flights_unstacked.stack(level='month')

In [137]:
flights_restacked.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,April,129
1949,August,148
1949,December,118
1949,February,118
1949,January,112


In [138]:
# every year (slice(None) on the first index level) combined with the
# 'total' entry of the month level, reading the 'passengers' column
flights_restacked.loc[(slice(None), 'total'), 'passengers']

year  month
1949  total    1520
1950  total    1676
1951  total    2042
1952  total    2364
1953  total    2700
1954  total    2867
1955  total    3408
1956  total    3939
1957  total    4421
1958  total    4572
1959  total    5140
1960  total    5714
Name: passengers, dtype: int64

In [140]:
# keep the rows with more than 120 passengers and show the first few
flights_restacked.loc[flights_restacked['passengers'].gt(120)].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,April,129
1949,August,148
1949,July,148
1949,June,135
1949,March,132
