In [1]:
# to install the pandas-datareader package, run:
# conda install -c https://conda.anaconda.org/anaconda pandas-datareader

In [2]:
%matplotlib inline
from IPython.core.display import HTML
from IPython.display import YouTubeVideo
from pandas_datareader import data, wb

import os
import pandas as pd
import numpy as np
import datetime

# Absolute paths of the two stylesheets, resolved against the current
# working directory.
path1 = os.path.join(os.getcwd(),'style-table.css')
path2 = os.path.join(os.getcwd(),'style-notebook.css')

# Read both files inside a `with` block so the handles are closed as soon as
# the contents are in memory instead of being left open.
with open(path1) as table_css, open(path2) as notebook_css:
    css = table_css.read() + notebook_css.read()

# Last expression of the cell: wrap the combined CSS in a <style> tag so the
# notebook renders it.
HTML('<style>{}</style>'.format(css))

### Pandas - series overview

In [2]:
# make a series with no index
obj = pd.Series([3,6,9,12])
obj

0     3
1     6
2     9
3    12
dtype: int64

In [3]:
obj.values

array([ 3,  6,  9, 12], dtype=int64)

In [4]:
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [5]:
# build a series whose index is an explicit list of country labels
casualties = [8700000, 4300000, 3000000, 2100000, 400000]
countries = ['USSR', 'Germany', 'China', 'Japan', 'USA']
obj2 = pd.Series(data=casualties, index=countries)
obj2

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [6]:
obj2['USSR']

8700000

In [7]:
# we can use array operations on series

In [8]:
# filter countries with over 4 mil casualties
obj2[obj2 > 4000000]

USSR       8700000
Germany    4300000
dtype: int64

In [9]:
#treat series as an ordered dictionary
'USSR' in obj2

True

In [10]:
'Finland' in obj2

False

In [11]:
#convert a pandas series into a dictionary
di = obj2.to_dict()
di

{'China': 3000000,
 'Germany': 4300000,
 'Japan': 2100000,
 'USA': 400000,
 'USSR': 8700000}

In [12]:
#convert a dictionary into a pandas series
obj3 = pd.Series(di)
obj3 = obj3.sort_values(ascending=False)
obj3

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [13]:
cl = ['China','Germany','Japan','USA','USSR','Argentina']

In [14]:
obj4 = pd.Series(di,index=cl)
obj4

China        3000000
Germany      4300000
Japan        2100000
USA           400000
USSR         8700000
Argentina        NaN
dtype: float64

In [15]:
pd.isnull(obj4)

China        False
Germany      False
Japan        False
USA          False
USSR         False
Argentina     True
dtype: bool

In [16]:
pd.notnull(obj4)

China         True
Germany       True
Japan         True
USA           True
USSR          True
Argentina    False
dtype: bool

In [17]:
obj2 + obj4 # add 2 series together

Argentina         NaN
China         6000000
Germany       8600000
Japan         4200000
USA            800000
USSR         17400000
dtype: float64

In [18]:
obj2.name = 'WW2 casualties' #label series
obj2.index.name = 'Countries' #label index column
obj2

Countries
USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
Name: WW2 casualties, dtype: int64

### Pandas - dataframe overview

In [19]:
# full list of methods of creating data frames:

# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

In [20]:
import webbrowser

In [21]:
website='https://en.wikipedia.org/wiki/NFL_win%E2%80%93loss_records'
# open browser window with above link (uncomment below)
#webbrowser.open(website) 

In [22]:
#make a dataframe from clipboard
#nfl_frame = pd.read_clipboard()

In [23]:
# we will use a csv instead of clipboard, for reusability
# DataFrame.from_csv was deprecated and later removed from pandas;
# read_csv with index_col=0 keeps the first column ('Rank') as the index
nfl_frame = pd.read_csv('nfl_frame.csv', index_col=0)

In [24]:
nfl_frame.columns  #show column names

Index(['Team', 'Won', 'Lost', 'Tied*', 'Pct.', 'First Season', 'Total Games',
       'Conference'],
      dtype='object')

In [25]:
nfl_frame.Team #access a column (use . if it is 1 word)

Rank
1       Dallas Cowboys
2        Chicago Bears
3    Green Bay Packers
4       Miami Dolphins
5     Baltimore Ravens
Name: Team, dtype: object

In [26]:
#if column name has more than 1 word use brackets
nfl_frame['First Season'] 

Rank
1    1960
2    1920
3    1921
4    1966
5    1996
Name: First Season, dtype: int64

In [27]:
nfl_frame2 = nfl_frame[['Team','First Season','Total Games']]
nfl_frame2

Unnamed: 0_level_0,Team,First Season,Total Games
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dallas Cowboys,1960,894
2,Chicago Bears,1920,1357
3,Green Bay Packers,1921,1339
4,Miami Dolphins,1966,792
5,Baltimore Ravens,1996,326


In [28]:
nfl_frame.head(1)

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East


In [29]:
nfl_frame.tail(1)

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North


In [30]:
# show the row whose Rank label is 3 (the index starts at 1, so it is also the 3rd row)
# .ix has been removed from pandas; .loc is the explicit label-based lookup
# (.iloc[2] would be the positional equivalent)
nfl_frame.loc[3]

Team            Green Bay Packers
Won                           741
Lost                          561
Tied*                          37
Pct.                        0.567
First Season                 1921
Total Games                  1339
Conference              NFC North
Name: 3, dtype: object

In [31]:
# fill a column with string
nfl_frame['Stadium'] = "Levi's Stadium" 
nfl_frame

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East,Levi's Stadium
2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,Levi's Stadium
3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,Levi's Stadium
4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,Levi's Stadium
5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,Levi's Stadium


In [32]:
# fill column with 0-4
nfl_frame['Stadium'] = np.arange(5) 
nfl_frame

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East,0
2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,1
3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,2
4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,3
5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,4


In [33]:
# make a series with 2 values and define indexes for them
stadiums = pd.Series(["Levi's Stadium","AT&T"],index=[4,0])
stadiums

4    Levi's Stadium
0              AT&T
dtype: object

In [34]:
# add this series to data frame
nfl_frame['Stadium'] = stadiums
nfl_frame

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference,Stadium
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East,
2,Chicago Bears,752,563,42,0.57,1920,1357,NFC North,
3,Green Bay Packers,741,561,37,0.567,1921,1339,NFC North,
4,Miami Dolphins,443,345,4,0.562,1966,792,AFC East,Levi's Stadium
5,Baltimore Ravens,182,143,1,0.56,1996,326,AFC North,


In [35]:
#delete a column
del nfl_frame['Stadium']

In [36]:
nfl_frame.head(1)

Unnamed: 0_level_0,Team,Won,Lost,Tied*,Pct.,First Season,Total Games,Conference
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Dallas Cowboys,511,378,6,0.574,1960,894,NFC East


In [37]:
# build a dataframe out of a dictionary of equal-length lists
# each dictionary key turns into a column label

di = dict(City=['SF','LA','NYC'], Pop=[837000,380000,840000])
city_frame = pd.DataFrame(data=di)
city_frame

Unnamed: 0,City,Pop
0,SF,837000
1,LA,380000
2,NYC,840000


### Pandas - indexing values in series

In [38]:
#create a series, with defined index labels
my_ser = pd.Series([1,2,3,4],index=['A','B','C','D'])
my_ser

A    1
B    2
C    3
D    4
dtype: int64

In [39]:
# get index labels and put them in a new object
my_index = my_ser.index 
my_index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [40]:
my_index[2] # show 3rd value of index

'C'

In [41]:
my_index[2:] # show 3rd value of index and everything after

Index(['C', 'D'], dtype='object')

In [42]:
# this will error because indexes are not mutable
# this makes dataframes more stable
# you can however re-index dataframes (next part)

#my_index[0] = 'Z'  

### Pandas - re-indexing values in series

In [43]:
# create series ser1, with index labels defined
ser1 = pd.Series([1,2,3,4],index=['A','B','C','D'])
ser1

A    1
B    2
C    3
D    4
dtype: int64

In [44]:
# re-index ser1 with more index values
# this will create auto-NaNs, because ser1 has no values for the new labels 'E' and 'F'

ser2 = ser1.reindex(['A','B','C','D','E','F'])
ser2

A     1
B     2
C     3
D     4
E   NaN
F   NaN
dtype: float64

In [45]:
# re-index, but with a default fill value for newly added NaNs
# old NaNs will stay intact

ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0)

A     1
B     2
C     3
D     4
E   NaN
F   NaN
G     0
dtype: float64

In [46]:
ser3 = pd.Series(['USA','Mexico','Canada'],index=[0,5,10])
ser3

0        USA
5     Mexico
10    Canada
dtype: object

In [47]:
# forward fill ser3 with values from 0 to 14
# this puts together 0-14 with values from ser3 
# fills in the gaps between 0-5-10

ser3.reindex(range(15),method='ffill')

0        USA
1        USA
2        USA
3        USA
4        USA
5     Mexico
6     Mexico
7     Mexico
8     Mexico
9     Mexico
10    Canada
11    Canada
12    Canada
13    Canada
14    Canada
dtype: object

In [48]:
# Options for the method argument of reindex(i,method=)

# {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}

#  Method to use for filling holes in reindexed DataFrame:
#* default:      don't fill gaps
#* pad / ffill:  propagate last valid observation forward to next valid
#* backfill / bfill:   use next valid observation to fill gap
#* nearest:     use nearest valid observations to fill gap

### Pandas - re-indexing values of dataframes

In [49]:
# make dataframe from array of 25 random numbers
# reshape the array into 5x5
# add names to the columns

dframe = pd.DataFrame(np.random.randn(25).reshape((5,5)),
                     index=['A','B','D','E','F'],
                     columns=['col1','col2','col3','col4','col5'])
dframe

Unnamed: 0,col1,col2,col3,col4,col5
A,0.202346,-0.230593,-0.081465,-0.121477,-0.681114
B,-0.112069,-1.349823,1.029314,0.692625,0.576252
D,1.116772,-0.080037,1.434579,0.316476,0.223614
E,-0.587972,0.430111,0.935182,0.239948,-0.652586
F,1.338595,0.393606,-2.038913,-0.097784,-1.345371


In [50]:
# re-index dframe into dframe2; the new label C becomes a row of NaNs
dframe2 = dframe.reindex(['A','B','C','D','E','F'])
dframe2

Unnamed: 0,col1,col2,col3,col4,col5
A,0.202346,-0.230593,-0.081465,-0.121477,-0.681114
B,-0.112069,-1.349823,1.029314,0.692625,0.576252
C,,,,,
D,1.116772,-0.080037,1.434579,0.316476,0.223614
E,-0.587972,0.430111,0.935182,0.239948,-0.652586
F,1.338595,0.393606,-2.038913,-0.097784,-1.345371


In [51]:
# re-index the columns in dframe2, col6 will be all NaNs

new_columns = ['col1','col2','col3','col4','col5','col6']

dframe2 = dframe2.reindex(columns=new_columns)
dframe2

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,0.202346,-0.230593,-0.081465,-0.121477,-0.681114,
B,-0.112069,-1.349823,1.029314,0.692625,0.576252,
C,,,,,,
D,1.116772,-0.080037,1.434579,0.316476,0.223614,
E,-0.587972,0.430111,0.935182,0.239948,-0.652586,
F,1.338595,0.393606,-2.038913,-0.097784,-1.345371,


In [52]:
# alternatively we can reindex rows and columns in a single call
# (.ix has been removed from pandas, and .loc raises KeyError for the
# missing labels 'C' and 'col6', so reindex is the right tool here)

dframe.reindex(index=['A','B','C','D','E','F'], columns=new_columns)

Unnamed: 0,col1,col2,col3,col4,col5,col6
A,0.202346,-0.230593,-0.081465,-0.121477,-0.681114,
B,-0.112069,-1.349823,1.029314,0.692625,0.576252,
C,,,,,,
D,1.116772,-0.080037,1.434579,0.316476,0.223614,
E,-0.587972,0.430111,0.935182,0.239948,-0.652586,
F,1.338595,0.393606,-2.038913,-0.097784,-1.345371,


### Pandas - dropping values from a series

In [53]:
#make a series of range(3)
#label indexes

ser1 = pd.Series(np.arange(3),index=['a','b','c'])
ser1

a    0
b    1
c    2
dtype: int32

In [54]:
ser1.drop('b')

a    0
c    2
dtype: int32

### Pandas - dropping rows and columns in dataframes

In [55]:
# build a 3x3 frame holding the integers 0 through 8
# rows are labelled with cities, columns with attribute names

grid = np.arange(9).reshape(3, 3)
df1 = pd.DataFrame(data=grid,
                   columns=['pop','size','year'],
                   index=['SF','LA','NY'])
df1

Unnamed: 0,pop,size,year
SF,0,1,2
LA,3,4,5
NY,6,7,8


In [56]:
#drop a row
df1 = df1.drop('LA')
df1

Unnamed: 0,pop,size,year
SF,0,1,2
NY,6,7,8


In [57]:
# drop a column
# rows are axis=0 (the default)
# columns are axis=1

df1 = df1.drop('year', axis=1)
df1

Unnamed: 0,pop,size
SF,0,1
NY,6,7


In [58]:
#another syntax for deleting a column

del df1['size']
df1

Unnamed: 0,pop
SF,0
NY,6


### Pandas - selecting and filtering values in a series

In [59]:
#create a series of 3 values, from 0 to 2
#add labels for index
#multiply values of series times 2

ser3 = pd.Series(np.arange(3),index=['a','b','c'])
ser3 = ser3*2
ser3

a    0
b    2
c    4
dtype: int32

In [60]:
#select 2nd value by label/name of index

ser3['b']

2

In [61]:
#select 2nd value by position
# .iloc is explicit about positional access; plain ser3[1] on a
# label-indexed series relies on a deprecated positional fallback

ser3.iloc[1]

2

In [62]:
#select 1st and 2nd value by the label/name of index

ser3[['a','b']]

a    0
b    2
dtype: int32

In [63]:
# select all values from ser3 larger than 3

ser3[ser3 > 3]

c    4
dtype: int32

In [64]:
# using selection, we can also set values
# set all values higher than 3 to be 10

ser3[ser3 > 3] = 10
ser3

a     0
b     2
c    10
dtype: int32

### Pandas - selecting and filtering rows and columns in a dataframe

In [66]:
# 5x5 frame holding the integers 0-24
# index labels are cities, column labels are the letters A-E

dframe = pd.DataFrame(data=np.arange(25).reshape(5, 5),
                      columns=list('ABCDE'),
                      index=['NYC','LA','SF','DC','CHI'])
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
CHI,20,21,22,23,24


In [67]:
# select a column, by label

dframe['B']

NYC     1
LA      6
SF     11
DC     16
CHI    21
Name: B, dtype: int32

In [68]:
# select a column, by label, alternate syntax
# can be used if column name has no spaces

dframe.B

NYC     1
LA      6
SF     11
DC     16
CHI    21
Name: B, dtype: int32

In [110]:
# select 2 columns, by label

dframe[['B','E']]

Unnamed: 0,B,E
NYC,1,4
LA,6,9
SF,11,14
DC,16,19
CHI,21,24


In [70]:
# select every row where column C is greater than 8
# refer to column C by label

dframe[dframe['C'] > 8]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
CHI,20,21,22,23,24


In [71]:
# select every row where column C is greater than 8
# refer to column C by label
# alt syntax

dframe[dframe.C > 8]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
CHI,20,21,22,23,24


In [112]:
# make boolean dataframe of every cell with value above 10

dframe > 10

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
CHI,True,True,True,True,True


In [72]:
# select an index (row) by label
# .loc is the label-based indexer (.ix has been removed from pandas)

dframe.loc['LA']

A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32

In [114]:
# select an index (row) by position
# .iloc is the position-based indexer (.ix has been removed from pandas)

dframe.iloc[1]

A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32

### Pandas - adding two series of different lengths

In [115]:
# two letter-labelled series of different lengths
# ser2 carries one extra label ('D') that ser1 lacks

ser1 = pd.Series(data=[0, 1, 2], index=list('ABC'))
ser2 = pd.Series(data=[3, 4, 5, 6], index=list('ABCD'))

In [116]:
# we add the two series
# we get a null for the 4th value of ser2

ser1 + ser2

A     3
B     5
C     7
D   NaN
dtype: float64

### Pandas - adding two dataframes of different sizes

In [79]:
# make first dataframe from np array of 4 values, 0 to 3
# reshape np array to 2x2
# give labels to columns, from 'AB' converted to list
# give labels to index

dframe1 = pd.DataFrame(np.arange(4).reshape((2,2)),
                      columns=list('AB'),
                      index=['NYC','LA'])
dframe1

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [80]:
# make second dataframe from np array of 9 values, 0 to 8
# reshape np array to 3x3
# give labels to columns, from 'ADC' converted to list
# give labels to index

dframe2 = pd.DataFrame(np.arange(9).reshape((3,3)),
                      columns=list('ADC'),
                      index=['NYC','LA','SF'])
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
LA,3,4,5
SF,6,7,8


In [81]:
# add the 2 dataframes together
# NaN + any other type = NaN
# we get a NaN for each int + NaN scenario

dframe1 + dframe2

Unnamed: 0,A,B,C,D
LA,5.0,,,
NYC,0.0,,,
SF,,,,


In [82]:
# we can use the .add() function to fill 0's for NaNs
# every NaN + int becomes 0 + int now
# we avoid the int + NaN scenarios
# only NaN + NaN will stay as NaN

dframe1.add(dframe2,fill_value=0)

Unnamed: 0,A,B,C,D
LA,5,3.0,5,4
NYC,0,1.0,2,1
SF,6,,8,7


### Pandas - adding a series to a dataframe

In [90]:
# create ser3 from 0th row of dframe2
# .iloc selects by position (.ix has been removed from pandas)

ser3 = dframe2.iloc[0]
ser3

A    0
D    1
C    2
Name: NYC, dtype: int32

In [92]:
dframe2 - ser3

Unnamed: 0,A,D,C
NYC,0,0,0
LA,3,3,3
SF,6,6,6


### Pandas - sorting a series by its index

In [93]:
ser1 = pd.Series(range(3),
                index=['C','A','B'])
ser1

C    0
A    1
B    2
dtype: int32

In [94]:
ser1.sort_index()

A    1
B    2
C    0
dtype: int32

### Pandas - sorting a series by its values

In [95]:
ser2 = pd.Series(range(3),
                index=['C','A','B'])
ser2

C    0
A    1
B    2
dtype: int32

In [97]:
ser2.sort_values()

C    0
A    1
B    2
dtype: int32

### Pandas - ranking a series

In [105]:
# make series with 10 random numbers
# sort it according to values

ser3 = pd.Series(np.random.randn(10))
ser3.sort_values()

3   -2.289520
4   -1.191415
7   -0.377753
1   -0.143098
6   -0.048558
0    0.101485
5    0.480461
8    1.083718
2    1.297420
9    2.848786
dtype: float64

In [108]:
# sort_values() actually sorts the rank of each value
# use rank() to show only the ranks

ser3.rank()

0     6
1     4
2     9
3     1
4     2
5     7
6     5
7     3
8     8
9    10
dtype: float64

In [111]:
# re-run to get more random values (thus random ranks)

pd.Series(np.random.randn(10)).rank()

0     3
1     7
2     9
3     5
4     6
5     2
6     8
7     4
8     1
9    10
dtype: float64

### Pandas - max, min and sum for dataframes

In [5]:
# make a 2x3 matrix with 2 NaNs

arr = np.array([[1,2,np.nan],[np.nan,3,4]])
arr

array([[  1.,   2.,  nan],
       [ nan,   3.,   4.]])

In [6]:
# make a dataframe from arr

dframe1 = pd.DataFrame(arr,
                      index=['A','B'],
                      columns=['one','two','three'])
dframe1

Unnamed: 0,one,two,three
A,1.0,2,
B,,3,4.0


In [7]:
# sum up values of each column
# will ignore NaNs (null values) 
# will treat NaNs as 0s

dframe1.sum()

one      1
two      5
three    4
dtype: float64

In [8]:
# sum up values of each row
# will ignore NaNs (null values) 
# will treat NaNs as 0s

dframe1.sum(axis=1)

A    3
B    7
dtype: float64

In [9]:
dframe1

Unnamed: 0,one,two,three
A,1.0,2,
B,,3,4.0


In [24]:
# show each column's minimum value
# will ignore NaNs

dframe1.min()

one      1
two      2
three    4
dtype: float64

In [25]:
# show index label of each column's minimum value 

dframe1.idxmin()

one      A
two      A
three    B
dtype: object

In [27]:
# cumulative sum

dframe1.cumsum()

Unnamed: 0,one,two,three
A,1.0,2,
B,,5,4.0


### Pandas - summary statistics (via describe)

In [27]:
# describe() summarises every column in one call:
# count, mean, std dev, min, quartiles and max

values = np.array([[1, 2, np.nan],
                   [np.nan, 3, 4]])
dframe1 = pd.DataFrame(data=values,
                       columns=['one','two','three'],
                       index=['A','B'])

dframe1.describe()

Unnamed: 0,one,two,three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0


### Pandas - covariance and correlation

In [1]:
# Explanation of covariance
# YouTubeVideo('xGbpuFNR1ME')

In [2]:
# Explanation of correlation
# YouTubeVideo('4EXNedimDMs')

In [3]:
# Download closing price info of 3 stocks from Yahoo
# into dataframe

prices = data.get_data_yahoo(['CVX','XOM','BP'],
                             start=datetime.datetime(2010,1,1),
                             end=datetime.datetime(2015,12,30),
                            )['Adj Close']
prices.tail(3)

Unnamed: 0_level_0,BP,CVX,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-22,30.790001,90.269997,77.650002
2015-12-23,32.349998,93.809998,80.190002
2015-12-24,32.119999,92.050003,79.330002
2015-12-28,31.709999,90.360001,78.739998
2015-12-29,31.870001,91.25,79.160004


In [4]:
# get dataframe dimensions

prices.shape

(1508, 3)

In [5]:
# Download volume info of 3 stocks from Yahoo
# into dataframe

volume = data.get_data_yahoo(['CVX','XOM','BP'],
                             start=datetime.datetime(2010,1,1),
                             end=datetime.datetime(2015,12,30),
                            )['Volume']
volume.tail(3)

Unnamed: 0_level_0,BP,CVX,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-22,7186200,9516000,13624300
2015-12-23,10106200,10895500,14963900
2015-12-24,3381000,4998700,5848300
2015-12-28,6164500,6696200,9563100
2015-12-29,7046100,6244500,8817800


In [13]:
# create a dataframe of returns on stock
# pct_change() 

rets = prices.pct_change()
rets.tail(3)

Unnamed: 0_level_0,BP,CVX,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-24,-0.00711,-0.018761,-0.010725
2015-12-28,-0.012765,-0.01836,-0.007437
2015-12-29,0.005046,0.009849,0.005334


In [18]:
# correlation of the stocks
# corr has to be called: without the parentheses the name is bound to the
# method object instead of the correlation matrix

corr = rets.corr()
corr

### Pandas - unique values in a series

In [20]:
ser1 = pd.Series(['w','w','x','z','t','w','w','a','x'])
ser1

0    w
1    w
2    x
3    z
4    t
5    w
6    w
7    a
8    x
dtype: object

In [28]:
# get unique values

ser1.unique()

array(['w', 'x', 'z', 't', 'a'], dtype=object)

In [23]:
# group counts into unique categories

ser1.value_counts()

w    4
x    2
t    1
z    1
a    1
dtype: int64

### Pandas - dealing with missing data in series

In [3]:
data = pd.Series(['one','two',np.nan,'four'])
data

0     one
1     two
2     NaN
3    four
dtype: object

In [4]:
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
data = data.dropna()

In [8]:
data

0     one
1     two
3    four
dtype: object

### Pandas - deleting null values in dataframes

In [2]:
#make a dataframe with NaNs in all but one row
# the last row is three explicit NaNs; the earlier 3*np.nan was a single
# NaN scalar that pandas merely padded out to the row width

dframe = pd.DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan]*3])
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [3]:
# dropna() will delete all rows containing 1 or more NaNs

clean_dframe = dframe.dropna()
clean_dframe

Unnamed: 0,0,1,2
0,1,2,3


In [4]:
# dropna() with how='all'
# will only delete the rows with all NaNs

clean_dframe2 = dframe.dropna(how='all')
clean_dframe2

Unnamed: 0,0,1,2
0,1.0,2.0,3
1,,5.0,6
2,7.0,,9


In [5]:
# dropna() with axis=1
# will drop columns with 1 or more NaNs, instead of rows

clean_dframe3 = dframe.dropna(axis=1)
clean_dframe3

0
1
2
3


In [8]:
# short alias so the rows below stay readable
nan = np.nan

# 4x4 frame with nulls scattered through every row
rows = [
    [1, 2, 3, nan],
    [2, nan, 5, 6],
    [nan, 7, nan, 9],
    [1, nan, nan, nan],
]
dframe2 = pd.DataFrame(data=rows)
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [10]:
# keep only rows with at least 2 non-null values (drops rows with fewer than 2)
dframe2.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [12]:
# keep only rows with at least 3 non-null values (drops rows with fewer than 3)
dframe2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1,2.0,3,
1,2,,5,6.0


In [13]:
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


### Pandas - filling null values in dataframes

In [28]:
# make a placeholder NaN variable

nan = np.nan

# make a new dataframe with null values

dframe2 = pd.DataFrame([[1,2,3,nan],
                       [2,nan,5,6],
                       [nan,7,nan,9],
                       [1,nan,nan,nan]])
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [29]:
# fill all null values with a 1

dframe2.fillna(1)

Unnamed: 0,0,1,2,3
0,1,2,3,1
1,2,1,5,6
2,1,7,1,9
3,1,1,1,1


In [30]:
# fill different NaNs with different values
# define the values via a dictionary

rules = {0:0,1:1,2:2,3:3}

dframe2.fillna(rules)

Unnamed: 0,0,1,2,3
0,1,2,3,3
1,2,1,5,6
2,0,7,2,9
3,1,1,2,3


In [None]:
# use inplace=True to apply the change directly to the dataframe

dframe2.fillna(rules,inplace=True)

### Pandas - series with multiple indexes

In [31]:
# create a series with more than 1 index list
# pandas will create a hierarchy of indexes

ser = pd.Series(np.random.randn(6), 
                index = [[1,1,1,2,2,2],['a','b','c','a','b','c']])
ser

1  a   -0.070793
   b    1.432053
   c   -0.106779
2  a    1.899275
   b   -0.794825
   c   -2.431931
dtype: float64

In [41]:
# show the index levels

ser.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [40]:
# show everything with index label 1

ser[1]

a   -0.070793
b    1.432053
c   -0.106779
dtype: float64

In [35]:
# show everything with index label 2

ser[2]

a    1.899275
b   -0.794825
c   -2.431931
dtype: float64

In [43]:
# show everything from first index 
# with an 'a' as second index label

ser[:,'a']

1   -0.070793
2    1.899275
dtype: float64

In [44]:
# unstack the inner (second) index level of the series into columns
# this will create a dataframe indexed by the outer level

dframe = ser.unstack()
dframe

Unnamed: 0,a,b,c
1,-0.070793,1.432053,-0.106779
2,1.899275,-0.794825,-2.431931


### Pandas - multiple indexes and hierarchy in dataframes

In [7]:
# 4x4 frame whose rows and columns each carry a two-level index

row_levels = [['a','a','b','b'], [1,2,1,2]]
col_levels = [['NY','NY','LA','SF'], ['cold','hot','hot','cold']]
dframe2 = pd.DataFrame(data=np.arange(16).reshape(4, 4),
                       index=row_levels,
                       columns=col_levels)
dframe2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [None]:
# lookup index names
# will return FrozenList([None, None]) because the index levels are unnamed

dframe2.index.names

In [9]:
# give names to row indexes and column indexes

dframe2.index.names = ['INDEX_1','INDEX_2']
dframe2.columns.names = ['Cities','Temp']
dframe2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [10]:
dframe2.index.names

FrozenList(['INDEX_1', 'INDEX_2'])

In [15]:
# swap order of 2 column names with each other
# Temp will now be on top
# axis=1 means we are working with columns

dframe2.swaplevel('Cities','Temp',axis=1)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [25]:
# sort row indexes by sub-level 0 (by INDEX_1)
# sortlevel() has been removed from pandas; sort_index(level=...) replaces it

dframe2.sort_index(level=0)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [26]:
# sort row indexes by sub-level 1 (by INDEX_2)
# sortlevel() has been removed from pandas; sort_index(level=...) replaces it

dframe2.sort_index(level=1)

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
INDEX_1,INDEX_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
b,1,8,9,10,11
a,2,4,5,6,7
b,2,12,13,14,15


In [29]:
# sum values by the 'Temp' column index
# sum(level=...) was removed in pandas 2.0, so group on the level instead;
# the frame is transposed first so the column level can be grouped as a row
# level, then transposed back to the original orientation

dframe2.T.groupby(level='Temp').sum().T

Unnamed: 0_level_0,Temp,cold,hot
INDEX_1,INDEX_2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,2,11,11
b,1,19,19
b,2,27,27


### quick datetime examples

In [None]:
pd.date_range(start=datetime.datetime(2000,1,1),end=datetime.datetime(2000,1,12))

In [None]:
pd.date_range(start=datetime.datetime(2000,1,1),end=datetime.datetime(2000,1,12)).tolist()