# Dataframes and Pandas

## 0.0 Import Data. This sheet will use Pennsylvania 2012 election results.

In [6]:
import pandas as pd
election = pd.read_csv('election_penn_2012.csv', index_col='county')
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118


## 1.0 Indexing

loc vs iloc

## 2.0 Slicing Dataframes

You can slice rows and column by their named values with "loc."
You can slice rows and column by their integer values with "iloc."

In [9]:
# selecting columns with loc:
left_columns = election.loc[:,'state':'Obama']
left_columns # using {all rows, columns from state to Obama columns}

Unnamed: 0_level_0,state,total,Obama
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,PA,41973,35.482334
Allegheny,PA,614671,56.640219
Armstrong,PA,28322,30.696985
Beaver,PA,80015,46.032619
Bedford,PA,21444,22.057452
...,...,...,...
Washington,PA,90078,42.744066
Wayne,PA,20966,38.815225
Westmoreland,PA,168709,37.567646
Wyoming,PA,11214,42.910647


In [10]:
# selecting the middle columns
middle_columns = election.loc[:,'Obama':'winner']
middle_columns # using {all rows, columns from Obama to winner columns}

Unnamed: 0_level_0,Obama,Romney,winner
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,35.482334,63.112001,Romney
Allegheny,56.640219,42.185820,Obama
Armstrong,30.696985,67.901278,Romney
Beaver,46.032619,52.637630,Romney
Bedford,22.057452,76.986570,Romney
...,...,...,...
Washington,42.744066,56.012567,Romney
Wayne,38.815225,59.768196,Romney
Westmoreland,37.567646,61.306154,Romney
Wyoming,42.910647,55.189941,Romney


In [11]:
right_columns = election.loc[:,'Romney':'voters']
right_columns # using {all rows, columns from Romney to voters columns}

Unnamed: 0_level_0,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,63.112001,Romney,61156
Allegheny,42.185820,Obama,924351
Armstrong,67.901278,Romney,42147
Beaver,52.637630,Romney,115157
Bedford,76.986570,Romney,32189
...,...,...,...
Washington,56.012567,Romney,142331
Wayne,59.768196,Romney,32577
Westmoreland,61.306154,Romney,238006
Wyoming,55.189941,Romney,17255


In [18]:
# subselection of cells within the dataframe
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']

# Create the list of column labels: cols
cols = ['winner','Obama','Romney']

# Create the new DataFrame: three_counties
# the new dataframe consists of the two lists separated by a comma (as opposed to string
# column names when not using lists)
three_counties = election.loc[rows,cols]

# Print the three_counties DataFrame
print(three_counties)

              winner      Obama     Romney
county                                    
Philadelphia   Obama  85.224251  14.051451
Centre        Romney  48.948416  48.977486
Fulton        Romney  21.096291  77.748861


## 3.0 Filtering Dataframes

In [20]:
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118


In [22]:
# create the boolean array (series is a better term): high_turnout
high_turnout = election.turnout >= 70
high_turnout

county
Adams           False
Allegheny       False
Armstrong       False
Beaver          False
Bedford         False
                ...  
Washington      False
Wayne           False
Westmoreland     True
Wyoming         False
York            False
Name: turnout, Length: 67, dtype: bool

In [23]:
# apply the series to the original dataframe
election[high_turnout]
# the output shows the county with greater than 70% turnout

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bucks,PA,319407,49.96697,48.801686,Obama,435606,73.324748,1.165284
Butler,PA,88924,31.920516,66.816607,Romney,122762,72.436096,34.896091
Chester,PA,248295,49.228539,49.650617,Romney,337822,73.498766,0.422079
Forest,PA,2308,38.734835,59.835355,Romney,3232,71.410891,21.10052
Franklin,PA,62802,30.110506,68.583803,Romney,87406,71.850903,38.473297
Montgomery,PA,401787,56.637223,42.286834,Obama,551105,72.905708,14.35039
Westmoreland,PA,168709,37.567646,61.306154,Romney,238006,70.884347,23.738508


In [24]:
# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election.margin < 1

In [27]:
too_close

county
Adams           False
Allegheny       False
Armstrong       False
Beaver          False
Bedford         False
                ...  
Washington      False
Wayne           False
Westmoreland    False
Wyoming         False
York            False
Name: margin, Length: 67, dtype: bool

In [28]:
# Assign np.nan to the 'winner' column where the results were too close to call
election.winner[too_close] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
election

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667
Allegheny,PA,614671.0,56.640219,42.185820,Obama,924351.0,66.497575,14.454399
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.198140,37.204293
Beaver,PA,80015.0,46.032619,52.637630,Romney,115157.0,69.483401,6.605012
Bedford,PA,21444.0,22.057452,76.986570,Romney,32189.0,66.619031,54.929118
...,...,...,...,...,...,...,...,...
Washington,PA,90078.0,42.744066,56.012567,Romney,142331.0,63.287689,13.268501
Wayne,PA,20966.0,38.815225,59.768196,Romney,32577.0,64.358290,20.952971
Westmoreland,PA,168709.0,37.567646,61.306154,Romney,238006.0,70.884347,23.738508
Wyoming,PA,11214.0,42.910647,55.189941,Romney,17255.0,64.989858,12.279294


In [36]:
election[['state','total']]

Unnamed: 0_level_0,state,total
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Adams,PA,41973.0
Allegheny,PA,614671.0
Armstrong,PA,28322.0
Beaver,PA,80015.0
Bedford,PA,21444.0
...,...,...
Washington,PA,90078.0
Wayne,PA,20966.0
Westmoreland,PA,168709.0
Wyoming,PA,11214.0


## 4.0 Transforming Dataframes

In [37]:
# transforming a column with "apply"
# The .apply() method can be used on a pandas DataFrame
# to apply an arbitrary Python function to every element. 

# use the .apply method to apply a functions output across a column
# FUTURE - create some examples for this dataset.

# use the .map method to transform values based on a dictionary lookup
# Note: both use expensive FOR loops behind the scenes to make the computations

In [40]:
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama':'Blue', 'Romney':'Red'}

In [41]:
red_vs_blue

{'Obama': 'Blue', 'Romney': 'Red'}

In [42]:
election['color'] = election.winner.map(red_vs_blue)

In [43]:
election
# now there is a new column with our values

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667,Red
Allegheny,PA,614671.0,56.640219,42.185820,Obama,924351.0,66.497575,14.454399,Blue
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.198140,37.204293,Red
Beaver,PA,80015.0,46.032619,52.637630,Romney,115157.0,69.483401,6.605012,Red
Bedford,PA,21444.0,22.057452,76.986570,Romney,32189.0,66.619031,54.929118,Red
...,...,...,...,...,...,...,...,...,...
Washington,PA,90078.0,42.744066,56.012567,Romney,142331.0,63.287689,13.268501,Red
Wayne,PA,20966.0,38.815225,59.768196,Romney,32577.0,64.358290,20.952971,Red
Westmoreland,PA,168709.0,37.567646,61.306154,Romney,238006.0,70.884347,23.738508,Red
Wyoming,PA,11214.0,42.910647,55.189941,Romney,17255.0,64.989858,12.279294,Red


In [44]:
# vectorizing functions - these do computation as compile code speeds.
# these consist of UFUNCS (Universal Functions in Numpy)

In [45]:
# Import zscore from scipy.stats
from scipy.stats import zscore

In [51]:
turnout_zscore = zscore(election['turnout'])
turnout_zscore # there is some weird error here..

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan])

In [52]:
# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

In [50]:
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color,turnout_zscore
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667,Red,
Allegheny,PA,614671.0,56.640219,42.18582,Obama,924351.0,66.497575,14.454399,Blue,
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.19814,37.204293,Red,
Beaver,PA,80015.0,46.032619,52.63763,Romney,115157.0,69.483401,6.605012,Red,
Bedford,PA,21444.0,22.057452,76.98657,Romney,32189.0,66.619031,54.929118,Red,
