# Dataframes and Pandas

## 0.0 Import Data. This sheet will use Pennsylvania 2012 election results.

In [6]:
import pandas as pd
election = pd.read_csv('election_penn_2012.csv', index_col='county')
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118


## 1.0 Indexing

loc vs iloc

## 2.0 Slicing Dataframes

You can slice rows and column by their named values with "loc."
You can slice rows and column by their integer values with "iloc."

In [9]:
# selecting columns with loc:
left_columns = election.loc[:,'state':'Obama']
left_columns # using {all rows, columns from state to Obama columns}

Unnamed: 0_level_0,state,total,Obama
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,PA,41973,35.482334
Allegheny,PA,614671,56.640219
Armstrong,PA,28322,30.696985
Beaver,PA,80015,46.032619
Bedford,PA,21444,22.057452
...,...,...,...
Washington,PA,90078,42.744066
Wayne,PA,20966,38.815225
Westmoreland,PA,168709,37.567646
Wyoming,PA,11214,42.910647


In [10]:
# selecting the middle columns
middle_columns = election.loc[:,'Obama':'winner']
middle_columns # using {all rows, columns from Obama to winner columns}

Unnamed: 0_level_0,Obama,Romney,winner
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,35.482334,63.112001,Romney
Allegheny,56.640219,42.185820,Obama
Armstrong,30.696985,67.901278,Romney
Beaver,46.032619,52.637630,Romney
Bedford,22.057452,76.986570,Romney
...,...,...,...
Washington,42.744066,56.012567,Romney
Wayne,38.815225,59.768196,Romney
Westmoreland,37.567646,61.306154,Romney
Wyoming,42.910647,55.189941,Romney


In [11]:
right_columns = election.loc[:,'Romney':'voters']
right_columns # using {all rows, columns from Romney to voters columns}

Unnamed: 0_level_0,Romney,winner,voters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adams,63.112001,Romney,61156
Allegheny,42.185820,Obama,924351
Armstrong,67.901278,Romney,42147
Beaver,52.637630,Romney,115157
Bedford,76.986570,Romney,32189
...,...,...,...
Washington,56.012567,Romney,142331
Wayne,59.768196,Romney,32577
Westmoreland,61.306154,Romney,238006
Wyoming,55.189941,Romney,17255


In [18]:
# subselection of cells within the dataframe
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']

# Create the list of column labels: cols
cols = ['winner','Obama','Romney']

# Create the new DataFrame: three_counties
# the new dataframe consists of the two lists separated by a comma (as opposed to string
# column names when not using lists)
three_counties = election.loc[rows,cols]

# Print the three_counties DataFrame
print(three_counties)

              winner      Obama     Romney
county                                    
Philadelphia   Obama  85.224251  14.051451
Centre        Romney  48.948416  48.977486
Fulton        Romney  21.096291  77.748861


## 3.0 Filtering Dataframes

In [20]:
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482334,63.112001,Romney,61156,68.632677,27.629667
Allegheny,PA,614671,56.640219,42.18582,Obama,924351,66.497575,14.454399
Armstrong,PA,28322,30.696985,67.901278,Romney,42147,67.19814,37.204293
Beaver,PA,80015,46.032619,52.63763,Romney,115157,69.483401,6.605012
Bedford,PA,21444,22.057452,76.98657,Romney,32189,66.619031,54.929118


In [22]:
# create the boolean array (series is a better term): high_turnout
high_turnout = election.turnout >= 70
high_turnout

county
Adams           False
Allegheny       False
Armstrong       False
Beaver          False
Bedford         False
                ...  
Washington      False
Wayne           False
Westmoreland     True
Wyoming         False
York            False
Name: turnout, Length: 67, dtype: bool

In [23]:
# apply the series to the original dataframe
election[high_turnout]
# the output shows the county with greater than 70% turnout

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bucks,PA,319407,49.96697,48.801686,Obama,435606,73.324748,1.165284
Butler,PA,88924,31.920516,66.816607,Romney,122762,72.436096,34.896091
Chester,PA,248295,49.228539,49.650617,Romney,337822,73.498766,0.422079
Forest,PA,2308,38.734835,59.835355,Romney,3232,71.410891,21.10052
Franklin,PA,62802,30.110506,68.583803,Romney,87406,71.850903,38.473297
Montgomery,PA,401787,56.637223,42.286834,Obama,551105,72.905708,14.35039
Westmoreland,PA,168709,37.567646,61.306154,Romney,238006,70.884347,23.738508


In [24]:
# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election.margin < 1

In [27]:
too_close

county
Adams           False
Allegheny       False
Armstrong       False
Beaver          False
Bedford         False
                ...  
Washington      False
Wayne           False
Westmoreland    False
Wyoming         False
York            False
Name: margin, Length: 67, dtype: bool

In [28]:
# Assign np.nan to the 'winner' column where the results were too close to call
election.winner[too_close] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
election

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667
Allegheny,PA,614671.0,56.640219,42.185820,Obama,924351.0,66.497575,14.454399
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.198140,37.204293
Beaver,PA,80015.0,46.032619,52.637630,Romney,115157.0,69.483401,6.605012
Bedford,PA,21444.0,22.057452,76.986570,Romney,32189.0,66.619031,54.929118
...,...,...,...,...,...,...,...,...
Washington,PA,90078.0,42.744066,56.012567,Romney,142331.0,63.287689,13.268501
Wayne,PA,20966.0,38.815225,59.768196,Romney,32577.0,64.358290,20.952971
Westmoreland,PA,168709.0,37.567646,61.306154,Romney,238006.0,70.884347,23.738508
Wyoming,PA,11214.0,42.910647,55.189941,Romney,17255.0,64.989858,12.279294


In [36]:
election[['state','total']]

Unnamed: 0_level_0,state,total
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Adams,PA,41973.0
Allegheny,PA,614671.0
Armstrong,PA,28322.0
Beaver,PA,80015.0
Bedford,PA,21444.0
...,...,...
Washington,PA,90078.0
Wayne,PA,20966.0
Westmoreland,PA,168709.0
Wyoming,PA,11214.0


## 4.0 Transforming Dataframes

In [37]:
# transforming a column with "apply"
# The .apply() method can be used on a pandas DataFrame
# to apply an arbitrary Python function to every element. 

# use the .apply method to apply a functions output across a column
# FUTURE - create some examples for this dataset.

# use the .map method to transform values based on a dictionary lookup
# Note: both use expensive FOR loops behind the scenes to make the computations

In [40]:
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama':'Blue', 'Romney':'Red'}

In [41]:
red_vs_blue

{'Obama': 'Blue', 'Romney': 'Red'}

In [42]:
election['color'] = election.winner.map(red_vs_blue)

In [43]:
election
# now there is a new column with our values

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667,Red
Allegheny,PA,614671.0,56.640219,42.185820,Obama,924351.0,66.497575,14.454399,Blue
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.198140,37.204293,Red
Beaver,PA,80015.0,46.032619,52.637630,Romney,115157.0,69.483401,6.605012,Red
Bedford,PA,21444.0,22.057452,76.986570,Romney,32189.0,66.619031,54.929118,Red
...,...,...,...,...,...,...,...,...,...
Washington,PA,90078.0,42.744066,56.012567,Romney,142331.0,63.287689,13.268501,Red
Wayne,PA,20966.0,38.815225,59.768196,Romney,32577.0,64.358290,20.952971,Red
Westmoreland,PA,168709.0,37.567646,61.306154,Romney,238006.0,70.884347,23.738508,Red
Wyoming,PA,11214.0,42.910647,55.189941,Romney,17255.0,64.989858,12.279294,Red


In [44]:
# vectorizing functions - these do computation as compile code speeds.
# these consist of UFUNCS (Universal Functions in Numpy)

In [45]:
# Import zscore from scipy.stats
from scipy.stats import zscore

In [51]:
turnout_zscore = zscore(election['turnout'])
turnout_zscore # there is some weird error here..

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan])

In [52]:
# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

In [50]:
election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin,color,turnout_zscore
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Adams,PA,41973.0,35.482334,63.112001,Romney,61156.0,68.632677,27.629667,Red,
Allegheny,PA,614671.0,56.640219,42.18582,Obama,924351.0,66.497575,14.454399,Blue,
Armstrong,PA,28322.0,30.696985,67.901278,Romney,42147.0,67.19814,37.204293,Red,
Beaver,PA,80015.0,46.032619,52.63763,Romney,115157.0,69.483401,6.605012,Red,
Bedford,PA,21444.0,22.057452,76.98657,Romney,32189.0,66.619031,54.929118,Red,


## 6.0 Dataframe Indexes

In [54]:
sales = pd.read_csv('sales.csv')
sales

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52
5,Jun,205,60.0,55


Indexes are immutable objects. This means that if you want to change or modify the index in a DataFrame, then you need to change the whole index. 

In [56]:
sales.index

RangeIndex(start=0, stop=6, step=1)


In [57]:
sales.month.str.upper()

0    JAN
1    FEB
2    MAR
3    APR
4    MAY
5    JUN
Name: month, dtype: object

In [66]:
sales['month'] = sales['month'].apply(lambda x: x.upper())
sales

PRODUCTS,month,eggs,salt,spam
MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,JAN,47,12.0,17
1,FEB,110,50.0,31
2,MAR,221,89.0,72
3,APR,77,87.0,20
4,MAY,132,,52
5,JUN,205,60.0,55


In [67]:
sales.index.name = 'MONTH'

In [68]:
sales

PRODUCTS,month,eggs,salt,spam
MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,JAN,47,12.0,17
1,FEB,110,50.0,31
2,MAR,221,89.0,72
3,APR,77,87.0,20
4,MAY,132,,52
5,JUN,205,60.0,55


In [69]:
sales.columns.name = 'PRODUCTS'

In [70]:
sales

PRODUCTS,month,eggs,salt,spam
MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,JAN,47,12.0,17
1,FEB,110,50.0,31
2,MAR,221,89.0,72
3,APR,77,87.0,20
4,MAY,132,,52
5,JUN,205,60.0,55


In [None]:
# hierarchical indexing

In [75]:
sales2 = pd.read_csv('sales2.csv', index_col='state')

In [76]:
sales2

Unnamed: 0_level_0,month,eggs,salt,spam
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
CA,2,110,50.0,31
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,2,205,60.0,55


## 7.0 Dataframe Pivoting

In [112]:
users = pd.read_csv('users.csv')

In [81]:
users.weekday

Unnamed: 0.1,Unnamed: 0,weekday,city,visitors,signups
0,0,Sun,Austin,139,7
1,1,Sun,Dallas,237,12
2,2,Mon,Austin,326,3
3,3,Mon,Dallas,456,5


In [86]:
# Pivot users pivoted by both signups and visitors: pivot
pivot = users.pivot(index='weekday',columns='city')
pivot

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,visitors,visitors,signups,signups
city,Austin,Dallas,Austin,Dallas,Austin,Dallas
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Mon,2,3,326,456,3,5
Sun,0,1,139,237,7,12


In [84]:
# Pivot the users DataFrame: visitors_pivot
visitors_pivot = users.pivot(index='weekday',columns='city',values='visitors')
visitors_pivot

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,326,456
Sun,139,237


In [89]:
# Pivot users with signups indexed by weekday and city: signups_pivot
signups_pivot = users.pivot(index='weekday',columns='city',values='signups')
signups_pivot

city,Austin,Dallas
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,3,5
Sun,7,12


In [151]:
# Create the DataFrame with the appropriate pivot table: by_city_day
by_city_day = users.pivot_table(index='weekday', columns='city')

In [152]:
# Print by_city_day
print(by_city_day)

        Unnamed: 0        signups        visitors       
city        Austin Dallas  Austin Dallas   Austin Dallas
weekday                                                 
Mon              2      3       3      5      326    456
Sun              0      1       7     12      139    237


In [153]:
# Use a pivot table to display the count of each column: count_by_weekday1
count_by_weekday1 = users.pivot_table(index='weekday', aggfunc='count')

In [154]:
# Print count_by_weekday
print(count_by_weekday1)

         Unnamed: 0  signups  visitors
weekday                               
Mon               2        2         2
Sun               2        2         2


In [155]:
# Replace 'aggfunc='count'' with 'aggfunc=len': count_by_weekday2
count_by_weekday2 = users.pivot_table(index='weekday', aggfunc=len)

In [156]:
# Verify that the same result is obtained
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))

True


In [157]:
# Create the DataFrame with the appropriate pivot table: signups_and_visitors
signups_and_visitors = users.pivot_table(index='weekday', aggfunc=sum)

In [158]:
# Print signups_and_visitors
print(signups_and_visitors)

         Unnamed: 0  signups  visitors
weekday                               
Mon               5        8       782
Sun               1       19       376


In [159]:
# Add in the margins: signups_and_visitors_total 
signups_and_visitors_total = users.pivot_table(index='weekday', aggfunc=sum, margins=True)

In [160]:
# Print signups_and_visitors_total
print(signups_and_visitors_total)

         Unnamed: 0  signups  visitors
weekday                               
Mon               5        8       782
Sun               1       19       376
All               6       27      1158


## 8.0 Stacking and Unstacking Dataframes

In [113]:
users2 = pd.read_csv('users.csv')

In [114]:
type(users2)

pandas.core.frame.DataFrame

In [115]:
users

Unnamed: 0.1,Unnamed: 0,weekday,city,visitors,signups
0,0,Sun,Austin,139,7
1,1,Sun,Dallas,237,12
2,2,Mon,Austin,326,3
3,3,Mon,Dallas,456,5


In [118]:
users2 = users2.set_index(['city', 'weekday'])
users2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,visitors,signups
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austin,Sun,0,139,7
Dallas,Sun,1,237,12
Austin,Mon,2,326,3
Dallas,Mon,3,456,5


In [119]:
users2 = users2.sort_index()

In [120]:
users2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,visitors,signups
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austin,Mon,2,326,3
Austin,Sun,0,139,7
Dallas,Mon,3,456,5
Dallas,Sun,1,237,12


In [129]:
# dropped a column (users2.drop =....) , the old index column:
users2.columns

Index(['Unnamed: 0', 'visitors', 'signups'], dtype='object')

In [133]:
users2

Unnamed: 0_level_0,Unnamed: 1_level_0,visitors,signups
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,3
Austin,Sun,139,7
Dallas,Mon,456,5
Dallas,Sun,237,12


In [135]:
# Unstack users by 'weekday': byweekday
byweekday = users2.unstack(level='weekday')

In [136]:
byweekday

Unnamed: 0_level_0,visitors,visitors,signups,signups
weekday,Mon,Sun,Mon,Sun
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,139,3,7
Dallas,456,237,5,12


In [137]:
print(byweekday.stack(level='weekday'))

                visitors  signups
city   weekday                   
Austin Mon           326        3
       Sun           139        7
Dallas Mon           456        5
       Sun           237       12


In [139]:
# Unstack users by 'city': bycity
bycity = users2.unstack(level='city')

In [140]:
# Print the bycity DataFrame
print(bycity)

        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12


In [141]:
# Stack bycity by 'city' and print it
print(bycity.stack(level='city'))

                visitors  signups
weekday city                     
Mon     Austin       326        3
        Dallas       456        5
Sun     Austin       139        7
        Dallas       237       12


In [142]:
# Stack 'city' back into the index of bycity: newusers
newusers = bycity.stack(level='city')

In [143]:
# Swap the levels of the index of newusers: newusers
newusers = newusers.swaplevel(0, 1)

In [144]:
# Print newusers and verify that the index is not sorted
print(newusers)

                visitors  signups
city   weekday                   
Austin Mon           326        3
Dallas Mon           456        5
Austin Sun           139        7
Dallas Sun           237       12


In [145]:
# Sort the index of newusers: newusers
newusers = newusers.sort_index()

In [146]:
# Print newusers and verify that the index is now sorted
print(newusers)

                visitors  signups
city   weekday                   
Austin Mon           326        3
       Sun           139        7
Dallas Mon           456        5
       Sun           237       12


In [147]:
# Verify that the new DataFrame is equal to the original
print(newusers.equals(users))

False


## 9.0 Melting Dataframes

In [None]:
# FUTURE: this section requires more research. it was terribly presented.

## 10.0 Grouping Data

In [161]:
titanic = pd.read_csv('titanic.csv')

In [162]:
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [164]:
# Group titanic by 'pclass'
by_class = titanic.groupby('pclass')
by_class

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002AFD1515A08>

In [169]:
# Aggregate 'survived' column of by_class by count
count_by_class = by_class.survived.count()
count_by_class

pclass
1    323
2    277
3    709
Name: survived, dtype: int64

In [167]:
# Group titanic by 'embarked' and 'pclass'
by_mult = titanic.groupby(['embarked','pclass'])
by_mult

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002AFD1587948>

In [170]:
# Aggregate 'survived' column of by_mult by count
count_mult = by_mult.survived.count()
count_mult

embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64

In [173]:
# aggregations
# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')

In [174]:
# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]

In [175]:
# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max','median'])

In [176]:
# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])

pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64


In [177]:
# Print the median fare in each class
print(aggregated.loc[:, ('fare','median')])

pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64


In [233]:
# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('sex')

In [234]:
def c_deck_survival(gr):

    c_passengers = gr['cabin'].str.startswith('C').fillna(False)

    return gr.loc[c_passengers, 'survived'].mean()

In [235]:
# Call by_sex.apply with the function c_deck_survival
c_surv_by_sex = by_sex.apply(c_deck_survival)

In [236]:
# Print the survival rates
print(c_surv_by_sex)

sex
female    0.913043
male      0.312500
dtype: float64


In [201]:
## more aggregation using a dictionary
# Read the CSV file into a DataFrame and sort the index: gapminder
gapminder = pd.read_csv('gapminder_tidy.csv', index_col=['Year','region','Country']).sort_index()

In [202]:
gapminder.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fertility,life,population,child_mortality,gdp
Year,region,Country,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964,America,Antigua and Barbuda,4.25,63.775,58653.0,72.78,5008.0
1964,America,Argentina,3.068,65.388,21966478.0,57.43,8227.0
1964,America,Aruba,4.059,67.113,57031.0,,5505.0
1964,America,Bahamas,4.22,64.189,133709.0,48.56,18160.0
1964,America,Barbados,4.094,62.819,234455.0,64.7,5681.0


In [203]:
# Group gapminder by 'Year' and 'region': by_year_region
by_year_region = gapminder.groupby(level=['Year','region'])

by_year_region

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002AFD1589588>

In [204]:
# Define the function to compute spread: spread
def spread(series):
    return series.max() - series.min()

In [205]:
# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}

In [207]:
# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

In [208]:
# Print the last 6 entries of aggregated 
print(aggregated.tail(6))

                                   population  child_mortality       gdp
Year region                                                             
2013 America                     9.629087e+08        17.745833   49634.0
     East Asia & Pacific         2.244209e+09        22.285714  134744.0
     Europe & Central Asia       8.968788e+08         9.831875   86418.0
     Middle East & North Africa  4.030504e+08        20.221500  128676.0
     South Asia                  1.701241e+09        46.287500   11469.0
     Sub-Saharan Africa          9.205996e+08        76.944490   32035.0


In [213]:
## zscores usage
# Import zscore
from scipy.stats import zscore

In [214]:
## more aggregation using a dictionary
# Read the CSV file into a DataFrame and sort the index: gapminder
gp = pd.read_csv('gapminder_tidy.csv', index_col='Country')

In [215]:
gp.head()

Unnamed: 0_level_0,Year,fertility,life,population,child_mortality,gdp,region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [219]:
# Group gapminder_2010: standardized
standardized = gp.groupby('region')['life','fertility'].transform(zscore)

In [220]:
standardized

Unnamed: 0_level_0,life,fertility
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,-2.289170,1.477737
Afghanistan,-2.239195,1.477737
Afghanistan,-2.189513,1.477737
Afghanistan,-2.140025,1.477737
Afghanistan,-2.090927,1.477737
...,...,...
Åland,1.917707,
Åland,1.690350,
Åland,1.544608,
Åland,1.567927,


In [222]:
# Construct a Boolean Series to identify outliers: outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)
outliers

Country
Afghanistan    False
Afghanistan    False
Afghanistan    False
Afghanistan    False
Afghanistan    False
               ...  
Åland          False
Åland          False
Åland          False
Åland          False
Åland          False
Length: 10111, dtype: bool

In [223]:
# Filter gapminder_2010 by the outliers: gm_outliers
gm_outliers = gp.loc[outliers]

In [224]:
# Print gm_outliers
print(gm_outliers)

              Year  fertility    life  population  child_mortality     gdp  \
Country                                                                      
Bolivia       1964      6.607  43.913   3668568.0           265.40  2971.0   
Bolivia       1965      6.593  44.229   3752892.0           260.10  3046.0   
Bolivia       1966      6.586  44.536   3839751.0           254.50  3191.0   
Bolivia       1967      6.585  44.835   3929192.0           248.80  3312.0   
Bolivia       1968      6.587  45.132   4021551.0           243.00  3510.0   
...            ...        ...     ...         ...              ...     ...   
Turkey        1976      4.872  56.050  40446729.0           152.70  9142.0   
Turkmenistan  1964      6.663  56.181   1829697.0           132.98  7952.0   
Yemen, Rep.   1964      7.385  36.068   5527652.0              NaN     NaN   
Yemen, Rep.   1965      7.418  36.957   5632206.0              NaN     NaN   
Yemen, Rep.   1966      7.447  37.827   5737000.0              N

In [229]:
def disparity(gr):
    # Compute the spread of gr['gdp']: s
    s = gr['gdp'].max() - gr['gdp'].min()
    # Compute the z-score of gr['gdp'] as (gr['gdp']-gr['gdp'].mean())/gr['gdp'].std(): z
    z = (gr['gdp'] - gr['gdp'].mean())/gr['gdp'].std()
    # Return a DataFrame with the inputs {'z(gdp)':z, 'regional spread(gdp)':s}
    return pd.DataFrame({'z(gdp)':z , 'regional spread(gdp)':s})

In [230]:
# Group gapminder_2010 by 'region': regional
regional = gp.groupby('region')

In [231]:
regional

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002AFD19E8F48>

In [None]:
# Apply the disparity function on regional: reg_disp
reg_disp = regional.apply(disparity)

In [None]:
# Print the disparity of 'United States', 'United Kingdom', and 'China'
print(reg_disp.loc[['United States','United Kingdom','China']])