In [3]:
import pandas as pd
import numpy as np

## MERGING DATAFRAMES

In pandas, we can combine two DataFrames either horizontally (merging) or vertically (concatenating).

Merging two DataFrames horizontally refers to combining them side by side, based on a common key or index. This is typically done using the `merge()` function, where you specify a column (or columns) that both DataFrames share, and the rows are aligned according to matching values in that column. The result is a wider DataFrame with columns from both original DataFrames. For example, if you have one DataFrame with customer IDs and names, and another with customer IDs and purchase amounts, a horizontal merge on the customer ID would give you a single DataFrame with columns for IDs, names, and purchase amounts, with rows matched up where the IDs are the same. The operation is similar to a SQL JOIN (e.g., inner, left, right, or outer join), depending on the value of the `how` parameter.


Concatenating two DataFrames vertically means stacking them on top of each other, adding rows rather than columns. This is done using the concat() function with the default axis setting (axis=0), which appends the rows of one DataFrame to the bottom of another. The columns are aligned by their names, so the DataFrames should ideally have the same or similar column structures. If there are differences, pandas will fill in missing values with NaN. For instance, if you have two DataFrames with the same columns (say, "name" and "age"), concatenating them vertically would result in a taller DataFrame with all the rows from both, one after the other.



### Merging

In [8]:
# Two small example tables, each indexed by person name.
# 'James' is the only name present in both, which is what makes the
# join examples that follow interesting.
staff = pd.DataFrame({
    'name': ['Kelly', 'James'],
    'role': ['Director of HR', 'Grader'],
}).set_index('name')

students = pd.DataFrame({
    'name': ['Joseph', 'Mary', 'James'],
    'school': ['Computer Science', 'Law', 'Engineering'],
}).set_index('name')

# Plain-text dump of both frames, one after the other.
print(staff, students)

                 role
name                 
Kelly  Director of HR
James          Grader                   school
name                    
Joseph  Computer Science
Mary                 Law
James        Engineering


In [9]:
# Horizontal merge on the index with how='outer': every name from either
# table is kept (a union), and the side that has no match is left empty.
staff.merge(students, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,role,school
name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Engineering
Joseph,,Computer Science
Kelly,Director of HR,
Mary,,Law


In [10]:
# Same index-based merge, but how='inner' keeps only the names that
# appear in both tables (the intersection).
staff.merge(students, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,role,school
name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Engineering


In [11]:
# Alternative: join on a shared column instead of on the index.
# reset_index() turns 'name' back into an ordinary column. The rebinding of
# both variables is deliberate: the concatenation example further down
# works on these flat frames.
staff, students = staff.reset_index(), students.reset_index()

staff.merge(students, how='outer', on='name')

Unnamed: 0,name,role,school
0,James,Grader,Engineering
1,Joseph,,Computer Science
2,Kelly,Director of HR,
3,Mary,,Law


### Concatenating

In [14]:
# Stack the two frames vertically. The mapping's keys become the outer
# index level, labelling which table each row came from; a column that
# exists in only one frame (role / school) is left empty for the other.
pd.concat({'staff': staff, 'student': students})

Unnamed: 0,Unnamed: 1,name,role,school
staff,0,Kelly,Director of HR,
staff,1,James,Grader,
student,0,Joseph,,Computer Science
student,1,Mary,,Law
student,2,James,,Engineering


## PANDAS IDIOMS

In [5]:
# Load the census table into `df`; the file is decoded as latin-1.
df = pd.read_csv('datasets/census.csv', encoding='latin-1')
# Preview only the first rows instead of rendering the whole table.
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


### Method chaining

In [6]:
# Method chaining: the whole pipeline is one parenthesised expression.
#   1. where()     - blank out (NaN) every row whose SUMLEV is not 50
#   2. dropna()    - discard the rows that now contain NaN
#   3. set_index() - index by (state name, county name)
#   4. rename()    - give one column a friendlier label
# Because where() introduces NaN before the rows are dropped, the integer
# columns show up as floats (50.0, 54571.0, ...) in the output below.
# Author's note: this form was reported to run slower than writing the
# steps line by line; no timing is recorded in this notebook.
(
    df.where(df['SUMLEV'].eq(50))
    .dropna()
    .set_index(['STNAME', 'CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010': 'Estimate base 2010'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimate base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54571.0,54660.0,55253.0,55175.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183193.0,186659.0,190396.0,...,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27457.0,27341.0,27226.0,27159.0,...,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22919.0,22861.0,22733.0,22642.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57373.0,57711.0,57776.0,...,1.807375,-1.177622,-1.748766,-2.062535,-1.369970,1.859511,-0.848580,-1.402476,-1.577232,-0.884411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyoming,Sweetwater County,50.0,4.0,8.0,56.0,37.0,43806.0,43806.0,43593.0,44041.0,45104.0,...,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195
Wyoming,Teton County,50.0,4.0,8.0,56.0,39.0,21294.0,21294.0,21297.0,21482.0,21697.0,...,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
Wyoming,Uinta County,50.0,4.0,8.0,56.0,41.0,21118.0,21118.0,21102.0,20912.0,20989.0,...,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351
Wyoming,Washakie County,50.0,4.0,8.0,56.0,43.0,8533.0,8533.0,8545.0,8469.0,8443.0,...,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961
