**Pandas navigation and munging**

Basic pandas functions including data frame creation, importing, navigation and series selection. Advanced pandas functions including merging, apply, and group by.

In [2]:
import pandas as pd

### Data frame indexing and navigation

In [3]:
# Creating a datframe from scratch
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
                         {'Name': 'Sally', 'Role': 'Course liasion'},
                         {'Name': 'James', 'Role': 'Grader'}])
staff_df = staff_df.set_index('Name')
student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},
                           {'Name': 'Mike', 'School': 'Law'},
                           {'Name': 'Sally', 'School': 'Engineering'}])
student_df = student_df.set_index('Name')

In [11]:
# Reading a csv file into a dataframe (can also be pd.read_excel() or pd.read_table() as appropriate )
df = pd.read_csv('City_Zhvi_AllHomes.csv')

In [15]:
# Viewing the first 5 rows of a dataframe
df.head()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,1996-07,...,2016-10,2016-11,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07
0,6181,New York,NY,New York,Queens,1,,,,,...,616100,622100,626700.0,630300,636800,646200,657400,670800,681000,686400
1,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,2,155000.0,154600.0,154400.0,154200.0,...,598300,604900,609700.0,612400,616400,621800,626000,628900,630900,632000
2,17426,Chicago,IL,Chicago,Cook,3,109700.0,109400.0,109300.0,109300.0,...,210900,212800,215300.0,218200,220400,221100,221800,222400,222900,223400
3,13271,Philadelphia,PA,Philadelphia,Philadelphia,4,50000.0,49900.0,49600.0,49400.0,...,132100,132500,133500.0,134700,135800,136500,136900,137700,138500,138900
4,40326,Phoenix,AZ,Phoenix,Maricopa,5,87200.0,87700.0,88200.0,88400.0,...,201200,203200,205100.0,206600,207900,209100,210000,211800,214100,215800


In [17]:
# Viewing the last 5 rows of a dataframe
df.tail()

Unnamed: 0,RegionID,RegionName,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,1996-07,...,2016-10,2016-11,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07
11251,398292,Town of Wrightstown,WI,Green Bay,Brown,11252,,,,,...,167500,169500,171400.0,173600,176100,177300,177100,177800,178800,179600
11252,398343,Urbana,NY,Corning,Steuben,11253,66900.0,65800.0,65500.0,65100.0,...,152700,154100,153300.0,155100,156500,153100,148100,146600,144200,140900
11253,398496,New Denmark,WI,Green Bay,Brown,11254,,,,,...,200400,202400,204900.0,207100,209200,211000,212400,213700,215300,216600
11254,398839,Angels,CA,,Calaveras,11255,115600.0,116400.0,118000.0,119000.0,...,269100,273100,275500.0,276700,278100,280900,285200,287500,287100,286200
11255,737788,Lebanon Borough,NJ,New York,Hunterdon,11256,143500.0,143200.0,141700.0,140700.0,...,239900,238800,239700.0,241700,241200,238600,235100,232500,235900,241400


In [12]:
# Get a list of columns
df.columns

Index(['RegionID', 'RegionName', 'State', 'Metro', 'CountyName', 'SizeRank',
       '1996-04', '1996-05', '1996-06', '1996-07',
       ...
       '2016-10', '2016-11', '2016-12', '2017-01', '2017-02', '2017-03',
       '2017-04', '2017-05', '2017-06', '2017-07'],
      dtype='object', length=262)

In [14]:
# Accessing a dataframe column by number and creating a new dataframe
df2 = df.iloc[:,:6]

### Data frame sorting and filtering

In [20]:
# Sorting dataframe by column(s)
df.sort_values(by=['State', 'RegionName'], ascending=True, inplace=True)

In [18]:
# Filtering the dataframe by an element in a column and creating a new dataframe
df3 = df[df['Metro']=='San Francisco']

In [19]:
# Accessing a dataframe by row index name and column index name(s)and creating a new series
xSSF = df3.loc[727, '1996-04':]

In [None]:
# see if a pandas column contains a string
dfMetTP2['SampleID'].str.contains('NA12878')

In [21]:
# Accessing the date (re-do)
x['date'] = pd.to_datetime(x.index)

NameError: name 'x' is not defined

### Pivot table

In [None]:
# Cool pivot table example
df = pd.DataFrame({'Account_number':[1,1,2,2,2,3,3], 'Product':['A', 'A', 'A', 'B', 'B','A', 'B']})
df.pivot_table(index='Account_number', columns='Product', aggfunc=len, fill_value=0)

### Merging

In [None]:
pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)
pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)
# left regardless if they're in the overlap
pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)

### Method chaining

In [None]:
(df.where(df['SUMLEV']==50)
    .dropna()
    .set_index(['STNAME','CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))

### Lambda

In [None]:
df.apply(lambda x: np.sum(x[rows]), axis=1)

### Other vector functions

- df.iterrows
- df.iterritems
- zip
- enumerate

### Apply function

In [None]:
import numpy as np
def min_max(row):
    data = row[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    return pd.Series({'min': np.min(data), 'max': np.max(data)})
df.apply(min_max, axis=1)

### Group by

Common workflow for groupby: split data, apply function, then combine results (split, apply, combine function).
Groupby object has agg method (aggregate). This method applies a function to the column or columns of data in the group, and returns the results.

In [28]:
# Need to update table
import pandas as pd
import numpy as np
df = pd.read_csv('census.csv')
df = df[df['SUMLEV']==50]
df

FileNotFoundError: File b'census.csv' does not exist

In [None]:
# Example to get multiple values in a groupby
(df.set_index('STNAME').groupby(level=0)['CENSUS2010POP']
    .agg({'avg': np.average, 'sum': np.sum}))

## Link for  tips on Pandas data manipulation
This [article](https://www.analyticsvidhya.com/blog/2016/01/12-pandas-techniques-python-data-manipulation/) covers the following:
- Boolean indexing
- apply
- imputing missing files
- pivot table
- multi-indexing
- crosstab
- merging data frames
- sorting
- plotting
- cut function for binning
- coding nominal data
- iterating over rows