# Data Indexing and Selection Examples

2020-09-28 - Jeff Smith

In [None]:
# Getting things ready
import pandas as pd

def show(data, show_data=0):
    """Print a structural summary of a DataFrame-like object.

    Writes the object's type, index, shape and column labels to stdout,
    followed by one indented line per column giving its name and dtype.

    Args:
        data: Object exposing ``index``, ``shape`` and ``columns`` and
            supporting ``data[col]`` lookup (e.g. a pandas DataFrame).
        show_data: When truthy, also print ``data.values`` after the summary.
    """
    print(f"   Type: {type(data)}")
    print(f"  Index: {data.index}")
    print(f"  Shape: {data.shape}")
    print(f"Columns: {data.columns}")
    for name in data.columns:
        column = data[name]
        print(f"    {name} ({column.dtype})")
    if not show_data:
        return
    # Raw underlying values are dumped only on request.
    print(data.values)
        
# Report which pandas version this notebook is running against.
print(f"Pandas version: {pd.__version__}")

# Directory holding the example CSV files; the trailing slash lets file
# names be appended with plain string concatenation.
filepath = "../data/"

# US Economic Data dataset
## Default read_csv()

In [None]:
# Start with the US Economic Data dataset.
# No index column is requested here, so pandas assigns its default integer
# index (0, 1, 2, ...) to the rows.
df1 = pd.read_csv(filepath + '10_us_economic_data.csv')
# Print the frame's type, index, shape and per-column dtypes.
show(df1)

In [None]:
# Extracting a single column (as a Series).
# Attribute access (df1.Month) and item access (df1["Month"]) are
# interchangeable here because "Month" is a valid Python identifier and does
# not collide with a Python keyword or an existing DataFrame attribute/method.
df1.Month
#df1["Month"]

In [None]:
# Extracting multiple columns: index with a *list* of column names.
# The result is a DataFrame containing just those columns, in the listed order.
df1[['Month', 'UnemploymentRate']]

In [None]:
# What about the first 12 entries (the first year of data)?
df1.loc[0:11]
# Why does the row for index 11 show up?  For a normal Python slice, it would not.
# .loc slices by *label* and includes both endpoints.
# Try iloc rather than loc: it slices by position and excludes the end.

In [None]:
# What month had the highest unemployment rate?
# Compare every row's rate with the column maximum to get a boolean mask, then
# let .loc keep only the matching rows (more than one if the maximum is tied).
df1.loc[df1.UnemploymentRate == df1.UnemploymentRate.max()]

In [None]:
# Which months had an unemployment rate greater than or equal to 9.5?
df1[df1.UnemploymentRate >= 9.5]
# Here the index expression is a boolean mask -- see the next cell.

In [None]:
# The mask on its own: a boolean Series with one True/False entry per row of
# df1, True where the unemployment rate is at least 9.5.
df1.UnemploymentRate >= 9.5

In [None]:
# Unemployment Rate and Jobs Added numbers for 2010.
# Rows labelled 24 through 35 (both ends included by .loc) are taken to be the
# 2010 months -- NOTE(review): assumes monthly rows starting January 2008.
df1.loc[24:35, ['Month', 'UnemploymentRate', 'JobsAdded']]

In [None]:
# What about the JobsAdded for the 12-month period after the month with the
# maximum unemployment rate?
# The earlier max-unemployment query showed the peak at label 21, so the
# following 12 months are labels 22 through 33 (.loc includes both ends).
df1.loc[22:33, ['Month', 'JobsAdded', 'UnemploymentRate']]

In [None]:
# Or derive the window from the data instead of hard-coding it.
# idxmax() returns the index *label* of the row holding the maximum value,
# which is what label-based .loc expects.  (In current pandas, argmax()
# returns an integer *position*, which only coincides with the label while the
# frame keeps its default 0..n-1 index.)
# The peak is looked up once and reused for both ends of the slice.
peak = df1.UnemploymentRate.idxmax()
df1.loc[peak + 1:peak + 12, ['Month', 'JobsAdded', 'UnemploymentRate']]

## Enhanced call to read_csv()

In [None]:
# Use the Month column as the index AND have pandas parse it as dates.
# The index column is selected by name rather than by position 0, so the index
# and the parsed-date column are the same column regardless of where Month
# sits in the CSV.
df2 = pd.read_csv(filepath + '10_us_economic_data.csv', index_col='Month', parse_dates=['Month'])
# Print the frame's type, index, shape and per-column dtypes.
show(df2)

In [None]:
# Preview the first rows (head() returns the first 5 by default).
df2.head()

In [None]:
# First 12 rows by integer position; .iloc excludes the end bound (12),
# like a normal Python slice.
df2.iloc[0:12]

In [None]:
# Select the entry for February 2008 by its date label in the index.
df2.loc['2008-02-01']

In [None]:
# Label-based slice over the date index: January through December 2012
# (both end labels are included in a .loc slice).
df2.loc['2012-01-01':'2012-12-01']

In [None]:
# Same 2012 date slice, restricted to the JobsAdded column.
df2.JobsAdded.loc['2012-01-01':'2012-12-01']