# Data Indexing and Selection Examples

2020-09-28 - Jeff Smith

In [None]:
# Getting things ready
import numpy as np
import pandas as pd
from datetime import datetime

def show(data, show_data = 0):
    """Print a structural summary of a DataFrame-like object.

    Writes the object's type, index, shape and columns to stdout, followed by
    one indented line per column giving the column name and its dtype.

    Parameters:
        data: object exposing ``index``, ``shape``, ``columns``, ``head()``
            and ``values``; this file only ever passes pandas DataFrames.
        show_data: 0 (the default) prints the summary only; 1 additionally
            prints ``data.head()``; 2 additionally prints ``data.values``.
            Any other value behaves like 0.

    Returns:
        None.  All output goes to stdout.
    """
    # Labels are right-aligned to the width of the longest one ("Columns").
    row = "{:>7}: {}"
    print(row.format("Type", type(data)))
    # Attributes are looked up one at a time so each line is printed before
    # the next attribute is accessed.
    for label in ("Index", "Shape", "Columns"):
        print(row.format(label, getattr(data, label.lower())))

    for name in data.columns:
        print("    {} ({})".format(name, data[name].dtype))

    if show_data == 1:
        preview = data.head()
    elif show_data == 2:
        preview = data.values
    else:
        return
    print(preview)
        
# Report which pandas release this notebook is running against.
print("Pandas version: " + pd.__version__)

# Path prefix that is prepended to every data file name below.
filepath = "../data/"

## US Economic Data dataset
## Default read_csv()

In [None]:
# Start with the US Economic Data dataset.
# read_csv() is called with its defaults only; show() then prints the
# structure of the result followed by its first rows (show_data=1).
df1 = pd.read_csv(filepath + '10_us_economic_data.csv')
show(df1,1)

In [None]:
# Or, using the notebook's built-in display of a cell's last expression:
# head(7) returns the first 7 rows.
df1.head(7)

In [None]:
# Extracting a single column (as a Series)
# Month - Either syntax works since the column name is a valid Python
# identifier and is not a keyword.
df1.Month
# Equivalent bracket syntax:
#df1["Month"]

In [None]:
# Extracting multiple columns
# Note the double brackets: the outer pair indexes the DataFrame and the
# inner pair is a Python list of column names.
df1[['Month', 'UnemploymentRate']]

In [None]:
# What about the first 12 entries (the first year of data)?
df1.loc[0:11]
# Why does the row for index 11 show up?  For normal Python slices, it would not (why not?)
# Hint: .loc slices by label and includes both endpoints, whereas a Python
# slice excludes its stop position.
# Try iloc rather than loc.

In [None]:
# What month had the highest unemployment rate?
# First find the maximum rate, then keep the rows whose rate equals it.
rate = df1.UnemploymentRate.max()
df1.loc[df1.UnemploymentRate == rate]

In [None]:
# The row selector on its own: a comparison of every row's rate with the maximum.
df1.UnemploymentRate == rate

In [None]:
# or all at once, without the intermediate variable
df1.loc[df1.UnemploymentRate == df1.UnemploymentRate.max()]

In [None]:
# Can also apply the max function to the whole DataFrame rather than to a
# single column.
df1.max()
# Also try min(), sum(), std() and others ..
# https://pandas.pydata.org/pandas-docs/stable/reference/frame.html

In [None]:
# Which months had an unemployment rate greater than or equal to 9.5?
df1[df1.UnemploymentRate >= 9.5]
# Here the index expression is a mask -- see the next cell

In [None]:
# the mask: the comparison used as the index expression in the previous cell
df1.UnemploymentRate >= 9.5

In [None]:
# What about the Unemployment Rate and Jobs Added numbers for
# the year 2010?
# Select the columns first, then slice rows by label; .loc includes both 24 and 35.
df1[['Month', 'UnemploymentRate','JobsAdded']].loc[24:35]

In [None]:
# or, by position: rows 24..35 (the stop, 36, is excluded) and the first three columns.
# NOTE(review): this matches the previous cell only if Month, UnemploymentRate
# and JobsAdded are the first three columns of the file, in that order -- confirm.
df1.iloc[24:36, :3]

In [None]:
# or, with the row labels and the column names in a single .loc call
df1.loc[24:35,['Month', 'UnemploymentRate','JobsAdded']]

In [None]:
# What about the JobsAdded for the 12 month period after the month
# with the maximum unemployment rate?
# From the previous query, I know that max unemployment occurred at loc 21,
# so the following 12 months are labels 22 through 33 (.loc includes both ends).
df1[['Month', 'JobsAdded', 'UnemploymentRate']].loc[22:33]

In [None]:
# or -- by using the idxmax ("index label of the entry with maximum value") function.
# .loc is label-based, so it is paired with idxmax(), which returns a label.
# argmax() returns an integer *position*, which only coincides with the label
# while the frame keeps its default 0..n-1 integer index.
# The label is computed once and reused for both slice bounds.
peak = df1.UnemploymentRate.idxmax()
df1[['Month', 'JobsAdded', 'UnemploymentRate']].loc[peak + 1:peak + 12]

## Enhanced call to read_csv()

In the default version of the DataFrame, the dates were read as string objects.  This makes date arithmetic clunky.  Let's try reading them in as datetime objects and setting the month to be the DataFrame index.

In [None]:
# Here, if I want to use the Month as the index AND specify that Pandas
# parse the dates for the index:
#   parse_dates=['Month'] asks pandas to parse the Month column as dates;
#   index_col=0 makes the first column of the file the index.
# NOTE(review): this assumes Month is the first column in the CSV -- confirm.
df2 = pd.read_csv(filepath + '10_us_economic_data.csv', parse_dates = ['Month'], index_col=0)
show(df2,1)

In [None]:
# Note that we can always use the native Python (implicit, positional) referencing.
# iloc[0:12] selects the first 12 rows by position.
df2.iloc[0:12]

In [None]:
# But now we can use dates.  Note also that we're using a string rather
# than a datetime object
df2.loc['2008-02-01']

In [None]:
# What about a different date format?
# NOTE(review): '02/01/08' is ambiguous (month/day vs. day/month) -- confirm it
# resolves to the date you intend.
df2.loc['02/01/08']

In [None]:
# Now we can do a direct date range, using date strings as the slice bounds.
df2.loc['2012-01-01':'2012-12-01']

In [None]:
# The same date-range slice, applied to the single JobsAdded column.
df2.JobsAdded.loc['2012-01-01':'2012-12-01']

## Dealing with Missing Values - None and NaN Values

In [None]:
# What about null values?
# isnull() marks each cell True where the value is missing.
df2.isnull().head(10)
# GDP is reported quarterly rather than monthly
# Note that the .head(10) is because I only want to see the first 10 rows.

In [None]:
# NumPy functions do not handle NA and NaN values "well."
# np.mean() propagates NaN: if any element is NaN, so is the result.
x = np.array(df2.GDP)
np.mean(x)
# Why the quotes on "well"?  It's not technically incorrect,
# but it's generally not what we want (unless we do).

In [None]:
# Remove the NA values.  Note that the entire
# records (rows) are removed, not just the specific values.
# The result is not assigned, so df2 itself is left unchanged.
df2.dropna()

In [None]:
# Now we can use NumPy on the "good" values: the GDP entries of the rows
# that survive dropna() on the whole DataFrame.
x = np.array(df2.dropna()['GDP'])
np.mean(x)
# Why the quotes on "good"?

In [None]:
# Or we can just use Pandas on the Series with the NA and NaN values;
# Series.mean() skips missing values by default.
df2.GDP.mean()

In [None]:
# And we can use the Pandas functions across the whole DataFrame
# rather than on a single column.
df2.mean()

In [None]:
# What about non-numeric columns?  Recall that in df1 the Month column was
# read as strings (object dtype).
# Since pandas 2.0, numeric_only defaults to False, so a bare df1.mean()
# raises TypeError on the string column.  Ask for the numeric columns
# explicitly; older pandas versions give the same result with this argument.
df1.mean(numeric_only=True)

In [None]:
# We can tell Pandas to apply the function to non-numeric columns (not
# sure why we'd want to do this, but we can).
# NOTE(review): with a string column such as Month, this call is expected to
# raise a TypeError rather than return a value -- confirm on your pandas version.
df1.mean(numeric_only=False)
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html for details.

## Example Use-case With Pandas and Concatenation
See the slide set for details of the problem.

In [None]:
# Start with a freshly loaded df2 (the same read_csv() call as in the
# "Enhanced call to read_csv()" section) and look at its first 6 rows.
df2 = pd.read_csv(filepath + '10_us_economic_data.csv', parse_dates = ['Month'], index_col=0)
df2.head(6)

In [None]:
# Extract the GDP column and drop the NaN values, leaving only the
# entries for which a GDP figure exists.
g = df2.GDP.dropna()
g.head(6)

In [None]:
# Create a series using the 2008 data (first 4 rows)
# .values passes only the raw numbers, so the quarter labels given in
# index= replace the original index.
y1 = pd.Series(data = g.iloc[0:4].values, index=['Q1', 'Q2', 'Q3', 'Q4'], name="2008")
y1

In [None]:
# Replicate for year 2: the next 4 non-null GDP entries (positions 4..7),
# labelled with the same quarter index and named "2009".
y2 = pd.Series(data = g.iloc[4:8].values, index=['Q1', 'Q2', 'Q3', 'Q4'], name="2009")
y2

In [None]:
# Concatenate them together
# axis=1 places the two Series side by side as columns (named "2008" and
# "2009" after the Series), on the shared Q1..Q4 index.
gdp = pd.concat([y1, y2], axis=1)
show(gdp,1)

In [None]:
# Display the combined quarters-by-year table.
gdp.head()

In [None]:
# Now just replicate for years 3, 4, and 5 ... or create a small loop that automates the process.