<a href="https://colab.research.google.com/github/c-marq/CAP3321C-Data-Wrangling/blob/main/demos/ch07_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 7 - How to prepare the data

In [None]:
import numpy as np
import pandas as pd

## How to work with datetime columns

In [None]:
# Load the cleaned fires dataset from a pickle file and preview the first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
fires = pd.read_pickle('fires_cleaned.pkl')
fires.head()

In [None]:
# Extract the calendar month of each discovery date through the .dt accessor.
fires['fire_month'] = fires['discovery_date'].dt.month

In [None]:
# Elapsed time between discovery and containment, stored as a whole number of days.
burn_duration = fires['contain_date'] - fires['discovery_date']
fires['days_burning'] = burn_duration.dt.days

In [None]:
# Preview the frame, which now includes the fire_month and days_burning columns.
fires.head()

## How to work with string and numeric columns

In [None]:
# Convert the fire names to title case with the vectorised .str accessor.
fires['fire_name'] = fires['fire_name'].str.title()

In [None]:
# Build a display name of the form "The <name> Fire (<year>)".
# fire_year is cast to str so it can be concatenated with the other strings.
fires['full_name'] = (
    'The ' + fires['fire_name'] + ' Fire '
    + '(' + fires['fire_year'].astype(str) + ')'
)

In [None]:
# Acres burned per day, computed only from rows that have no missing value in any column.
# Rows removed by dropna() receive NaN when the result is aligned back onto fires by index.
# dropna() is evaluated once and reused instead of being recomputed for each operand.
fires_complete = fires.dropna()
fires['acres_per_day'] = fires_complete['acres_burned'] / fires_complete['days_burning']

In [None]:
# Show the source columns next to the columns derived from them.
fires[['fire_name','full_name','acres_burned','days_burning','acres_per_day']].head()

## How to add summary columns

In [None]:
# Preview the two columns used for the per-state summary in the next cell.
fires[['state','days_burning']].head()

In [None]:
# transform() returns one value per original row, so each row receives
# the mean days_burning of its own state.
days_by_state = fires.groupby('state')['days_burning']
fires['mean_days'] = days_by_state.transform('mean')
fires[['state','days_burning','mean_days']].head()

## How to apply functions to rows or columns

In [None]:
# Load the work survey data from a pickle file and preview three rows.
# NOTE: only unpickle files from a trusted source.
workData = pd.read_pickle('workData.pkl')
workData.head(3)

In [None]:
# Passing the function name as a string applies that aggregation to each column
# (apply works along axis=0 unless told otherwise).
workData.apply('mean')

In [None]:
# Apply NumPy's mean function to each of the two selected columns.
# numpy is imported as np in the notebook's import cell so that a fresh
# "Run All" does not depend on this cell for the import.
workData[['sex','hrs1']].apply(np.mean)

In [None]:
# Row-wise mean (axis=1) of the three columns that are averaged into avg_rating.
rating_cols = ['wkcontct','talkspvs','effctsup']
workData['avg_rating'] = workData[rating_cols].apply(np.mean, axis=1)
workData.head(3)

## How to apply user-defined functions

In [None]:
def convert_sex(row):
    """Return a text label for the code stored in ``row.sex``.

    Parameters
    ----------
    row : object with a ``sex`` attribute
        For example a DataFrame row passed in by ``apply(..., axis=1)``.

    Returns
    -------
    str
        'male' for 1, 'female' for 2, otherwise 'non-binary'. A value that is
        already one of these three labels is returned unchanged, so running
        the conversion a second time does not relabel every row 'non-binary'.
    """
    labels = ('male', 'female', 'non-binary')
    # Already converted (e.g. the cell was executed twice): keep the label.
    if isinstance(row.sex, str) and row.sex in labels:
        return row.sex
    if row.sex == 1:
        return 'male'
    elif row.sex == 2:
        return 'female'
    else:
        return 'non-binary'

# axis=1 passes each row to convert_sex; the returned labels overwrite the 'sex' column.
workData['sex'] = workData.apply(convert_sex, axis=1)
workData.head()

In [None]:
# get the data
# Keep one row per distinct (game_id, game_date) pair. The de-duplication is
# chained so the frame is built in a single expression rather than mutated in
# place; the former stand-alone duplicated() expression was removed because its
# result was never displayed or stored.
gameData = (
    pd.read_pickle('shot_cleaned.pkl')[['game_id','game_date']]
    .drop_duplicates(keep='first')
)
gameData

In [None]:
def get_season(row):
    """Return the season label 'YYYY-YYYY' for ``row.game_date``.

    Dates in July through December belong to the season that starts in the
    same calendar year; dates in January through June belong to the season
    that started in the previous calendar year.
    """
    game_year = row.game_date.year
    start_year = game_year if row.game_date.month > 6 else game_year - 1
    return f'{start_year}-{start_year + 1}'

# Compute the season label for every row, then display the frame with the
# row limit temporarily set to 6 and no limit on the number of columns.
gameData['season'] = gameData.apply(get_season, axis=1)
with pd.option_context('display.max_rows', 6, 'display.max_columns', None):
    display(gameData)

## How lambda expressions work with DataFrames

In [None]:
# Small 2x3 example frame used by the two lambda demonstrations that follow.
df = pd.DataFrame([[0,1,2],[3,4,5]], columns=['col1','col2','col3'])
df

In [None]:
# axis=0: the lambda receives each column, so the result is twice each column's sum.
df.apply(lambda x: x.sum() * 2, axis=0)

In [None]:
# axis=1: the lambda receives each row, so the result is twice each row's sum.
df.apply(lambda x: x.sum() * 2, axis=1)

## How to apply lambda expressions

In [None]:
# Label each row's work status: code 1.0 becomes 'full-time', anything else 'part-time'.
# A value that is already 'full-time' is kept as such, so re-running this cell
# after the codes have been replaced by text does not turn every row into 'part-time'.
workData['wrkstat'] = workData.apply(
    lambda row: 'full-time' if row.wrkstat in (1.0, 'full-time') else 'part-time',
    axis=1)
workData.head()

In [None]:
# Load the cars data and derive Brand as the first whitespace-separated
# token of each CarName.
carsData = pd.read_csv('cars.csv')

carsData['Brand'] = carsData.apply(
    lambda row: row.CarName.split()[0],
    axis=1,
)
carsData[['CarName','Brand']].head()

## How to set or remove an index

In [None]:
# Load the monthly fire summary (only unpickle files from a trusted source).
fires_by_month = pd.read_pickle('fires_by_month.pkl')

In [None]:
# Index the monthly fire data by state. The result is bound to a new name
# instead of mutating fires_by_month in place, so re-running this cell does not
# raise a KeyError for a 'state' column that has already been moved into the index.
fires_state_indexed = fires_by_month.set_index('state')
fires_state_indexed.head(3)

In [None]:
# Reload the data so the next example starts from a frame with no custom index.
fires_by_month = pd.read_pickle('fires_by_month.pkl')

In [None]:
# Build a three-level index (state, fire_year, fire_month) and rebind the name to the result.
index_levels = ['state','fire_year','fire_month']
fires_by_month = fires_by_month.set_index(index_levels)
fires_by_month.head(3)

In [None]:
# reset_index() returns a new frame in which the index levels are ordinary
# columns again; fires_by_month itself is left unchanged.
fires_no_index = fires_by_month.reset_index()
fires_no_index.head(3)

## How to unstack indexed data

In [None]:
# get indexed dataset
top5_states = pd.read_pickle('top_states.pkl')
top5_states.head(3)

In [None]:
# Unstack the 'state' index level for two selected columns, so that each
# state becomes its own column under days_burning and fire_count.
top_wide = top5_states[['days_burning','fire_count']].unstack(level='state')
# The level can also be given by position, e.g. .unstack(level=0), assuming
# 'state' is the first index level (TODO confirm against the pickle's index).
top_wide.head(3)

In [None]:
# Unstack the 'state' level for every column of the frame.
top_wide = top5_states.unstack(level='state')
top_wide.head(3)

In [None]:
# Unstack a single column: the result has one plain column per state.
fire_counts = top5_states['fire_count']
top_wide = fire_counts.unstack(level='state')
top_wide.head(3)

## How to join DataFrames

In [None]:
# get the shots DataFrame
allShotData = pd.read_pickle('shot_cleaned.pkl')
shots = allShotData.drop(columns=['period','minutes_remaining',
                                  'seconds_remaining','loc_x','loc_y','home_team',
                                  'game_date','shot_attempted_flag','shot_made_flag',
                                  'action_type','visiting_team'])
shots1 = shots.head(2)
shots2 = shots.query('game_id == "0020900030"').head(1)
shots3 = shots.query('game_id == "0020900069"').head(1)
shots = pd.concat([shots1,shots2,shots3], ignore_index=True)
shots.set_index('game_id', inplace = True)
shots

In [None]:
# get the points_by_game DataFrame
# Keep only the three games referenced by the join examples.
points_by_game = pd.read_pickle('pointsScoredGame.pkl')
game_filter = ('game_id == "0020900015" '
               'or game_id == "0020900030" '
               'or game_id == "0020900082"')
points_by_game = points_by_game.query(game_filter)
points_by_game

In [None]:
# join() matches rows on the index; how='inner' keeps only index labels
# that are present in both frames.
shots_joined = shots.join(points_by_game, how='inner')
shots_joined

In [None]:
# Work on an independent deep copy so the added column does not appear in points_by_game.
# deep expects a boolean; the former string 'true' only behaved as intended
# because any non-empty string is truthy.
points_by_game2 = points_by_game.copy(deep=True)
points_by_game2['player_name'] = 'Steph Curry'
points_by_game2

In [None]:
# how='left' keeps every row of shots. lsuffix/rsuffix are appended to any
# column name that exists in both frames so the two versions can be told apart.
shots_joined = shots.join(points_by_game2, lsuffix='_1', rsuffix='_2',
                          how='left')
shots_joined

In [None]:
# how='outer' keeps index labels from either frame; values with no match are missing (NaN).
shots_joined_outer = shots.join(points_by_game2, lsuffix='_1',
                                rsuffix='_2', how='outer')
shots_joined_outer

## Merge

In [None]:
# Move game_id out of the index into an ordinary column so merge() can use it as a key.
# NOTE: this rebinds the name shots2, which an earlier cell used for a different frame.
shots2 = shots.reset_index()
shots2

In [None]:
# Same for the points data: turn the index into an ordinary column.
# NOTE: this rebinds points_by_game2, replacing the copy with the player_name
# column created in the join section.
points_by_game2 = points_by_game.reset_index()
points_by_game2

In [None]:
# Left merge on the game_id column: every row of shots2 is kept and matching
# columns from points_by_game2 are attached where the key is found.
shots_merged = shots2.merge(points_by_game2, on='game_id', how='left')
shots_merged

## Concat

In [None]:
# get the data
top5_fires = fires.sort_values('acres_burned', ascending=False).head(5)
top5_fires = top5_fires.reset_index(drop=True)
top5_fires.head()

In [None]:
# First three rows by position.
fires_1 = top5_fires.iloc[:3]
fires_1

In [None]:
# Remaining rows (position 3 onward), renumbered from 0 and with two columns
# removed so that the concat example combines frames with different columns.
fires_2 = (
    top5_fires.iloc[3:]
    .reset_index(drop=True)
    .drop(columns=['fire_month','days_burning'])
)
fires_2

In [None]:
# Stack the two frames vertically with a fresh 0..n-1 index. Columns that exist
# in only one of the frames are filled with NaN for the other frame's rows.
fires_concat = pd.concat([fires_1,fires_2], ignore_index=True)
fires_concat.head(10)

## What the SettingWithCopyWarning is warning you about

In [None]:
# Start from an independent deep copy so the experiment cannot alter shots.
df = shots.copy(deep=True)

In [None]:
# State of df before any assignment.
df.head(3)

In [None]:
# Select the rows labelled '0020900015' from df, then assign to a column of that
# selection. This is the pattern the section heading refers to: whether df also
# changes depends on whether pandas handed back a view or a copy (this can vary
# with the pandas version and settings; compare the next two outputs).
dfSlice = df.loc['0020900015',:]
dfSlice.loc[:,'player_name'] = 'Curry'

In [None]:
# Inspect the original frame after the assignment made through dfSlice.
df.head(3)

In [None]:
# Inspect the selection itself for comparison with df.
dfSlice.head(3)

## When the SettingWithCopyWarning is given

### Generates the warning but no corruption

In [None]:
# Fresh deep copy of shots, independent of any edits made to df in earlier examples.
df = shots.copy(deep=True)

In [None]:
# Select rows with query(), then assign to a column of the selection.
# Per the section heading this is expected to warn while leaving df unchanged
# (behaviour may differ between pandas versions; verify with the output below).
dfSlice = df.query('game_id == "0020900015"')
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [None]:
# Inspect the selection that received the assignment.
dfSlice.head(2)

#### How to use the copy() method to stop the warning message

In [None]:
# Calling copy() on the query result makes dfFixed an explicit, independent
# frame, so assigning to it is unambiguous.
dfFixed = df.query('game_id == "0020900015"').copy()
dfFixed.loc[:,'player_name'] = 'Curry'

### Generates the warning and corrupts the data

In [None]:
# Fresh deep copy of shots, independent of any edits made to df in earlier examples.
df = shots.copy(deep=True)

In [None]:
# Label-based selection followed by an assignment through the selection.
# Per the section heading this is expected to warn and also alter df
# (behaviour may differ between pandas versions; verify with the output below).
dfSlice = df.loc['0020900015',:]
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [None]:
# Inspect the selection that received the assignment.
dfSlice.head(2)

#### How to fix this code

In [None]:
# Fresh deep copy of shots, independent of any edits made to df in earlier examples.
df = shots.copy(deep=True)

In [None]:
# copy() detaches the selection from df before the assignment, so only
# dfFixed receives the new value; df is displayed to confirm it is unchanged.
dfFixed = df.loc['0020900015',:].copy()
dfFixed.loc[:,'player_name'] = 'Curry'
df.head(2)

In [None]:
# The copy holds the assigned value.
dfFixed.head(2)

## When the SettingWithCopyWarning isn’t given

In [None]:
# Fresh deep copy of shots, independent of any edits made to df in earlier examples.
df = shots.copy(deep=True)

In [None]:
# Plain assignment does not copy anything: dfSlice is a second name for the
# same DataFrame object as df, so the assignment below changes df as well.
dfSlice = df
dfSlice.loc[:,'player_name'] = 'Curry'
df.head(2)

In [None]:
# Same object as df, so the same contents are shown.
dfSlice.head(2)

### How to fix this code

In [None]:
# Fresh deep copy of shots, independent of any edits made to df in earlier examples.
df = shots.copy(deep=True)

In [None]:
# copy() creates a separate DataFrame, so the assignment affects only dfFixed;
# df is displayed to confirm it is unchanged.
dfFixed = df.copy()
dfFixed.loc[:,'player_name'] = 'Curry'
df.head(2)

In [None]:
# The copy holds the assigned value.
dfFixed.head(2)