# Data Preprocessing

Material from Chapter 2 of Larose and Larose, 2015

12/20/2018 - Jeff Smith

In [None]:
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.__version__, pd.__version__

## Missing Values

In [None]:
# Read the datasets
#   cars  -- the original dataset
#   cars2 -- the dataset used for the outlier examples
#   cars3 -- the dataset with missing values
cars  = pd.read_csv("../data/cars.txt")
cars2 = pd.read_csv("../data/cars2.txt")
cars3 = pd.read_csv("../data/cars3.csv")

In [None]:
# Original dataset: preview the first few rows
cars.head()

In [None]:
# Outliers dataset: preview the first few rows
cars2.head()

In [None]:
# Dataset with missing values: preview the first 10 rows
cars3.head(10)

In [None]:
# Select only the rows whose MPG value is missing (boolean-mask row lookup)
cars3.loc[cars3['mpg'].isna()]

### Remove the rows that have missing values

In [None]:
# Remove every row that has a NaN value in any column.
# dropna returns a new DataFrame; cars3 itself is left untouched.
cars4 = cars3.dropna(axis=0, how='any')
cars4

### Replace the missing values

In [None]:
# Rows that originally had missing MPG values.
# Keep track of their labels so that we can come back and check them
# after the NaNs have been replaced.
cars3.loc[[14, 18, 22, 87, 103, 165], :]

In [None]:
# Suppose that we want to replace the NaN values with the mean MPG value
np.mean(cars3['mpg'].values)
# oops ... NaN values in the array make the result NaN

In [None]:
# Option 1 -- use nanmean on the underlying NumPy array
np.nanmean(cars3['mpg'].values)

In [None]:
# Option 2 -- call mean() on the Pandas series directly
cars3['mpg'].mean()

In [None]:
# Standard iteration method: visit every row and overwrite a missing MPG
# value with the mean of the MPG column.
y = cars3.mpg.mean()
# Locate the mpg column by name rather than assuming it is column 0, so the
# mean is always written back to the column it was computed from.
mpg_col = cars3.columns.get_loc('mpg')
for j in range(len(cars3)):
    if np.isnan(cars3.iloc[j, mpg_col]):
        cars3.iloc[j, mpg_col] = y

In [None]:
# Suppose that we want to replace all the NaN's in the numeric columns
# with the mean of that column.
# numeric_only=True restricts the computation to the numeric columns;
# without it, pandas >= 2.0 raises a TypeError as soon as the frame
# contains a non-numeric (e.g. string) column.
means = cars3.mean(axis=0, numeric_only=True)
means

In [None]:
# Now use a nested loop where the outer loop goes through the (positional)
# columns that you want to change.
for i in [0, 2, 3, 4, 5]:
    # `means` is indexed by column name, so fetch the mean by label instead
    # of by integer position: positional lookup on a label-indexed Series is
    # deprecated and picks the wrong entry if `means` omits any column.
    col_mean = means[cars3.columns[i]]
    for j in range(len(cars3)):
        if np.isnan(cars3.iloc[j, i]):
            cars3.iloc[j, i] = col_mean
cars3.head(10)

In [None]:
# Alternative: replace NaN values with a sample drawn from the
# distribution of the existing values.
# Start by looking at the observed distribution of mpg values as a
# histogram; the NaN entries have to be dropped first.
plt.hist(cars3['mpg'].dropna());

In [None]:
# The observed values themselves act as the distribution, so sampling from
# it just means drawing from the mpg values that are not NaN.
mpgs = cars3['mpg'].dropna()
# With the vector of actual values in hand, draw and print 10 random values
for j in range(10):
    print(np.random.choice(mpgs))

In [None]:
# The sampling function could be combined with the standard iteration
# method shown earlier, but apply with a lambda function will generally
# be faster for large datasets.
#
# Helper used with apply: swaps a NaN mpg for a sampled value.
#
def mpg(row):
    """Return the row's mpg, or a random draw from `mpgs` when it is NaN.

    `row` is any object exposing an `mpg` attribute (e.g. a DataFrame row).
    `mpgs` is read from the enclosing (notebook) namespace.
    """
    if np.isnan(row.mpg):
        return np.random.choice(mpgs)
    return row.mpg

# axis=1 hands each row to the lambda; the returned values replace the column
cars3['mpg'] = cars3.apply(func=lambda record: mpg(record), axis=1)


## Graphical Methods for Identifying Outliers

In [None]:
# Histogram of vehicle weight in the outliers dataset
ax = plt.axes()
ax.set_xlabel('Weight')
ax.set_ylabel('Frequency')
ax.set_title('Vehicle Weight')
plt.hist(cars2['weightlbs'])

In [None]:
# Scatter plot of MPG against weight, with points colored by cylinder count
ax = plt.axes()
ax.set_xlabel('Weight')
ax.set_ylabel('MPG')
ax.set_title('MPG vs. Weight')
plt.scatter(cars2['weightlbs'], cars2['mpg'], c=cars2['cylinders']);

## Measures of Center and Spread

In [None]:
# Summary statistics for the outliers dataset
cars2.describe()

## Data Transformation

In [None]:
# Look at the columns
cars.columns

### Z-score Standardization

In [None]:
# Z-score standardization of the weight: subtract the mean, then divide by
# the standard deviation (ddof=0, the same default np.std uses)
cars['zweightlbs'] = (
    (cars['weightlbs'] - cars['weightlbs'].mean())
    / cars['weightlbs'].std(ddof=0)
)

In [None]:
# Mean and standard deviation (ddof=0) of the raw weight
cars['weightlbs'].mean(), cars['weightlbs'].std(ddof=0)

In [None]:
# Mean and standard deviation (ddof=0) of the standardized weight
cars['zweightlbs'].mean(), cars['zweightlbs'].std(ddof=0)

In [None]:
# Side-by-side histograms: raw weight (left) and its z-score (right)
fig, ax = plt.subplots(1, 2, figsize=(12,5))
ax[0].hist(cars.weightlbs)
ax[0].set(xlabel='Weight (lbs.)', ylabel='Frequency',
       title='Vehicle Weight');
ax[1].hist(cars.zweightlbs)
# Title spelling corrected ('Weigth' -> 'Weight') so both panels match
ax[1].set(xlabel='Z-score of Weight', ylabel='Frequency',
       title='Vehicle Weight');

### Min-Max Normalization

In [None]:
# Min-max normalize the horsepower: (x - min) / (max - min)
cars['mmhp'] = (cars['hp'] - cars['hp'].min()) / (cars['hp'].max() - cars['hp'].min())

In [None]:
# Note the switch to dot notation for the columns.
# Mean and standard deviation (ddof=0) of the raw horsepower
cars.hp.mean(), cars.hp.std(ddof=0)

In [None]:
# Mean and standard deviation (ddof=0) of the min-max normalized horsepower
cars.mmhp.mean(), cars.mmhp.std(ddof=0)

In [None]:
# Side-by-side histograms: raw horsepower (left) and its min-max
# normalized version (right)
fig, ax = plt.subplots(1, 2, figsize=(12,5))
ax[0].hist(cars.hp)
ax[0].set(xlabel='Horsepower', ylabel='Frequency',
       title='Vehicle Horsepower');
ax[1].hist(cars.mmhp)
# The plotted column is the normalized horsepower, so label it as such
# (the label previously said "Normallized Weight").
ax[1].set(xlabel='Min-Max Normalized Horsepower', ylabel='Frequency',
       title='Vehicle Horsepower');

In [None]:
# Preview the first 10 rows, now including the zweightlbs and mmhp columns
cars.head(10)