<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Load-raw-data" data-toc-modified-id="Load-raw-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load raw data</a></span></li><li><span><a href="#Clean-raw-data" data-toc-modified-id="Clean-raw-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Clean raw data</a></span><ul class="toc-item"><li><span><a href="#Find-and-drop-dulicate-columns" data-toc-modified-id="Find-and-drop-dulicate-columns-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Find and drop dulicate columns</a></span></li><li><span><a href="#Drop-columns-that-have->-80%-missing-values" data-toc-modified-id="Drop-columns-that-have->-80%-missing-values-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Drop columns that have &gt; 80% missing values</a></span></li><li><span><a href="#Drop-columns-with-low-variance,-data-type-int64-or-float64" data-toc-modified-id="Drop-columns-with-low-variance,-data-type-int64-or-float64-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Drop columns with low variance, data type int64 or float64</a></span></li><li><span><a href="#Drop-Categorical-values-with-low-variance-by-converting-to-labels-to-dummy-variables-and-filtering-by-threshold-variance" data-toc-modified-id="Drop-Categorical-values-with-low-variance-by-converting-to-labels-to-dummy-variables-and-filtering-by-threshold-variance-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Drop Categorical values with low variance by converting to labels to dummy variables and filtering by threshold variance</a></span></li><li><span><a href="#Replace-NaN-values-in-numerical-columns-with-median-value-of-series" data-toc-modified-id="Replace-NaN-values-in-numerical-columns-with-median-value-of-series-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Replace NaN values in numerical columns with median value of series</a></span></li><li><span><a href="#Find-date-like-columns-and-convert-to-timestamps,-then-to-integer-type" data-toc-modified-id="Find-date-like-columns-and-convert-to-timestamps,-then-to-integer-type-3.6"><span class="toc-item-num">3.6&nbsp;&nbsp;</span>Find date like columns and convert to timestamps, then to integer type</a></span></li><li><span><a href="#Save-cleaned-up-dataframe-as-csv" data-toc-modified-id="Save-cleaned-up-dataframe-as-csv-3.7"><span class="toc-item-num">3.7&nbsp;&nbsp;</span>Save cleaned up dataframe as csv</a></span></li></ul></li><li><span><a href="#Load-cleaned-data" data-toc-modified-id="Load-cleaned-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load cleaned data</a></span></li><li><span><a href="#Preprocess-data" data-toc-modified-id="Preprocess-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Preprocess data</a></span></li><li><span><a href="#Models" data-toc-modified-id="Models-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Logistic Regression</a></span></li><li><span><a href="#Support-Vector-Machine" data-toc-modified-id="Support-Vector-Machine-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Support Vector Machine</a></span></li><li><span><a href="#Stochastic-Gradient-Descent-(SGD)" data-toc-modified-id="Stochastic-Gradient-Descent-(SGD)-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Stochastic Gradient Descent (SGD)</a></span></li><li><span><a href="#Regression-Forest" data-toc-modified-id="Regression-Forest-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Regression Forest</a></span><ul class="toc-item"><li><span><a href="#Decision-Tree" data-toc-modified-id="Decision-Tree-6.4.1"><span class="toc-item-num">6.4.1&nbsp;&nbsp;</span>Decision Tree</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-6.4.2"><span class="toc-item-num">6.4.2&nbsp;&nbsp;</span>Random Forest</a></span></li></ul></li></ul></li></ul></div>

# Introduction 
https://scikit-learn.org/stable/tutorial/machine_learning_map/

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src import find_duplicates, find_empty_columns, find_low_variance, find_low_var_categories, replace_NaN, time_like, convert_to_timestamp
from src import run_models 

# Load raw data

In [None]:
filename = 'data/raw/targeting_model_data.csv' 
data = pd.read_csv(filename)

In [None]:
data.info()

In [None]:
print('Number of columns: {}'.format(data.shape[1]))
print('Number of rows: {}'.format(data.shape[0]))

In [None]:
# quick look at data 
data.head()

In [None]:
# Composition of target data, this demonstrates imbalanced data. And thus accuracy alone is not a good metric for assessing performance of model. 
data['FLOZVPMFT4626A'].value_counts()

# Clean raw data

## Find and drop dulicate columns

In [None]:
duplicates = find_duplicates(data)

In [None]:
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

In [None]:
# data frame excluding dropped columns 
df = data.drop(columns=duplicates)

## Drop columns that have > 80% missing values 

In [None]:
empty_columns = find_empty_columns(df)

In [None]:
print('Number of mostly empty columns dropped: {}'.format(len(empty_columns)))

In [None]:
# data frame excluding dropped columns 
df = df.drop(columns=empty_columns)

## Drop columns with low variance, data type int64 or float64 
motivated by the Variance Threshold function https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
# data frame excluding dropped columns 
low_var_columns = find_low_variance(df)
df.drop(columns=low_var_columns, inplace=True)

In [None]:
print('Number of low variance columns dropped: {}'.format(len(low_var_columns)))

## Drop Categorical values with low variance by converting to labels to dummy variables and filtering by threshold variance

In [None]:
low_v_categories = find_low_var_categories(df)
df.drop(columns=low_v_categories, inplace=True)

In [None]:
print('Number of low variance category columns dropped: {}'.format(len(low_v_categories)))

## Replace NaN values in numerical columns with median value of series 

In [None]:
df= replace_NaN(df)

## Find date like columns and convert to timestamps, then to integer type

Visually inspecting time like columns we can easily find columns that are probably not meant to be timestamps

In [None]:
time_columns = time_like(df)
df[time_columns].head()

In [None]:
# Visually inspecting items that are not potential time stamps
not_time_columns = ['ibe8588DPLHE7435F', 'ibe8840PMLTL7040B']
real_time_columns = [item for item in time_columns if item not in not_time_columns]

In [None]:
# remove extra zeros from column 
df['ibe9152JHMZI9585O'] = df['ibe9152JHMZI9585O']/100

In [None]:
df = convert_to_timestamp(df, real_time_columns)

In [None]:
df[ real_time_columns].head()

## Save cleaned up dataframe as csv

In [None]:
filename = 'data/interim/cleaned_columns.csv'
df.to_csv(path_or_buf=filename, index=False)

# Load cleaned data 

In [2]:
filename = 'data/interim/cleaned_columns.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,ibe2062AHFGH0763Q,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,01C,01C,35.0,09L,3,1,...,1,C1,,,13.0,13.0,5,9.0,1.0,1.0
1,0,1.0,11B,7.0,01C,01C,11.0,04M,1,0,...,0,A1,L1,,4.0,13.0,B,2.0,1.0,2.0
2,0,61.0,06X,12.0,03C,11C,38.0,12L,3,0,...,0,,,,25.0,5.0,1,15.0,1.0,1.0
3,0,13.0,12B,5.0,05C,07U,14.0,05M,1,0,...,0,,M1,,14.0,5.0,7,7.0,1.0,2.0
4,0,37.0,05X,7.0,05C,08C,33.0,11L,1,0,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


# Preprocess data

In [3]:
# Load data 
X =pd.get_dummies(df.iloc[:,1:20]).values
target = df.iloc[:,0].values

In [4]:
# split the data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, target, random_state=2)
print(Xtrain.shape, Xtest.shape)

(75000, 319) (25000, 319)


In [5]:
# preprocess data 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)  # fit only on training data
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)  # apply same transformation to test data

In [6]:
# verify preprocess is working correctly 
print('Mean of random array: {}'.format(round(Xtrain[:,8].mean())))
print('Std of random array: {}'.format(round(Xtrain[:,8].std())))

Mean of random array: 0.0
Std of random array: 1.0


# Models 

In [None]:
# label data
target_names = ['0', '1']

## Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
cl2 = LogisticRegression( class_weight='balanced', fit_intercept=False, solver='liblinear')

t0 = time()
clf.fit(Xtrain, ytrain)
train_time = time() - t0
print("train time: {:0.2f}s".format( train_time))

t0 = time()
ypred = clf.predict(Xtest)
test_time = time() - t0
print('test time: {:0.2f}s \n'.format( test_time))
accuracy_score(ytest, ypred)
print(classification_report(ytest, ypred,target_names=target_names))

## Support Vector Machine

## Stochastic Gradient Descent (SGD)
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [None]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", max_iter=20, class_weight='balanced',  fit_intercept=False)

t0 = time()
clf.fit(Xtrain, ytrain)
train_time = time() - t0
print("train time: {:0.2f}s".format( train_time))

t0 = time()
ypred = clf.predict(Xtest)
test_time = time() - t0
print('test time: {:0.2f}s \n'.format( test_time))
accuracy_score(ytest, ypred)
print(classification_report(ytest, ypred,target_names=target_names))

## Regression Forest 

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=11, class_weight='balanced')

t0 = time()
clf.fit(Xtrain, ytrain)
train_time = time() - t0
print("train time: {:0.2f}s".format( train_time))

t0 = time()
ypred = clf.predict(Xtest)
test_time = time() - t0
print('test time: {:0.2f}s \n'.format( test_time))
accuracy_score(ytest, ypred)
print(classification_report(ytest, ypred,target_names=target_names))

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier( n_estimators=100, n_jobs=-1, class_weight='balanced')



In [None]:
# populate dict with classifiers 
model_dict = {}

Logistic_Regression = LogisticRegression( class_weight='balanced', fit_intercept=False, solver='liblinear')
model_dict['Logistic Regression']  = Logistic_Regression

SGD = SGDClassifier(loss="hinge", max_iter=20, class_weight='balanced',  fit_intercept=False)
model_dict['Stochastic Gradient Descent']  = SGD

DT = DecisionTreeClassifier(max_depth=11, class_weight='balanced')
model_dict['Decision Tree']  = DT

RT =  RandomForestClassifier( n_estimators=100, n_jobs=-1, class_weight='balanced')
model_dict['Random Forest']  = RT

In [None]:
def run_models(Xtrain, ytrain, Xtest, ytest, model_dict): 
    """Runs a list of models.

    Runs a list of models defined in model_dict.
    Tax


    Parameters
    ----------
    Xtrain : numpy.ndarray 
        training data 
    ytrain : numpy.ndarray 
        training target data 
    Xtest : numpy.ndarray 
        test data 
    ytrain : numpy.ndarray 
        training test data 

    """

    for name in model_dict:
        print(name)

        clf = model_dict[name]
        t0 = time()
        clf.fit(Xtrain, ytrain)
        train_time = time() - t0
        print("train time: {:0.2f}s".format( train_time))

        t0 = time()
        ypred = clf.predict(Xtest)
        test_time = time() - t0
        print('test time: {:0.2f}s \n'.format( test_time))
        accuracy_score(ytest, ypred)
        print(classification_report(ytest, ypred,target_names=target_names))
        print('-' * 80)

In [None]:
type(Xtrain)

In [7]:
run_models(Xtrain, ytrain, Xtest, ytest)

Logistic Regression
train time: 64.09s
test time: 0.03s 



NameError: name 'accuracy_score' is not defined