In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from src import find_empty_columns,  find_duplicates

ImportError: cannot import name 'find_empty_columns'

# Load raw data

In [None]:
# Read the raw modelling data; the path is relative to the working directory.
filename = 'data/raw/targeting_model_data.csv'
data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Summarise the raw frame (column dtypes, non-null counts, memory usage).
data.info()

In [None]:
# Report the size of the raw frame: column count first, then row count.
print('Number of columns: {}'.format(len(data.columns)))
print('Number of rows: {}'.format(len(data)))

In [None]:
# Quick look at the first rows of the raw data.
data.head()

In [None]:
# Class composition of the target column. The author notes that the classes are
# imbalanced, so accuracy alone is not a good metric for assessing model
# performance.
data['FLOZVPMFT4626A'].value_counts()

# Clean up data

## Find and drop duplicate columns

In [None]:
# find_duplicates is a project helper imported from src; its result is used
# below as the collection of column labels to drop.
# NOTE(review): presumably it lists only the redundant copies, keeping one of each -- confirm.
duplicates = find_duplicates(data)

In [None]:
# Report how many columns were flagged as duplicates (they are dropped in the next cell).
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

In [None]:
# New frame without the duplicate columns; the raw `data` frame itself is not modified.
df = data.drop(columns=duplicates)

In [None]:
# Preview the frame after the duplicate columns were removed.
df.head()

## Drop columns that have > 80% missing values 

In [None]:
# find_empty_columns is a project helper; its result is used below as the
# column labels to drop.
# NOTE(review): the section heading says "> 80% missing values", but the threshold is not visible here -- confirm.
empty_columns = find_empty_columns(df)

In [None]:
# Report how many columns were flagged as mostly empty (they are dropped in the next cell).
print('Number of mostly empty columns dropped: {}'.format(len(empty_columns)))

In [None]:
# Rebind df to a new frame without the mostly empty columns.
df = df.drop(columns=empty_columns)

In [None]:
# Preview the frame after the mostly empty columns were removed.
df.head()

## For data type int64 or float64 drop columns with low variance
Motivated by scikit-learn's `VarianceThreshold` feature selector: https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
# Flag the low-variance columns with the project helper find_low_variance
# (numeric columns, per the section heading), then rebind df to a new frame
# without them. Rebinding instead of inplace=True matches the earlier drop
# steps and never mutates a frame that a previous cell displayed.
low_var_columns = find_low_variance(df)
df = df.drop(columns=low_var_columns)

In [None]:
# Report how many low-variance columns were removed in the previous cell.
print('Number of low variance columns dropped: {}'.format(len(low_var_columns)))

In [None]:
# Preview the frame after the low-variance columns were removed.
df.head()

## Drop categorical columns with low variance by converting their labels to dummy variables and summing the standard deviations

In [None]:
# Flag the low-variance categorical columns with the project helper
# find_low_var_categories, then rebind df to a new frame without them.
# Rebinding instead of inplace=True matches the earlier drop steps and never
# mutates a frame that a previous cell displayed.
low_v_categories = find_low_var_categories(df)
df = df.drop(columns=low_v_categories)

In [None]:
# Report how many low-variance categorical columns were removed in the previous cell.
print('Number of low variance category columns dropped: {}'.format(len(low_v_categories)))

In [None]:
# Preview the frame after the low-variance categorical columns were removed.
df.head()

## Replace NaN values in numerical columns with the median of each column

In [None]:
# Fill the missing values via the project helper replace_NaN and keep its result as df.
df = replace_NaN(df)

In [None]:
# Preview the frame after the NaN replacement step.
df.head()

## Find date-like columns

In [None]:
# time_like is a project helper; its result is used below to select columns of df.
# NOTE(review): presumably it returns the labels of columns whose values look like dates -- confirm.
time_columns = time_like(df)

In [None]:
# Show the candidate columns so they can be checked by eye.
df[time_columns].head()

In [None]:
# Candidates that, on visual inspection, do not hold time stamps.
not_time_columns = ['ibe8588DPLHE7435F', 'ibe8840PMLTL7040B']
# Keep every remaining candidate, in its original order.
real_time_columns = list(filter(lambda name: name not in not_time_columns, time_columns))

In [None]:
# Convert the confirmed columns with the project helper convert_to_timestamp and keep its result as df.
df = convert_to_timestamp(df, real_time_columns)

In [None]:
# Check the converted columns.
df[real_time_columns].head()

## Save cleaned up dataframe as csv

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
filename = 'data/interim/cleaned_columns.csv'
df.to_csv(filename, index=False)

# Load cleaned data 

In [4]:
# Reload the cleaned data from the interim CSV and preview it.
# NOTE(review): the recorded output below starts with an 'Unnamed: 0' column,
# which suggests the file on disk was written with its row index at some
# point -- confirm the file matches the save cell above (index=False).
filename = 'data/interim/cleaned_columns.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,ibe2062AHFGH0763Q,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,01C,01C,35.0,09L,3,1,...,1,C1,,,13.0,13.0,5,9.0,1.0,1.0
1,0,1.0,11B,7.0,01C,01C,11.0,04M,1,0,...,0,A1,L1,,4.0,13.0,B,2.0,1.0,2.0
2,0,61.0,06X,12.0,03C,11C,38.0,12L,3,0,...,0,,,,25.0,5.0,1,15.0,1.0,1.0
3,0,13.0,12B,5.0,05C,07U,14.0,05M,1,0,...,0,,M1,,14.0,5.0,7,7.0,1.0,2.0
4,0,37.0,05X,7.0,05C,08C,33.0,11L,1,0,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


# Preprocess data

In [5]:
# Build the feature matrix and the label vector.
# The label is selected by name, not by position: the reloaded frame previewed
# above starts with a leftover 'Unnamed: 0' index column, so df.iloc[:, 0]
# returned the row index as the label and df.iloc[:, 1:] leaked the real
# label (FLOZVPMFT4626A) into the features.
target_column = 'FLOZVPMFT4626A'
# errors='ignore' makes this work whether or not the stray index column is present.
feature_frame = df.drop(columns=[target_column, 'Unnamed: 0'], errors='ignore')
X = pd.get_dummies(feature_frame).values
target = df[target_column].values

In [None]:
# Hold out a test set with a fixed seed so the split is repeatable.
# NOTE(review): the label is imbalanced, so consider stratify=target here.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    target,
    random_state=2,
)
print(Xtrain.shape, Xtest.shape)

In [None]:
# Standardise the features.
from sklearn.preprocessing import StandardScaler

# The scaling statistics are learned from the training rows only and then
# reused unchanged on the test rows.
scaler = StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [None]:
# Sanity check on one arbitrary column (index 8) of the scaled training data;
# both statistics are rounded to whole numbers before printing.
checked_column = Xtrain[:, 8]
print('Mean of random array: {}'.format(round(checked_column.mean())))
print('Std of random array: {}'.format(round(checked_column.std())))

# Models 

In [None]:
# Display names for the two classes, passed to classification_report below.
target_names = [str(label) for label in (0, 1)]

## Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit with scikit-learn's default settings, then predict the held-out rows.
clf = LogisticRegression().fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [None]:
# Accuracy of the logistic regression predictions on the test set.
accuracy_score(ytest, ypred)

In [None]:

# Per-class precision, recall and F1 for the logistic regression predictions.
print(classification_report(ytest, ypred,target_names=target_names))

## Support Vector Machine

## Stochastic Gradient Descent (SGD)
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data between epochs, so without a seed the fitted
# model and every score below change from run to run. random_state reuses the
# seed already used for the train/test split.
clf = SGDClassifier(loss="huber", max_iter=20, random_state=2)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [None]:
# Accuracy of the SGD classifier on the test set.
# The predictions are recomputed here even though the fitting cell already
# produced them, which keeps this cell runnable on its own.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
# Per-class precision, recall and F1 for the SGD predictions.
print(classification_report(ytest, ypred,target_names=target_names))

In [None]:
from sklearn.linear_model import SGDClassifier

# Same SGD set-up as the previous model, plus class_weight='balanced'
# (scikit-learn's mode that weights classes inversely to their frequency).
# random_state makes the run reproducible; it reuses the train/test split seed.
clf = SGDClassifier(loss="huber", max_iter=20, class_weight='balanced', random_state=2)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# Print the accuracy explicitly: as a bare expression followed by another
# statement its value was computed but never displayed.
print('Accuracy: {}'.format(accuracy_score(ytest, ypred)))
print(classification_report(ytest, ypred,target_names=target_names))

## Tree-based classifiers

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. random_state pins the choice between equally
# good splits so the scores are reproducible (same seed as the train/test split).
clf = DecisionTreeClassifier(max_depth=11, random_state=2)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# Print the accuracy explicitly: as a bare expression followed by another
# statement its value was computed but never displayed.
print('Accuracy: {}'.format(accuracy_score(ytest, ypred)))
print(classification_report(ytest, ypred,target_names=target_names))

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees trained on all available cores (n_jobs=-1) with balanced class
# weights. A random forest resamples rows and features at random, so
# random_state is set to make the scores below reproducible (same seed as the
# train/test split).
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=2)
clf.fit(Xtrain, ytrain)

In [None]:
# Predict the held-out rows with the random forest and show the accuracy.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
# Per-class precision, recall and F1 for the random forest predictions.
print(classification_report(ytest, ypred,target_names=target_names))

### Boosted Trees 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Default gradient boosting settings. random_state seeds the individual trees
# so repeated runs give the same model (same seed as the train/test split).
clf = GradientBoostingClassifier(random_state=2)

In [None]:
# Fit the boosted trees on the training split (kept in its own cell, separate from the model construction).
clf.fit(Xtrain, ytrain)

In [None]:
# Predict the held-out rows with the boosted trees and show the accuracy.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
# Per-class precision, recall and F1 for the boosted-trees predictions.
print(classification_report(ytest, ypred,target_names=target_names))

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Train a Gaussian naive Bayes classifier with default settings on the training split.
clf = GaussianNB()
clf.fit(Xtrain, ytrain)

In [None]:
# Predict the held-out rows with the naive Bayes model and show the accuracy.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
# Per-class precision, recall and F1 for the naive Bayes predictions.
print(classification_report(ytest, ypred,target_names=target_names))