In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from src import find_duplicates

# Load data

In [2]:
filename = 'data/raw/targeting_model_data.csv' 
data = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# according to panda Columns 49, 191, 476 have mixed data types.

In [None]:
data.info()

In [None]:
print('Number of columns: {}'.format(data.shape[1]))
print('Number of rows: {}'.format(data.shape[0]))

In [None]:
# quick look at data 
data.head()

# Clean up data

## Find and drop dulicate columns

In [3]:
duplicates = find_duplicates(data)
duplicates

['ibe2061FCHWX2406S',
 'ibe2532YKXRG5625F',
 'ibe8603RPMCM3066K',
 'ibe9153OOCJX2004W',
 'ibe1270UUGLZ8167Q',
 'ibe1271UEOXF6805V',
 'ibe1274BHMXG3244U',
 'ibe1275SCSDK5796X',
 'ibe1280NVNNB6226D',
 'ibe7602USGYQ4120V']

In [4]:
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

Number of duplicate columns dropped: 10


In [5]:
# data frame excluding dropped columns 
df = data.drop(columns=duplicates)

In [6]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,0,0,0,0,1,0,0,0,0


## Convert categorical strings to values 

In [None]:
pd.get_dummies(df.iloc[:,1:30])

In [None]:
pd.get_dummies(df).shape

## Convert datetime to int 

In [None]:
foo_df = pd.DataFrame({'date':[20170601,20180601,20190601]})
foo_df

In [None]:
pd.get_dummies(foo_df)

In [None]:
hard_date = pd.to_datetime(foo_df['date'].astype(str), format = '%Y%m%d')

In [None]:
hard_date

In [None]:
hard_date.astype(np.int64)

# Load and preprocess data 

In [7]:
# Load data 
data =pd.get_dummies(df.iloc[:,1:30].fillna(value=0)).values
target = df.iloc[:,0].values

In [8]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(data, target,
                                                random_state=2)
print(Xtrain.shape, Xtest.shape)

(75000, 95) (25000, 95)


In [None]:
# preprocess data 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)  # Don't cheat - fit only on training data
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)  # apply same transformation to test data

In [None]:
# verify preprocess is working correctly 
print('Mean of random array: {}'.format(round(Xtrain[:,8].mean())))
print('Std of random array: {}'.format(round(Xtrain[:,8].std())))

# Try Logistic Regression as a classifier 

In [25]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)



In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred)

0.7666

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(ytest, ypred))

## Try SGD 
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [23]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="huber", max_iter=20)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [24]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.35816

# Regression Forrest 

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=11)
clf.fit(Xtrain, ytrain)


In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.74456

In [None]:
clf.feature_importances_

In [None]:
pd.get_dummies(df.iloc[:,1:100].fillna(value=0))

# PCA

In [None]:
from sklearn.decomposition import PCA
X = pd.get_dummies(df.iloc[:,1:200].fillna(value=0))
clf = PCA(0.95) # keep 95% of variance
X_trans = clf.fit_transform(X)
print(X.shape)
print(X_trans.shape)

## KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=100)
clf.fit(Xtrain, ytrain)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=100, p=2,
           weights='uniform')

In [22]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.7666