In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from src import find_duplicates

# Load data

In [255]:
# Path to the raw CSV, relative to the notebook's working directory.
filename = 'data/raw/targeting_model_data.csv' 
# NOTE(review): the truncated warning printed under this cell looks like the tail of a pandas
# DtypeWarning (mixed-type columns) — confirm, and consider passing dtype= explicitly.
data = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [256]:
# Summarise the raw frame: index range, column count, dtype breakdown and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 562 entries, FLOZVPMFT4626A to ibe9588SEUQO0831A
dtypes: float64(292), int64(129), object(141)
memory usage: 428.8+ MB


In [257]:
# Report the raw frame's dimensions: column count first, then row count.
print('Number of columns: {}'.format(len(data.columns)))
print('Number of rows: {}'.format(len(data)))

Number of columns: 562
Number of rows: 100000


In [258]:
# Quick look at the first five rows of the raw data.
data.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1270UUGLZ8167Q,ibe1271DCBOP1538T,ibe1271UEOXF6805V,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1274BHMXG3244U,ibe1275QYWDP9371S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,3,12B,3,9.0,E,01C,3,01C,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,3,11B,3,7.0,E,01C,3,01C,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,3,06X,3,12.0,E,03C,3,11C,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,3,12B,3,5.0,E,05C,3,07U,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,3,05X,3,7.0,E,05C,3,08C,...,0,0,0,0,0,1,0,0,0,0


In [259]:
# Class composition of the target column. The counts below (74528 zeros vs 25472 ones) show
# that the data is imbalanced, so accuracy alone is not a good metric for assessing the model.
data['FLOZVPMFT4626A'].value_counts()

0    74528
1    25472
Name: FLOZVPMFT4626A, dtype: int64

# Clean up data

## Find and drop duplicate columns

In [260]:
# NOTE(review): find_duplicates comes from the project's `src` package and is not shown here;
# presumably it returns the labels of redundant columns, since the result is later passed to
# DataFrame.drop(columns=...) — confirm against its implementation.
duplicates = find_duplicates(data)

In [261]:
# Report how many duplicate columns were found; the actual drop happens in the next cell.
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

Number of duplicate columns dropped: 10


In [262]:
# Working frame without the duplicate columns. drop() returns a new frame here (no inplace),
# so the raw `data` frame is left untouched.
df = data.drop(columns=duplicates)

In [263]:
# Preview the working frame after the duplicate columns have been removed.
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,0,0,0,0,1,0,0,0,0


## Drop int64 / float64 columns with low variance
Motivated by scikit-learn's `VarianceThreshold` feature selector: https://scikit-learn.org/stable/modules/feature_selection.html

In [264]:
def find_low_variance(data, threshold = 0.3):
    """Return the names of numeric columns whose spread is below ``threshold``.

    Only columns whose dtype is ``float64`` or ``int64`` are examined. A column
    is reported when its standard deviation (``Series.std()``) is strictly less
    than ``threshold``. Note that the comparison is made on the standard
    deviation, not on the variance. The input dataframe is not modified; the
    returned list can be passed to ``DataFrame.drop(columns=...)``.
    """
    numeric_kinds = ('float64', 'int64')
    return [
        name
        for name in data.columns
        if any(data[name].dtype == kind for kind in numeric_kinds)
        and data[name].std() < threshold
    ]
    
            
            

In [265]:
# Identify the low-spread numeric columns, then remove them from the working frame.
low_var_columns = find_low_variance(df)
# NOTE(review): the in-place drop mutates `df`, so re-running this cell operates on the
# already-reduced frame rather than on the frame the earlier cells displayed.
df.drop(columns=low_var_columns, inplace=True)

In [266]:
# Report how many numeric columns were removed by the low-spread filter.
print('Number of low variance columns dropped: {}'.format(len(low_var_columns)))

Number of low variance columns dropped: 288


In [267]:
# Preview the working frame after the low-spread numeric columns have been removed.
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K,ibe9557OYAAN8636H
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,C1,,,13.0,13.0,5,9.0,,,
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,A1,L1,,4.0,13.0,B,2.0,,2.0,
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,,,,25.0,5.0,1,15.0,,,
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,,M1,,14.0,5.0,7,7.0,,2.0,
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,B6,,,13.0,10.0,6,10.0,1.0,2.0,


## Drop categorical columns with low variance by converting their labels to dummy variables and summing the standard deviations of those dummies

In [None]:
def find_low_var_categories(data, threshold = 0.3):
    """Return the names of non-numeric columns whose one-hot encoding has low spread.

    Every column whose dtype is neither ``float64`` nor ``int64`` is one-hot
    encoded with ``pd.get_dummies``. The standard deviations of the resulting
    indicator columns are summed, and the column is reported when that sum is
    strictly less than ``threshold``. The input dataframe is not modified; the
    returned list can be passed to ``DataFrame.drop(columns=...)``.
    """
    low_variance_columns = []
    for column in data.columns:
        column_dtype = data[column].dtype
        # `and`, not `or`: a dtype always differs from at least one of the two
        # names, so an `or` test would let every numeric column through.
        if (column_dtype != 'float64') and (column_dtype != 'int64'):
            # Encode the column of the frame that was passed in, not a notebook
            # global; float indicators keep std() well defined.
            indicators = pd.get_dummies(data[column], dtype=float)
            if indicators.std().sum() < threshold:
                low_variance_columns.append(column)
    return low_variance_columns
    

In [None]:
# Identify the low-spread categorical columns, then remove them from the working frame.
low_v_categories = find_low_var_categories(df)
# NOTE(review): in-place drop; re-running this cell operates on the already-reduced frame.
df.drop(columns=low_v_categories, inplace=True)

In [None]:
# Report how many categorical columns were removed by the low-spread filter.
print('Number of low variance category columns dropped: {}'.format(len(low_v_categories)))

In [None]:
# Preview the working frame after the low-spread categorical columns have been removed.
df.head()

## Replace NaN values in numerical columns 

In [None]:
def replace_NaN(data):
    """Replace NaN values in numeric columns with the column median.

    Takes a dataframe as input and iterates over its columns. For every column
    whose dtype is ``float64`` or ``int64``, missing values are replaced with
    that column's median. Columns of any other dtype are left untouched.

    The dataframe is modified in place and the same object is returned, so
    both ``replace_NaN(df)`` and ``df = replace_NaN(df)`` work.
    """
    for column in data.columns:
        if (data[column].dtype == 'float64') or (data[column].dtype == 'int64'):
            # Assign the filled column back through the frame. Calling
            # fillna(inplace=True) on the selected column may act on a temporary
            # object (pandas copy-on-write) and leave `data` unchanged.
            data[column] = data[column].fillna(data[column].median())

    return data

In [None]:
# Fill missing numeric values with the per-column median.
df= replace_NaN(df)

In [None]:
# Preview the working frame after the numeric NaN values have been filled.
df.head()

## Find date-like columns, convert them to timestamps and then to integers, in order to create proper temporal spacing between events

In [197]:
def time_like(df):
    """Find numeric columns whose first value looks long enough to be a date.

    Takes a dataframe as input. A column is reported when its dtype is
    ``float64`` or ``int64`` and the string form of its first value is longer
    than four characters (for example ``2012.0`` -> ``'2012.0'`` or
    ``20150513`` -> ``'20150513'``). This is only a length heuristic, so the
    returned list can contain columns that are not dates and should be
    reviewed before conversion.

    Returns a list of column labels in the original column order; the list is
    empty when the dataframe has no rows.
    """
    time_columns = []
    # No first row to inspect: nothing can be classified.
    if len(df) == 0:
        return time_columns
    for column in df.columns:
        if (df[column].dtype == 'float64') or (df[column].dtype == 'int64'):
            # Positional lookup, so the check does not depend on the index
            # containing the label 0.
            if len(str(df[column].iloc[0])) > 4:
                time_columns.append(column)
    return time_columns

In [199]:
# Collect the candidate date columns in this cell so it also runs on a fresh kernel
# (Restart & Run All), then preview them.
time_columns = time_like(df)
df[time_columns].head()

Unnamed: 0,ibe6532KHUBA7864D,ibe6533DZDLI9594P,ibe8434JPYKM2838C,ibe8562YXQYT0403F,ibe8563SBHJG6696A,ibe8579PIXYM2487A,ibe8588DPLHE7435F,ibe8592TOWAM1138U,ibe8614UPZWA9445N,ibe8643XQWMK2933S,ibe8840PMLTL7040B,ibe8841KKDQS9567K,ibe8842JMOXL7394P,ibe9042PFXFK2434Y,ibe9047QMSFT7844Y,ibe9052HKVXC1161K,ibe9057AEWDA2240T,ibe9152JHMZI9585O
0,20150513.0,20140426.0,2012.0,2009.0,2009.0,200609.0,1751.0,1982.0,20161.0,200700.0,24034.0,21646.5,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140900.0
1,20141025.0,20140804.0,2011.0,2009.0,2009.0,201107.0,2957.0,1997.0,20153.0,201107.0,80785.0,21646.5,20082.0,2003.0,20160106.0,1998.0,20160106.0,20141000.0
2,20141215.0,20140804.0,2012.0,2009.0,2009.0,200609.0,1751.0,1982.0,20153.0,200700.0,24034.0,21646.5,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0
3,20150707.0,20140804.0,2012.0,2009.0,2009.0,200609.0,1751.0,1982.0,20154.0,201306.0,24034.0,21646.5,20082.0,2012.0,20160106.0,2007.0,20160106.0,20140100.0
4,20151015.0,20140804.0,2013.0,2009.0,2009.0,200811.0,910.0,1950.0,20154.0,200811.0,87135.0,21646.5,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0


In [245]:
# Columns picked up by the length heuristic that, on visual inspection of the preview
# above, do not look like time stamps.
not_time_columns = ['ibe8588DPLHE7435F', 'ibe8840PMLTL7040B', 'ibe8841KKDQS9567K']
# Keep the remaining candidates, preserving their original order.
real_time_columns = [item for item in time_columns if item not in not_time_columns]

In [246]:
# Preview only the columns that were kept as date-like after the manual exclusion.
df[real_time_columns ].head()

Unnamed: 0,ibe6532KHUBA7864D,ibe6533DZDLI9594P,ibe8434JPYKM2838C,ibe8562YXQYT0403F,ibe8563SBHJG6696A,ibe8579PIXYM2487A,ibe8592TOWAM1138U,ibe8614UPZWA9445N,ibe8643XQWMK2933S,ibe8842JMOXL7394P,ibe9042PFXFK2434Y,ibe9047QMSFT7844Y,ibe9052HKVXC1161K,ibe9057AEWDA2240T,ibe9152JHMZI9585O
0,20150513.0,20140426.0,2012.0,2009.0,2009.0,200609.0,1982.0,20161.0,200700.0,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140900.0
1,20141025.0,20140804.0,2011.0,2009.0,2009.0,201107.0,1997.0,20153.0,201107.0,20082.0,2003.0,20160106.0,1998.0,20160106.0,20141000.0
2,20141215.0,20140804.0,2012.0,2009.0,2009.0,200609.0,1982.0,20153.0,200700.0,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0
3,20150707.0,20140804.0,2012.0,2009.0,2009.0,200609.0,1982.0,20154.0,201306.0,20082.0,2012.0,20160106.0,2007.0,20160106.0,20140100.0
4,20151015.0,20140804.0,2013.0,2009.0,2009.0,200811.0,1950.0,20154.0,200811.0,20082.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0


In [254]:
# Scratch check: ratio of the first value of this year-like column to 1900.
df['ibe8434JPYKM2838C'][0]/1900

1.0589473684210526

In [None]:
# TODO(review): unfinished cell — the assignment below has no right-hand side, so this cell
# raises SyntaxError as written. Complete the intended conversion or delete the cell.
for column in df.columns: 
    df[column] = 

In [235]:
# Parse a year-only column into timestamps: cast the floats to int first, then read the
# value with the '%Y' format.
hard_date = pd.to_datetime(df['ibe8562YXQYT0403F'].astype(int), format = '%Y')

In [233]:
# Parse a full YYYYMMDD column into timestamps.
# NOTE(review): this rebinds `hard_date`. The outputs shown further down (2009-01-01, ...)
# match a year-only parse, so on a top-to-bottom run they would differ — the execution
# counts (In [233] here vs In [235] above) suggest the cells were run out of order.
hard_date = pd.to_datetime(df['ibe6533DZDLI9594P'].astype(int), format = '%Y%m%d')

In [236]:
# Display the parsed timestamps.
hard_date

0       2009-01-01
1       2009-01-01
2       2009-01-01
3       2009-01-01
4       2009-01-01
5       2003-01-01
6       2009-01-01
7       2009-01-01
8       2009-01-01
9       2009-01-01
10      2009-01-01
11      2009-01-01
12      2009-01-01
13      2009-01-01
14      2009-01-01
15      2004-01-01
16      2009-01-01
17      2009-01-01
18      2009-01-01
19      2009-01-01
20      2009-01-01
21      2009-01-01
22      2009-01-01
23      2009-01-01
24      2009-01-01
25      2002-01-01
26      2009-01-01
27      2009-01-01
28      2009-01-01
29      2009-01-01
           ...    
99970   2009-01-01
99971   2009-01-01
99972   2009-01-01
99973   2009-01-01
99974   2003-01-01
99975   2009-01-01
99976   2009-01-01
99977   2009-01-01
99978   2009-01-01
99979   2009-01-01
99980   2013-01-01
99981   2009-01-01
99982   2004-01-01
99983   2009-01-01
99984   2003-01-01
99985   2009-01-01
99986   2009-01-01
99987   2009-01-01
99988   2009-01-01
99989   2009-01-01
99990   2009-01-01
99991   2009

In [237]:
# View the timestamps as int64. The values printed below are nanoseconds since the Unix
# epoch (1230768000000000000 corresponds to 2009-01-01).
hard_date.astype(np.int64)

0        1230768000000000000
1        1230768000000000000
2        1230768000000000000
3        1230768000000000000
4        1230768000000000000
5        1041379200000000000
6        1230768000000000000
7        1230768000000000000
8        1230768000000000000
9        1230768000000000000
10       1230768000000000000
11       1230768000000000000
12       1230768000000000000
13       1230768000000000000
14       1230768000000000000
15       1072915200000000000
16       1230768000000000000
17       1230768000000000000
18       1230768000000000000
19       1230768000000000000
20       1230768000000000000
21       1230768000000000000
22       1230768000000000000
23       1230768000000000000
24       1230768000000000000
25       1009843200000000000
26       1230768000000000000
27       1230768000000000000
28       1230768000000000000
29       1230768000000000000
                ...         
99970    1230768000000000000
99971    1230768000000000000
99972    1230768000000000000
99973    12307

## Save cleaned up dataframe as csv

In [160]:
# NOTE(review): `filename` is rebound here; earlier in the notebook it held the raw-data path.
filename = 'data/interim/cleaned_columns.csv'
# Write the cleaned frame without the index column.
df.to_csv(path_or_buf=filename, index=False)

In [195]:
# Reload the cleaned frame from the CSV written above. This replaces the in-memory `df`,
# and column dtypes are re-inferred by read_csv.
df = pd.read_csv(filename)

In [196]:
# Preview the reloaded frame.
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,1,C1,,,13.0,13.0,5,9.0,1.0,1.0
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,A1,L1,,4.0,13.0,B,2.0,1.0,2.0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,,,,25.0,5.0,1,15.0,1.0,1.0
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,,M1,,14.0,5.0,7,7.0,1.0,2.0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


In [123]:
# Spot-check the dtype of one column after the round trip through CSV.
df['ibe1273MMNAC5195E'].dtype

dtype('float64')

## Convert datetime to int 

In [None]:
# Toy frame with integer dates in YYYYMMDD form, used to try the conversion in isolation.
foo_df = pd.DataFrame({'date':[20170601,20180601,20190601]})
foo_df

In [None]:
# Check how get_dummies treats an integer date column.
# NOTE(review): by default get_dummies only encodes object/string/category columns, so an
# integer column is presumably returned unchanged — confirm on the installed pandas version.
pd.get_dummies(foo_df)

In [None]:
# Parse the toy integer dates via their string form using the YYYYMMDD format.
# NOTE(review): rebinds `hard_date`, which is also used in the date-column section above.
hard_date = pd.to_datetime(foo_df['date'].astype(str), format = '%Y%m%d')

In [None]:
# Display the parsed toy timestamps.
hard_date

In [None]:
# View the toy timestamps as int64 values.
hard_date.astype(np.int64)

# Load and preprocess data 

In [166]:
# Build the feature matrix from the columns at positions 1..49 (get_dummies one-hot encodes
# the non-numeric ones) and take the column at position 0 as the target.
# NOTE(review): this rebinds `data`, which earlier held the raw DataFrame, to a NumPy array.
data =pd.get_dummies(df.iloc[:,1:50]).values
# NOTE(review): assumes position 0 is still the target 'FLOZVPMFT4626A' after the column
# drops above — confirm, or select the target by name.
target = df.iloc[:,0].values

In [167]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; random_state fixes the shuffle. The printed
# shapes below show a 75000 / 25000 row split.
# NOTE(review): the split is not stratified although the target is imbalanced — consider
# passing stratify=target.
Xtrain, Xtest, ytrain, ytest = train_test_split(data, target,
                                                random_state=2)
print(Xtrain.shape, Xtest.shape)

(75000, 599) (25000, 599)


In [168]:
# Standardise the features with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler
# Don't cheat - the scaling statistics are learned from the training data only.
scaler = StandardScaler().fit(Xtrain)
# Apply the same fitted transformation to both the training and the test data.
Xtrain, Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

In [169]:
# Sanity-check the scaling on one training feature (column index 8 — a fixed column, despite
# the "random array" wording in the messages).
# NOTE(review): round() rounds to the nearest integer, so this check is coarse; any mean in
# (-0.5, 0.5) and any std in (0.5, 1.5) prints the same result.
print('Mean of random array: {}'.format(round(Xtrain[:,8].mean())))
print('Std of random array: {}'.format(round(Xtrain[:,8].std())))

Mean of random array: 0.0
Std of random array: 1.0


# Try Logistic Regression as a classifier 

In [170]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier with default hyper-parameters, fitted on the scaled training split.
# NOTE(review): `clf` and `ypred` are rebound by every model section below.
clf = LogisticRegression()
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)



In [171]:
from sklearn.metrics import accuracy_score
# Test accuracy of the logistic regression. For reference, always predicting class 0 on the
# full data would score about 0.745 (74528 / 100000), so this is only a small improvement.
accuracy_score(ytest, ypred)

0.76912

In [172]:
from sklearn.metrics import confusion_matrix
# Rows are true classes and columns are predicted classes (scikit-learn convention). The
# output below shows that 4913 of the 6295 true positives in the test split are predicted as 0.
print(confusion_matrix(ytest, ypred))

[[17846   859]
 [ 4913  1382]]


## Try SGD 
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [173]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent, capped at 20 passes.
# NOTE(review): no random_state is set, so the fitted model (and the accuracy below) can vary
# between runs.
clf = SGDClassifier(loss="huber", max_iter=20)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [174]:
# Test accuracy of the SGD classifier. The prediction repeats the one made at the end of the
# previous cell.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.7474

# Decision Tree and Random Forest classifiers

In [175]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree limited to depth 11.
# NOTE(review): random_state is not set (see the repr below), so results may vary between runs.
clf = DecisionTreeClassifier(max_depth=11)
clf.fit(Xtrain, ytrain)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [176]:
# Test accuracy of the decision tree.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.75824

In [177]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees with a fixed seed; n_jobs=-1 uses all available cores.
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [178]:
# Test accuracy of the random forest.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.77204

In [179]:
# Importances of the fitted random forest, one value per column of the one-hot encoded
# feature matrix. The raw array carries no column names.
clf.feature_importances_

array([2.23433274e-02, 4.98401887e-02, 4.34703956e-02, 4.44615559e-03,
       4.87421697e-03, 6.09243018e-03, 6.23425980e-03, 3.39313078e-03,
       5.23181238e-03, 4.72705316e-03, 4.94208029e-02, 2.99107909e-03,
       1.43540116e-02, 3.51092452e-03, 3.12217396e-02, 3.37875411e-02,
       3.12180282e-02, 2.70634609e-02, 2.47330982e-02, 3.50226619e-02,
       1.79446127e-03, 3.30490104e-03, 8.86124065e-04, 1.80188930e-03,
       1.28516703e-03, 3.78183661e-03, 2.53432944e-03, 1.95008573e-03,
       2.05178909e-03, 1.27773918e-03, 2.24502940e-03, 2.22954373e-03,
       1.01324799e-03, 2.00023322e-03, 1.46543229e-03, 1.21428998e-03,
       1.24122576e-03, 4.26854919e-04, 1.56676157e-03, 9.87125261e-04,
       5.34411703e-04, 8.08011554e-04, 2.37444337e-03, 1.93866498e-03,
       2.05900056e-03, 1.35551473e-03, 3.84528483e-03, 1.39290058e-03,
       2.52362872e-03, 5.22891780e-03, 9.33458923e-04, 2.63617680e-03,
       1.98727997e-03, 8.37961597e-04, 9.92680205e-04, 1.84790669e-03,
      

# PCA

In [None]:
from sklearn.decomposition import PCA
# One-hot encode the columns at positions 1..199 after replacing every missing value with 0.
# NOTE(review): the features are not standardised before PCA here, so columns with large
# numeric ranges will dominate the retained variance — confirm this is intended.
X = pd.get_dummies(df.iloc[:,1:200].fillna(value=0))
# NOTE(review): rebinds `clf`, used for classifiers elsewhere, to a PCA transformer.
clf = PCA(0.95) # keep 95% of variance
X_trans = clf.fit_transform(X)
# Compare the dimensionality before and after the projection.
print(X.shape)
print(X_trans.shape)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 100 closest training points, fitted on the
# scaled training split.
clf = KNeighborsClassifier(n_neighbors=100)
clf.fit(Xtrain, ytrain)


In [None]:
# Test accuracy of the k-nearest-neighbours classifier.
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)