In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from src import find_duplicates

# Load data

In [84]:
filename = 'data/raw/targeting_model_data.csv' 
data = pd.read_csv(filename)

  interactivity=interactivity, compiler=compiler, result=result)


In [85]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 562 entries, FLOZVPMFT4626A to ibe9588SEUQO0831A
dtypes: float64(292), int64(129), object(141)
memory usage: 428.8+ MB


In [86]:
print('Number of columns: {}'.format(data.shape[1]))
print('Number of rows: {}'.format(data.shape[0]))

Number of columns: 562
Number of rows: 100000


In [87]:
# quick look at data 
data.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1270UUGLZ8167Q,ibe1271DCBOP1538T,ibe1271UEOXF6805V,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1274BHMXG3244U,ibe1275QYWDP9371S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,3,12B,3,9.0,E,01C,3,01C,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,3,11B,3,7.0,E,01C,3,01C,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,3,06X,3,12.0,E,03C,3,11C,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,3,12B,3,5.0,E,05C,3,07U,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,3,05X,3,7.0,E,05C,3,08C,...,0,0,0,0,0,1,0,0,0,0


In [88]:
# Composition of target data, this demonstrates imbalanced data. And thus accuracy alone is not a good metric for assessing performance of model. 
data['FLOZVPMFT4626A'].value_counts()

0    74528
1    25472
Name: FLOZVPMFT4626A, dtype: int64

# Clean up data

## Find and drop dulicate columns

In [89]:
duplicates = find_duplicates(data)

In [90]:
print('Number of duplicate columns dropped: {}'.format(len(duplicates)))

Number of duplicate columns dropped: 10


In [91]:
# data frame excluding dropped columns 
df = data.drop(columns=duplicates)

In [92]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,0,0,0,0,1,0,0,0,0


## Drop columns that have > 80% missing values 

In [93]:
def find_empty_columns(df, threshold = 0.80):
    empty_columns = []
    for column in df:
        if df[column].isna().sum()/len(df) > threshold:
            empty_columns.append(column)
    return empty_columns
        

In [94]:
empty_columns = find_empty_columns(df)

In [96]:
print('Number of mostly empty columns dropped: {}'.format(len(empty_columns)))

Number of mostly empty columns dropped: 199


In [98]:
# data frame excluding dropped columns 
df = df.drop(columns=empty_columns)

In [99]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9588RHJVI3838A,ibe9588PFUUV4361R,ibe9588KHKHQ6606C,ibe9588VBRCG7737V,ibe9588LZKVW3171Q,ibe9588XPLHJ3729U,ibe9588AWQKW9305L,ibe9588HVWQG4124G,ibe9588YWABE2309I,ibe9588SEUQO0831A
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,0,0,0,0,1,0,0,0,0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,0,0,0,0,0,0,0,0,0
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,0,0,0,0,0,1,0,0,0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,0,0,0,0,1,0,0,0,0


## For data type int64 or float64 drop columns with low variance
motivated by the Variance Threshold function https://scikit-learn.org/stable/modules/feature_selection.html

In [100]:
def find_low_variance(data, threshold = 0.18):
    """Finds columns with low variance. 
    
    Takes a dataframe as input. Creates a list of columns with low threshold. 
    These columns can then be dropped from original dataframe. 
    """
    low_variance_columns = []
    for column in data.columns: 
        if (data[column].dtype == 'float64') or (data[column].dtype == 'int64'): 
            if data[column].var() < threshold:
                low_variance_columns.append(column)
    return low_variance_columns

In [101]:
# data frame excluding dropped columns 
low_var_columns = find_low_variance(df)
df.drop(columns=low_var_columns, inplace=True)

In [102]:
print('Number of low variance columns dropped: {}'.format(len(low_var_columns)))

Number of low variance columns dropped: 159


In [103]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1273KQXUT2596F,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,E,01C,01C,35.0,09L,3,...,1,C1,,,13.0,13.0,5,9.0,,
1,0,1.0,11B,7.0,E,01C,01C,11.0,04M,1,...,0,A1,L1,,4.0,13.0,B,2.0,,2.0
2,0,61.0,06X,12.0,E,03C,11C,38.0,12L,3,...,0,,,,25.0,5.0,1,15.0,,
3,0,13.0,12B,5.0,E,05C,07U,14.0,05M,1,...,0,,M1,,14.0,5.0,7,7.0,,2.0
4,0,37.0,05X,7.0,E,05C,08C,33.0,11L,1,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


## Drop Categorical values with low variance by converting to labels to dummy variables and summing the standard deviation 

In [106]:
def find_low_var_categories(data, threshold = 0.18):
    """Finds columns with low variance. 
    
    Takes a dataframe as input. Creates a list of columns with low threshold. 
    These columns can then be dropped from original dataframe. 
    """
    low_variance_columns = []
    for column in data.columns: 
        if data[column].dtype == 'O': 
            if pd.get_dummies(df[column]).var().sum() < threshold:
                low_variance_columns.append(column)
    return low_variance_columns
    

In [107]:
low_v_categories = find_low_var_categories(df)
df.drop(columns=low_v_categories, inplace=True)

In [108]:
print('Number of low variance category columns dropped: {}'.format(len(low_v_categories)))

Number of low variance category columns dropped: 7


In [109]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,ibe2062AHFGH0763Q,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,01C,01C,35.0,09L,3,1,...,1,C1,,,13.0,13.0,5,9.0,,
1,0,1.0,11B,7.0,01C,01C,11.0,04M,1,0,...,0,A1,L1,,4.0,13.0,B,2.0,,2.0
2,0,61.0,06X,12.0,03C,11C,38.0,12L,3,0,...,0,,,,25.0,5.0,1,15.0,,
3,0,13.0,12B,5.0,05C,07U,14.0,05M,1,0,...,0,,M1,,14.0,5.0,7,7.0,,2.0
4,0,37.0,05X,7.0,05C,08C,33.0,11L,1,0,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


## Replace NaN values in numerical columns with median value of series 

In [110]:
def replace_NaN(data):
    """Replace NaN values with the median from numerical column. 
    
    Takes a dataframe as input. Iterates through the dataframe for numeric types columns.   
    Replaces any NaN values with median value of column.  
    """

    for column in data.columns: 
        if data[column].dtype != 'O' : 
            data[column].fillna(data[column].median(), inplace=True)

    return data

In [111]:
df= replace_NaN(df)

In [112]:
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,ibe2062AHFGH0763Q,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,01C,01C,35.0,09L,3,1,...,1,C1,,,13.0,13.0,5,9.0,1.0,1.0
1,0,1.0,11B,7.0,01C,01C,11.0,04M,1,0,...,0,A1,L1,,4.0,13.0,B,2.0,1.0,2.0
2,0,61.0,06X,12.0,03C,11C,38.0,12L,3,0,...,0,,,,25.0,5.0,1,15.0,1.0,1.0
3,0,13.0,12B,5.0,05C,07U,14.0,05M,1,0,...,0,,M1,,14.0,5.0,7,7.0,1.0,2.0
4,0,37.0,05X,7.0,05C,08C,33.0,11L,1,0,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


## Find date like columns

In [113]:
def time_like(df):
    """Finds time like columns. 

    """
    time_columns = []
    for column in df:
        if df[column].dtype != 'O':
            if df[column].mean()/1000> 1:
                time_columns.append(column)
    return time_columns

In [114]:
time_columns = time_like(df)

In [115]:
df[time_columns].head()

Unnamed: 0,ibe6532KHUBA7864D,ibe6533DZDLI9594P,ibe8434JPYKM2838C,ibe8579PIXYM2487A,ibe8588DPLHE7435F,ibe8592TOWAM1138U,ibe8614UPZWA9445N,ibe8643XQWMK2933S,ibe8840PMLTL7040B,ibe9042PFXFK2434Y,ibe9047QMSFT7844Y,ibe9052HKVXC1161K,ibe9057AEWDA2240T,ibe9152JHMZI9585O
0,20150513.0,20140426.0,2012.0,200609.0,1751.0,1982.0,20161.0,200700.0,24034.0,2009.0,20160106.0,2007.0,20160106.0,20140900.0
1,20141025.0,20140804.0,2011.0,201107.0,2957.0,1997.0,20153.0,201107.0,80785.0,2003.0,20160106.0,1998.0,20160106.0,20141000.0
2,20141215.0,20140804.0,2012.0,200609.0,1751.0,1982.0,20153.0,200700.0,24034.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0
3,20150707.0,20140804.0,2012.0,200609.0,1751.0,1982.0,20154.0,201306.0,24034.0,2012.0,20160106.0,2007.0,20160106.0,20140100.0
4,20151015.0,20140804.0,2013.0,200811.0,910.0,1950.0,20154.0,200811.0,87135.0,2009.0,20160106.0,2007.0,20160106.0,20140300.0


In [116]:
# Visually inspecting items that are not potential time stamps
not_time_columns = ['ibe8588DPLHE7435F', 'ibe8840PMLTL7040B']
real_time_columns = [item for item in time_columns if item not in not_time_columns]

In [132]:
def convert_to_timestamp(df, time_columns):
    for column in df[time_columns]: 
        # convert year 
        if len(str(int(df[column][0]))) == 4:
            # convert to time stamp and then to int 
            df[column] = pd.to_datetime(df[column].astype(int), format = '%Y').astype(np.int64)
            
        elif len(str(int(df[column][0]))) == 5:
            # convert to time stamp and then to int 
            df[column] = pd.to_datetime(df[column].astype(int), format = '%Y%m').astype(np.int64)
        
        # strip the year 
        elif len(str(int(df[column][0]))) == 6:
            # truncate to year month and convert to time stamp, then to int 
            df[column] = df[column].astype(int).astype(str).apply(lambda x: x[:4])
            df[column] = pd.to_datetime(df[column], format = '%Y').astype(np.int64)
            
        
    return df 

In [133]:
df = convert_to_timestamp(df, real_time_columns)

In [136]:
df[ real_time_columns].head()

Unnamed: 0,ibe6532KHUBA7864D,ibe6533DZDLI9594P,ibe8434JPYKM2838C,ibe8579PIXYM2487A,ibe8592TOWAM1138U,ibe8614UPZWA9445N,ibe8643XQWMK2933S,ibe9042PFXFK2434Y,ibe9047QMSFT7844Y,ibe9052HKVXC1161K,ibe9057AEWDA2240T,ibe9152JHMZI9585O
0,1430438400000000000,1396310400000000000,1325376000000000000,1136073600000000000,378691200000000000,1451606400000000000,1167609600000000000,1230768000000000000,1451606400000000000,1167609600000000000,1451606400000000000,1409529600000000000
1,1412121600000000000,1406851200000000000,1293840000000000000,1293840000000000000,852076800000000000,1425168000000000000,1293840000000000000,1041379200000000000,1451606400000000000,883612800000000000,1451606400000000000,1412121600000000000
2,1417392000000000000,1406851200000000000,1325376000000000000,1136073600000000000,378691200000000000,1425168000000000000,1167609600000000000,1230768000000000000,1451606400000000000,1167609600000000000,1451606400000000000,1393632000000000000
3,1435708800000000000,1406851200000000000,1325376000000000000,1136073600000000000,378691200000000000,1427846400000000000,1356998400000000000,1325376000000000000,1451606400000000000,1167609600000000000,1451606400000000000,1388534400000000000
4,1443657600000000000,1406851200000000000,1356998400000000000,1199145600000000000,-631152000000000000,1427846400000000000,1199145600000000000,1230768000000000000,1451606400000000000,1167609600000000000,1451606400000000000,1393632000000000000


## Save cleaned up dataframe as csv

In [137]:
filename = 'data/interim/cleaned_columns.csv'
df.to_csv(path_or_buf=filename, index=False)

# Load cleaned data 

In [138]:
filename = 'data/interim/cleaned_columns.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,FLOZVPMFT4626A,ibe1270YRFHJ3350O,ibe1271DCBOP1538T,ibe1273MMNAC5195E,ibe1274DFDFF5102Q,ibe1275QYWDP9371S,ibe1280ORQKP6566Z,ibe1281AGNWU9303H,ibe1281VSZLA4159S,ibe2062AHFGH0763Q,...,ibe9153QSXNN0648A,ibe9154GOSYR7154P,ibe9180FFUYI1365V,ibe9181PWJGU8847L,ibe9350NHRIV6568X,ibe9351VNIYI1676Y,ibe9356VXVDJ5952B,ibe9358UBJWE4744M,ibe9509UGCNU4337M,ibe9514RWCHD8503K
0,0,17.0,12B,9.0,01C,01C,35.0,09L,3,1,...,1,C1,,,13.0,13.0,5,9.0,1.0,1.0
1,0,1.0,11B,7.0,01C,01C,11.0,04M,1,0,...,0,A1,L1,,4.0,13.0,B,2.0,1.0,2.0
2,0,61.0,06X,12.0,03C,11C,38.0,12L,3,0,...,0,,,,25.0,5.0,1,15.0,1.0,1.0
3,0,13.0,12B,5.0,05C,07U,14.0,05M,1,0,...,0,,M1,,14.0,5.0,7,7.0,1.0,2.0
4,0,37.0,05X,7.0,05C,08C,33.0,11L,1,0,...,0,B6,,,13.0,10.0,6,10.0,1.0,2.0


# Preprocess data

In [144]:
# Load data 
X =pd.get_dummies(df.iloc[:,1:len(df.columns)]).values
target = df.iloc[:,0].values

In [145]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, target,
                                                random_state=2)
print(Xtrain.shape, Xtest.shape)

(75000, 1287) (25000, 1287)


In [146]:
# preprocess data 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xtrain)  # fit only on training data
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)  # apply same transformation to test data

In [147]:
# verify preprocess is working correctly 
print('Mean of random array: {}'.format(round(Xtrain[:,8].mean())))
print('Std of random array: {}'.format(round(Xtrain[:,8].std())))

Mean of random array: 0.0
Std of random array: 1.0


# Try Logistic Regression as a classifier 

In [148]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)



In [149]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, ypred)

0.77612

In [155]:
from sklearn.metrics import classification_report
target_names = ['0', '1']
print(classification_report(ytest, ypred,target_names=target_names))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86     18705
           1       0.61      0.32      0.41      6295

   micro avg       0.78      0.78      0.78     25000
   macro avg       0.70      0.62      0.64     25000
weighted avg       0.75      0.78      0.75     25000



## Try SGD 
https://scikit-learn.org/stable/tutorial/machine_learning_map/

https://scikit-learn.org/stable/modules/sgd.html#classification


In [157]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="huber", max_iter=20)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

In [158]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.72468

In [159]:
print(classification_report(ytest, ypred,target_names=target_names))

              precision    recall  f1-score   support

           0       0.78      0.87      0.83     18705
           1       0.43      0.28      0.34      6295

   micro avg       0.72      0.72      0.72     25000
   macro avg       0.61      0.58      0.58     25000
weighted avg       0.69      0.72      0.70     25000



# Regression Forrest 

In [160]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=11)
clf.fit(Xtrain, ytrain)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [161]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.75304

In [162]:
print(classification_report(ytest, ypred,target_names=target_names))

              precision    recall  f1-score   support

           0       0.79      0.91      0.85     18705
           1       0.52      0.30      0.38      6295

   micro avg       0.75      0.75      0.75     25000
   macro avg       0.65      0.60      0.61     25000
weighted avg       0.72      0.75      0.73     25000



In [163]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [164]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

0.78076

In [165]:
print(classification_report(ytest, ypred,target_names=target_names))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87     18705
           1       0.69      0.24      0.35      6295

   micro avg       0.78      0.78      0.78     25000
   macro avg       0.74      0.60      0.61     25000
weighted avg       0.76      0.78      0.74     25000



# Boosted Trees 

In [166]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                subsample=.8, max_features=.5)


In [None]:
clf.fit(Xtrain, ytrain)

In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred,target_names=target_names))

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(Xtrain, ytrain)


In [None]:
ypred = clf.predict(Xtest)
accuracy_score(ytest, ypred)