In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder , normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from imblearn.over_sampling import SMOTE

In [42]:
train = pd.read_csv("click_data_train_sample_2.csv" , index_col=[0])

In [44]:
train.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,source_id,publisher_id,publish_time,document_id_y,campaign_id,advertiser_id
0,2,125211,0,79a85fa78311b9,1794259,81,2,US>CA>807,93.0,407.0,2016-06-14 00:00:00,876083,16054,772
1,2,156535,0,79a85fa78311b9,1794259,81,2,US>CA>807,93.0,407.0,2016-06-14 00:00:00,630534,16636,380
2,2,169564,0,79a85fa78311b9,1794259,81,2,US>CA>807,93.0,407.0,2016-06-14 00:00:00,1394819,20109,640
3,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,93.0,407.0,2016-06-13 17:00:00,1394819,20109,640
4,1743,169564,0,70e5221764a830,1777537,121119,2,US>CA>862,93.0,407.0,2016-06-13 17:00:00,1394819,20109,640


In [45]:
train.isna().sum()

display_id       0
ad_id            0
clicked          0
uuid             0
document_id_x    0
timestamp        0
platform         0
geo_location     0
source_id        0
publisher_id     0
publish_time     0
document_id_y    0
campaign_id      0
advertiser_id    0
dtype: int64

In [46]:
train["clicked"].value_counts()

0    6885
1    1656
Name: clicked, dtype: int64

In [47]:
# is_NaN = train.isnull()
# row_has_NaN = is_NaN.any(axis=1)
# df[row_has_NaN]

In [48]:
def add_timestamp (row):
    return datetime.timestamp(datetime.strptime(str(row["publish_time"]) , '%Y-%m-%d %H:%M:%S'))

In [49]:
def get_country (row , le) :
    country = str(row["geo_location"]).split('>')[0]
    inp = list()
    inp.append(country)
    return le.transform(inp)[0]

In [9]:
def get_state (row):
    geos = str(row["geo_location"]).split('>')
    if len(geos) == 1 :
        geos.append(np.nan)
    return geos[1]

In [10]:
def get_categorical_uuid (row , le):
    inp = list()
    inp.append(row["uuid"])
    return le.transform(inp)[0]

In [11]:
def add_new_features (df):
    '''
    this function adds extra features to our dataset the features are as follows :
    1) doc_timestamp : publish time of the document 
    2) country : the country of where the user is clicking in the ad 
    3) state : the state of where the user is clicking in the ad
    4) uuid_c : the number that the uuid was mapped to 
    '''
    df["doc_timestamp"] = df.apply(add_timestamp , axis = 1)
    
    # creating label encoder for mapping each country to a number 
    vals = train.apply(lambda row : str(row["geo_location"]).split('>')[0] , axis = 1)
    le_country = LabelEncoder()
    u_vals_country = vals.unique() 
    le_country.fit(u_vals_country)
    df["country"] = df.apply(lambda row : get_country(row , le_country) , axis = 1)
    
    # deleted for now
    #df["state"] = df.apply(get_state , axis = 1)
    
    #creating categorical encoder in order to map each uuid to a class number 
    le_uuid = LabelEncoder()
    u_vals_uuid = df["uuid"].unique() 
    le_uuid.fit(u_vals_uuid)
    df["uuid_c"] = df.apply(lambda row : get_categorical_uuid(row , le_uuid) , axis = 1)
    return df

In [76]:
def clean_dataframe (df):
    cols_to_remove = ["publish_time" , "geo_location" , "uuid"]
    df = df.drop(cols_to_remove , axis = 1)
    X = df.drop("clicked" ,  axis = 1)
    y = df["clicked"]
    X_norm = normalize(X)
    return X , y 

In [87]:
df = add_new_features(train)

In [140]:
X , y = clean_dataframe(df)

In [52]:
# X.head()

In [169]:
X_new = SelectKBest(chi2, k=4).fit_transform(X, y)

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=42)

In [158]:
class_weight = {1 : 10 , 
                0 : 1}

In [145]:
# oversample = SMOTE()
# X_ov, y_ov = oversample.fit_resample(X_train, y_train)

In [159]:
param_grid = {'criterion':['gini','entropy'],
              'max_depth': np.arange(3, 15) 
             }

In [160]:
model = GridSearchCV(DecisionTreeClassifier(class_weight = class_weight), param_grid)

In [152]:
# model = tree.DecisionTreeClassifier()

In [171]:
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                              class_weight={0: 1, 1: 10},
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_g

In [172]:
y_pred = model.predict(X_test)

In [173]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.30      0.44      2291
           1       0.20      0.77      0.32       528

    accuracy                           0.39      2819
   macro avg       0.52      0.53      0.38      2819
weighted avg       0.73      0.39      0.42      2819



In [62]:
metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

0.46744439629984574