# Click-Through Rate Prediction

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
train = pd.read_csv('train.gz',compression='gzip')

In [None]:
train.head()

In [None]:
# 'hour' is stored as an integer in YYMMDDHH form; adding 2000000000 prefixes the
# century (-> YYYYMMDDHH). Vectorized addition avoids one Python call per row.
train['hour'] = train['hour'] + 2000000000

In [None]:
# Parse the YYYYMMDDHH integers in one vectorized call, then render them as
# 'YYYY-MM-DD-HH' strings (the format the later string comparisons rely on).
train['hour'] = pd.to_datetime(train['hour'].astype(str), format='%Y%m%d%H').dt.strftime('%Y-%m-%d-%H')

In [None]:
train = train.rename(columns={"hour": "date"})

In [None]:
train.groupby('date')['id'].count().reset_index()

The data covers 10 days of click-stream data, from 2014-10-21 to 2014-10-30; we use only the data from 2014-10-21.

In [None]:
# Keep only the rows of 2014-10-21 (both bounds inclusive; the strings sort chronologically).
is_first_day = train['date'].between('2014-10-21-00', '2014-10-21-23')
train = train[is_first_day]

In [None]:
import os

# No earlier cell writes finaltrain.csv, so on a fresh run cache the one-day
# subset built above; later runs just reload the cached file.
if not os.path.exists("finaltrain.csv"):
    train.to_csv("finaltrain.csv", index=False)
train = pd.read_csv("finaltrain.csv")

In [None]:
train.shape

(4122995, 24)

In [None]:
train.dtypes

In [None]:
train['click'].value_counts()

In [None]:
# seaborn / matplotlib are already imported in the first cell; no need to re-import here.
# Bar chart of clicked (1) vs. not-clicked (0) impressions.
sns.countplot(x='click',data=train, palette='hls')
plt.show()

In [None]:
train['click'].value_counts()/len(train)

Click through rate is approx. 17%, and approx. 83% is not clicked.

###  Feature engineering for date time features

#### Hour

Extract hour from date time feature.

In [None]:
# The last two characters of 'date' ('YYYY-MM-DD-HH') are the hour of day.
train['hour_of_day'] = train['date'].str[-2:].astype('int64')

In [None]:
# Total number of clicks for each hour of the day, drawn as a line chart.
clicks_per_hour = train.groupby('hour_of_day')[['click']].sum()
clicks_per_hour.plot(figsize=(12,6))
plt.ylabel('Number of clicks')
plt.title('click trends by hour of day');
plt.show()

In [None]:
train.head(3)

In [None]:
# Rows: hour of day; columns: click value (0/1); cells: number of impressions.
hour_click_counts = train.groupby(['hour_of_day', 'click']).size().unstack()
hour_click_counts.plot(kind='bar', title="Hour of Day", figsize=(12,6))
plt.ylabel('count')
plt.title('Hourly impressions vs. clicks');
plt.show()

There is nothing shocking here.

Now that we have clicks and impressions, we can calculate the click-through rate (CTR). CTR is the ratio of ad clicks to impressions. It measures the rate of clicks on each ad.

#### Hourly CTR

In [None]:
# Clicked impressions only; later cells reuse this frame.
df_click = train[train['click'] == 1]

# One groupby produces both counts on the same hour_of_day key, so an hour with
# zero clicks gets clicks=0 instead of shifting the following rows.
# (click is treated as a 0/1 flag, so its sum is the number of clicks.)
df_hour = (
    train.groupby('hour_of_day')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_hour['CTR'] = df_hour['clicks']/df_hour['impressions']*100

plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='hour_of_day', data=df_hour)
plt.title('Hourly CTR');
plt.show()

### C1 feature

C1 is one of the anonymized categorical features

In [None]:
print(train.C1.value_counts()/len(train))

C1 value = 1005 has the most data, almost 92%. And then we can calculate the CTR of each C1 value.

In [None]:
C1_values = train.C1.unique()
C1_values.sort()
# Mean of the 0/1 click flag per C1 value, computed in a single pass. Looking the
# value up by label avoids feeding np.where positions to .loc, which only works
# while the frame has a default RangeIndex.
ctr_by_c1 = train.groupby('C1')['click'].mean()
ctr_avg_list=[]
for i in C1_values:
    ctr_avg=ctr_by_c1.loc[i]
    ctr_avg_list.append(ctr_avg)
    print("{}: click through rate: {}".format(i,ctr_avg))

In [None]:
train.groupby(['C1', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='C1 histogram');
plt.show()

In [None]:
# Impressions (row count) and clicks (sum of the 0/1 flag) per C1 value from one
# groupby, so a value without any click keeps clicks=0 on its own row.
df_c1 = (
    train.groupby('C1')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_c1['CTR'] = df_c1['clicks']/df_c1['impressions']*100

plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='C1', data=df_c1)
plt.title('CTR by C1');
plt.show()

The average CTR in the data is 0.17.

In [None]:
train['click'].mean()

In [None]:
df_c1.CTR.describe()

The important C1 values and CTR pairs are:

1005: 92% of the data and 0.17 CTR

1002: 5.5% of the data and 0.21 CTR

1010: 2.3% of the data and 0.096 CTR

1002 has a much higher than average CTR and 1010 has a much lower than average CTR, so these two C1 values seem important for predicting CTR.

### Banner position



In [None]:
print(train.banner_pos.value_counts()/len(train))

In [None]:
banner_pos = train.banner_pos.unique()
banner_pos.sort()
# Mean of the 0/1 click flag per banner position in a single pass; label lookup
# replaces .loc[np.where(...)], which depends on a default RangeIndex.
ctr_by_banner_pos = train.groupby('banner_pos')['click'].mean()
ctr_avg_list=[]
for i in banner_pos:
    ctr_avg=ctr_by_banner_pos.loc[i]
    ctr_avg_list.append(ctr_avg)
    print("{}: click through rate: {}".format(i,ctr_avg))

The important banner positions are:

position 0: 72% of the data and 0.16 CTR

position 1: 28% of the data and 0.18 CTR

In [None]:
train.groupby(['banner_pos', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='banner position histogram');
plt.show()

In [None]:
# Impressions and clicks per banner position from one groupby; a position
# that never got a click keeps clicks=0 instead of misaligning the other rows.
df_banner = (
    train.groupby('banner_pos')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_banner['CTR'] = df_banner['clicks']/df_banner['impressions']*100
# Bars are ordered from highest to lowest CTR.
sort_banners = df_banner.sort_values(by='CTR',ascending=False)['banner_pos'].tolist()
plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='banner_pos', data=df_banner, order=sort_banners)
plt.title('CTR by banner position');
plt.show()

In [None]:
df_banner.CTR.describe()

### Site features

#### site id

In [None]:
print("There are {} sites in the data set".format(train.site_id.nunique()))

In [None]:
print('The top 10 site ids that have the most impressions')
print((train.site_id.value_counts()/len(train))[0:10])

In [None]:
top10_ids = (train.site_id.value_counts()/len(train))[0:10].index
click_avg_list=[]

# CTR of the ten most frequent site ids, computed once and looked up by label
# (instead of .loc[np.where(...)], which depends on a default RangeIndex).
ctr_by_site_id = train[train.site_id.isin(top10_ids)].groupby('site_id')['click'].mean()
for i in top10_ids:
    click_avg=ctr_by_site_id.loc[i]
    click_avg_list.append(click_avg)
    print("for site id value: {},  click through rate: {}".format(i,click_avg))

In [None]:
top10_sites = train[(train.site_id.isin((train.site_id.value_counts()/len(train))[0:10].index))]
top10_sites_click = top10_sites[top10_sites['click'] == 1]
top10_sites.groupby(['site_id', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='Top 10 site ids histogram');
plt.show()

In [None]:
# Impressions and clicks per site id from one groupby, so both columns share
# the same site_id key even when a site has no click at all.
df_site = (
    top10_sites.groupby('site_id')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_site['CTR'] = df_site['clicks']/df_site['impressions']*100
# Bars are ordered from highest to lowest CTR.
sort_site = df_site.sort_values(by='CTR',ascending=False)['site_id'].tolist()
plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='site_id', data=df_site, order=sort_site)
plt.title('CTR by top 10 site id');
plt.show()

The CTRs of the top 10 sites differ significantly from the overall CTR of 0.17.

#### site domain

In [None]:
print("There are {} site domains in the data set".format(train.site_domain.nunique()))

In [None]:
print('The top 10 site domains that have the most impressions')
print((train.site_domain.value_counts()/len(train))[0:10])

In [None]:
top10_domains = (train.site_domain.value_counts()/len(train))[0:10].index
click_avg_list=[]

# CTR of the ten most frequent site domains, computed once and looked up by label
# (instead of .loc[np.where(...)], which depends on a default RangeIndex).
ctr_by_site_domain = train[train.site_domain.isin(top10_domains)].groupby('site_domain')['click'].mean()
for i in top10_domains:
    click_avg=ctr_by_site_domain.loc[i]
    click_avg_list.append(click_avg)
    print("for site domain value: {},  click through rate: {}".format(i,click_avg))

In [None]:
top10_domain = train[(train.site_domain.isin((train.site_domain.value_counts()/len(train))[0:10].index))]
top10_domain_click = top10_domain[top10_domain['click'] == 1]
top10_domain.groupby(['site_domain', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='Top 10 site domains histogram');
plt.show()

In [None]:
# Impressions and clicks per site domain from one groupby, so both columns
# share the same site_domain key even when a domain has no click at all.
df_domain = (
    top10_domain.groupby('site_domain')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_domain['CTR'] = df_domain['clicks']/df_domain['impressions']*100
# Bars are ordered from highest to lowest CTR.
sort_domain = df_domain.sort_values(by='CTR',ascending=False)['site_domain'].tolist()
plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='site_domain', data=df_domain, order=sort_domain)
plt.title('CTR by top 10 site domain');
plt.show()

As with the site_id feature, the site_domain feature seems important as well.

#### site category

In [None]:
print("There are {} site categories in the data set".format(train.site_category.nunique()))

In [None]:
print('The top 10 site categories that have the most impressions')
print((train.site_category.value_counts()/len(train))[0:10])

In [None]:
top10_categories = (train.site_category.value_counts()/len(train))[0:10].index
click_avg_list=[]

# CTR of the ten most frequent site categories, computed once and looked up by label
# (instead of .loc[np.where(...)], which depends on a default RangeIndex).
ctr_by_site_category = train[train.site_category.isin(top10_categories)].groupby('site_category')['click'].mean()
for i in top10_categories:
    click_avg=ctr_by_site_category.loc[i]
    click_avg_list.append(click_avg)
    print("for site category value: {},  click through rate: {}".format(i,click_avg))

In [None]:
top10_category = train[(train.site_category.isin((train.site_category.value_counts()/len(train))[0:10].index))]
top10_category_click = top10_category[top10_category['click'] == 1]
top10_category.groupby(['site_category', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='Top 10 site categories histogram');
plt.show()

In [None]:
# Impressions and clicks per site category from one groupby, so both columns
# share the same site_category key even when a category has no click at all.
df_category = (
    top10_category.groupby('site_category')['click']
    .agg(impressions='count', clicks='sum')
    .reset_index()
)
df_category['CTR'] = df_category['clicks']/df_category['impressions']*100
# Bars are ordered from highest to lowest CTR.
sort_category = df_category.sort_values(by='CTR',ascending=False)['site_category'].tolist()
plt.figure(figsize=(12,6))
sns.barplot(y='CTR', x='site_category', data=df_category, order=sort_category)
plt.title('CTR by top 10 site category');
plt.show()

### Device features

#### device id

In [None]:
print("There are {} devices in the data set".format(train.device_id.nunique()))

In [None]:
print('The top 10 devices that have the most impressions')
print((train.device_id.value_counts()/len(train))[0:10])

In [None]:
top10_devices = (train.device_id.value_counts()/len(train))[0:10].index
click_avg_list=[]

# CTR of the ten most frequent device ids, computed once and looked up by label
# (instead of .loc[np.where(...)], which depends on a default RangeIndex).
ctr_by_device_id = train[train.device_id.isin(top10_devices)].groupby('device_id')['click'].mean()
for i in top10_devices:
    click_avg=ctr_by_device_id.loc[i]
    click_avg_list.append(click_avg)
    print("for device id value: {},  click through rate: {}".format(i,click_avg))

In [None]:
top10_device = train[(train.device_id.isin((train.device_id.value_counts()/len(train))[0:10].index))]
top10_device_click = top10_device[top10_device['click'] == 1]
top10_device.groupby(['device_id', 'click']).size().unstack().plot(kind='bar', figsize=(12,6), title='Top 10 device ids histogram');
plt.show()

#### device ip

Device ip is effectively the user's IP address, so there are a lot of distinct values.

In [None]:
print("There are {} device ips in the data set".format(train.device_ip.nunique()))
print("There are {} device types in the data set".format(train.device_type.nunique()))
print("There are {} device models in the data set".format(train.device_model.nunique()))
print("There are {} device cnn types in the data set".format(train.device_conn_type.nunique()))

#### device type

In [None]:
print('The impressions by device types')
print((train.device_type.value_counts()/len(train)))

In [None]:
train[['device_type','click']].groupby(['device_type','click']).size().unstack().plot(kind='bar', title='device types');
plt.show()

Device type 1 gets the most impressions and clicks, while the other device types get very few of either. We may want to look at device type 1 in more detail.

In [None]:
df_click[df_click['device_type']==1].groupby(['hour_of_day', 'click']).size().unstack().plot(kind='bar', title="Clicks from device type 1 by hour of day", figsize=(12,6));
plt.show()

As expected, most clicks happened during the business hours from device type 1. device type is definitely an important feature.

In [None]:
# Clicks per device type (df_click only holds clicked rows, so the sum is the click count).
device_type_click = df_click.groupby('device_type', as_index=False)['click'].sum()
# Impressions per device type: number of rows in the full frame.
device_type_impression = (
    train.groupby('device_type', as_index=False)['click']
    .count()
    .rename(columns={'click': 'impressions'})
)
# Inner join: only device types present in both tables are kept.
merged_device_type = device_type_click.merge(device_type_impression, how='inner', on='device_type')

In [None]:
merged_device_type['CTR'] = merged_device_type['click'] / merged_device_type['impressions']*100

In [None]:
merged_device_type

The highest CTR comes from device type 0.

#### app features

In [None]:
print("There are {} apps in the data set".format(train.app_id.nunique()))
print("There are {} app domains in the data set".format(train.app_domain.nunique()))
print("There are {} app categories in the data set".format(train.app_category.nunique()))

App category looks like something worth exploring.

In [None]:
print('The impressions by app categories')
print((train.app_category.value_counts()/len(train)))

In [None]:
# value_counts() counts rows (impressions) per app category, not clicks,
# so the title names what is actually plotted.
train['app_category'].value_counts().plot(kind='bar', title='Impressions by app category');

In [None]:
train_app_category = train.groupby(['app_category', 'click']).size().unstack()
train_app_category.div(train_app_category.sum(axis=1), axis=0).plot(kind='bar', stacked=True, title="Intra-category CTR");
plt.show()

#### C14 - C21 features

In [None]:
# Number of distinct values of each anonymized feature C14..C21
# (C21 belongs to this section as well and is present in the data).
for col in ['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']:
    print("There are {} {} in the data set".format(train[col].nunique(), col))

In [None]:
train.groupby(['C15', 'click']).size().unstack().plot(kind='bar', stacked=True, title='C15 distribution');

In [None]:
train.groupby(['C16', 'click']).size().unstack().plot(kind='bar', stacked=True, title='C16 distribution');

In [None]:
train.groupby(['C18', 'click']).size().unstack().plot(kind='bar', stacked=True, title='C18 distribution');

In [None]:
train.head(3)

In [None]:
def convert_obj_to_int(self):
    """Replace every object-dtype column of a DataFrame with an integer-coded copy.

    For each column whose dtype is ``object`` a new column named
    ``<column>_int`` is appended and the original column is dropped.
    The frame is modified in place and also returned.

    The code of a value is a signed 64-bit integer derived from a BLAKE2b
    digest of ``str(value)``. Unlike the built-in ``hash()``, whose result for
    strings is salted per interpreter process, this gives the same code in
    every run, so frames encoded in different sessions stay comparable.

    Parameters
    ----------
    self : pandas.DataFrame
        Frame to convert (mutated in place).

    Returns
    -------
    pandas.DataFrame
        The same frame object, with object columns replaced by ``*_int`` columns.
    """
    import hashlib
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def _stable_hash(value):
        # 8-byte digest -> fits a signed 64-bit integer (int64 column).
        digest = hashlib.blake2b(str(value).encode('utf-8'), digest_size=8).digest()
        return int.from_bytes(digest, byteorder='big', signed=True)

    new_col_suffix = '_int'
    # Snapshot the object columns first: the loop below adds and drops columns.
    object_columns = [col for col, dtype in self.dtypes.items() if dtype == object]
    for col in object_columns:
        self[col + new_col_suffix] = self[col].map(_stable_hash)
        self.drop([col], inplace=True, axis=1)
    return self
train = convert_obj_to_int(train)

In [None]:
train.head(3)

Unnamed: 0,id,click,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,date_int,site_id_int,site_domain_int,site_category_int,app_id_int,app_domain_int,app_category_int,device_id_int,device_ip_int,device_model_int
0,1.000009e+18,0,1005,0,1,2,15706,320,50,1722,0,35,-1,79,-4894633712509586256,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-2421482248107373158,-5542378729067133763
1,1.000017e+19,0,1005,0,1,0,15704,320,50,1722,0,35,100084,79,-4894633712509586256,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1043140342598562817,5437923702529426165
2,1.000037e+19,0,1005,0,1,0,15704,320,50,1722,0,35,100084,79,-4894633712509586256,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,8614466526076247504,-4276541463420655263


In [None]:
train.drop('id', axis=1, inplace=True)

In [None]:
train.drop('date_int', axis=1, inplace=True)

In [None]:
train.head()

Unnamed: 0,click,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,site_id_int,site_domain_int,site_category_int,app_id_int,app_domain_int,app_category_int,device_id_int,device_ip_int,device_model_int
0,0,1005,0,1,2,15706,320,50,1722,0,35,-1,79,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-2421482248107373158,-5542378729067133763
1,0,1005,0,1,0,15704,320,50,1722,0,35,100084,79,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1043140342598562817,5437923702529426165
2,0,1005,0,1,0,15704,320,50,1722,0,35,100084,79,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,8614466526076247504,-4276541463420655263
3,0,1005,0,1,0,15706,320,50,1722,0,35,100084,79,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-8711247937882937440,-101930351471260202
4,0,1005,1,1,0,18993,320,50,2161,0,35,-1,157,-8722046770264499790,-8594600896970571619,6796581121582936263,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-8942623890986269100,-2891236374950521443


# ML

## Logistic Regression

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# The target must not be part of the feature matrix, otherwise the model is
# trained and evaluated with the answer as an input (label leakage).
x_train, x_test, y_train, y_test = train_test_split(train.drop('click', axis=1), train['click'], test_size=0.1, random_state=101)

In [None]:
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)

In [None]:
Predictions = logmodel.predict(x_test)

In [None]:
print(confusion_matrix(y_test, Predictions))

- True negative: 331547 (We predicted a negative result and it was negative)
- True positive: 1292 (We predicted a positive result and it was positive)
- False positive: 9008 (We predicted a positive result and it was negative)
- False negative: 70453 (We predicted a negative result and it was positive)

- Accuracy = (TP+TN)/total
- Accuracy = (1292+331547)/412300 ~ 80.7%

## Degree-2 Polynomial Mappings (Poly2)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly = PolynomialFeatures(degree = 2)
x_train_poly = poly.fit_transform(x_train)
poly_model = LogisticRegression()
poly_model.fit(x_train_poly, y_train)

In [None]:
# poly_model was fitted on the degree-2 expansion, so the test features must go
# through the same (already fitted) transformer before predicting.
Predictions_2 = poly_model.predict(poly.transform(x_test))

In [None]:
print(classification_report(y_test,Predictions_2))

In [None]:
print(confusion_matrix(y_test, Predictions_2))

# Convert Data From csv to libffm

In [None]:
class FFMFormatPandas:
    """Convert a pandas DataFrame into libffm text lines.

    Every output line has the form ``label field:feature:value ...``:

    * object-dtype columns are treated as categorical: each distinct value gets
      its own feature index and is written with value ``1``;
    * integer, unsigned, float and boolean columns are treated as numeric: the
      column itself is the feature and the cell content is the value.

    Attributes
    ----------
    field_index_ : dict or None
        Maps a column name to its field number (columns in sorted order,
        label column excluded).
    feature_index_ : dict or None
        Maps ``'<column>_<value>'`` (categorical value) and ``'<column>'``
        (numeric column) to a feature number.
    y : str or None
        Name of the label column.
    """

    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature indices from ``df``.

        Calling ``fit`` again keeps every index assigned earlier and only
        appends indices for columns/values not seen before.

        Parameters
        ----------
        df : pandas.DataFrame
            Data to index.
        y : str, optional
            Name of the label column; it gets no field number.

        Returns
        -------
        FFMFormatPandas
            ``self``.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = dict()
        for col in df_ffm:
            # New columns get the next free field number; known ones keep theirs.
            self.field_index_.setdefault(col, len(self.field_index_))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        else:
            # Continue after the highest index in use (starting AT the maximum
            # would hand an existing index to the first new feature).
            last_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Index of the column itself (used for numeric columns); never
            # reassigned on a later fit.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit the indices on ``df`` and return its libffm lines."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Format one row as a libffm line.

        Parameters
        ----------
        row : pandas.Series
            One row of the frame (as yielded by ``DataFrame.iterrows``).
        t : dict
            Column name -> dtype of the frame the row comes from.

        Returns
        -------
        str
            ``'label field:feature:value ...'``. The label is ``'0'`` when no
            label column is configured or the row does not contain it.

        Raises
        ------
        KeyError
            If the row holds a column or categorical value not seen by ``fit``.
        """
        if self.y is None or self.y not in row.index:
            label = str(0)
        else:
            label_val = row[self.y]
            # iterrows() may upcast an all-numeric row to float; write integer
            # labels as integers again.
            if t[self.y].kind in ('i', 'u', 'b'):
                label = str(int(label_val))
            else:
                label = str(label_val)
        ffm = [label]

        for col, val in row.items():
            if col == self.y:
                continue
            # fit() allocates no feature for nulls, so they are left out here too.
            if pd.isnull(val):
                continue
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind in ('i', 'u', 'b'):
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], int(val)))
            elif kind == 'f':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series (same index as ``df``) with one libffm line per row."""
        t = df.dtypes.to_dict()
        # A list keeps one entry per row even when index labels repeat.
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index, dtype=object)

In [None]:
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='click')

In [None]:
X_train, X_test = train_test_split(ffm_train_data, test_size = 0.1, random_state = 5)

In [None]:
# Write one libffm line per row as plain text. to_csv with sep=' ' would wrap every
# line in quotes (each line contains the separator), and mode='a' would append a
# second copy of the data whenever the cell is re-run.
for ffm_path, ffm_lines in (('x_train.txt', X_train), ('x_test.txt', X_test)):
    with open(ffm_path, 'w') as ffm_file:
        ffm_file.writelines(line + '\n' for line in ffm_lines)

# FM

In [None]:
# No earlier cell imports xLearn, so bring the `xl` alias into scope here.
import xlearn as xl

# Factorization machine trained on the libffm files written above;
# the fitted model is stored in ./model.out.
fm_model = xl.create_fm()
fm_model.setTrain('x_train.txt')
fm_model.setValidate('x_test.txt')
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}
fm_model.fit(param, "./model.out")

In [None]:
fm_model.cv(param)

In [None]:
import os

# The model was saved to ./model.out by fit(), so load it from that path.
# The output directory is created up front in case it does not exist yet.
os.makedirs("output", exist_ok=True)
fm_model.setTest("x_test.txt")
fm_model.setSigmoid()  # sigmoid output, thresholded at 0.5 when scored
fm_model.predict("./model.out", "output/predictions.txt")

In [None]:
# Score the FM output itself: `Predictions` / `y_test` belong to the logistic
# regression and to a different train/test split.
# The true label is the first token of every libffm line in x_test.txt.
with open('x_test.txt') as fm_test_file:
    fm_true = [int(float(line.strip().strip('"').split(' ', 1)[0])) for line in fm_test_file if line.strip()]
# One score per line; with setSigmoid() it is thresholded at 0.5.
fm_scores = np.atleast_1d(np.loadtxt('output/predictions.txt'))
fm_pred = (fm_scores >= 0.5).astype(int)
print(classification_report(fm_true, fm_pred))

# FFM

In [None]:
ffm_model = xl.create_ffm()
ffm_model.setTrain('x_train.txt')
ffm_model.setValidate('x_test.txt')
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}
ffm_model.fit(param, "./model.out")

In [None]:
ffm_model.cv(param)

In [None]:
import os

# The model was saved to ./model.out by fit(), so load it from that path.
# The output directory is created up front in case it does not exist yet.
os.makedirs("output", exist_ok=True)
ffm_model.setTest("x_test.txt")
ffm_model.setSigmoid()  # sigmoid output, thresholded at 0.5 when scored
ffm_model.predict("./model.out", "output/predictions.txt")

In [None]:
# Score the FFM output itself: `Predictions` / `y_test` belong to the logistic
# regression and to a different train/test split.
# The true label is the first token of every libffm line in x_test.txt.
with open('x_test.txt') as ffm_test_file:
    ffm_true = [int(float(line.strip().strip('"').split(' ', 1)[0])) for line in ffm_test_file if line.strip()]
# One score per line; with setSigmoid() it is thresholded at 0.5.
ffm_scores = np.atleast_1d(np.loadtxt('output/predictions.txt'))
ffm_pred = (ffm_scores >= 0.5).astype(int)
print(classification_report(ffm_true, ffm_pred))

## Imbalance - Learn

### Over-Sampling Method

In [None]:
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC,
                                    KMeansSMOTE)
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

In [None]:
# Separate majority and minority classes
df_majority = train[train.click==0]
df_minority = train[train.click==1]

In [None]:
# Draw clicked rows with replacement until there are as many as non-clicked rows.
# The target size is taken from the data instead of a hard-coded row count.
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

In [None]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [None]:
df_upsampled.click.value_counts()

1    3404777
0    3404777
Name: click, dtype: int64

In [None]:
y = df_upsampled.click
X = df_upsampled.drop('click', axis=1)

In [None]:
clf_1 = LogisticRegression().fit(X, y)



In [None]:
pred_y_1 = clf_1.predict(X)

In [None]:
print(np.unique(pred_y_1))

[0 1]


In [None]:
print(accuracy_score(y, pred_y_1))

0.5652167528152358


In [None]:
print(classification_report(y, pred_y_1))

              precision    recall  f1-score   support

           0       0.59      0.45      0.51   3404777
           1       0.55      0.68      0.61   3404777

    accuracy                           0.57   6809554
   macro avg       0.57      0.57      0.56   6809554
weighted avg       0.57      0.57      0.56   6809554



In [None]:
print(confusion_matrix(y, pred_y_1))

[[1524618 1880159]
 [1080521 2324256]]


### Using SMOTE

In [None]:
# Split features and target separately: leaving 'click' inside the feature
# matrix would leak the label into both SMOTE and the classifier.
X_train, X_test, y_train, y_test = train_test_split(train.drop('click', axis=1), train['click'], test_size=0.25, random_state=27)

In [None]:
# `ratio` was replaced by `sampling_strategy` in imbalanced-learn;
# 1.0 asks for as many minority samples as majority samples after resampling.
sm = SMOTE(random_state=27, sampling_strategy=1.0)

In [None]:
# `fit_resample` is the current name of the former `fit_sample` method.
# Only the training split is resampled; the test split keeps the real class balance.
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
smote = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [None]:
smote_pred = smote.predict(X_test)

In [None]:
accuracy_score(y_test, smote_pred)

0.4883051062867876

In [None]:
print(classification_report(y_test, smote_pred))

              precision    recall  f1-score   support

           0       0.87      0.45      0.59    851178
           1       0.21      0.68      0.32    179571

    accuracy                           0.49   1030749
   macro avg       0.54      0.56      0.45   1030749
weighted avg       0.75      0.49      0.54   1030749



In [None]:
print(confusion_matrix(y_test, smote_pred))

[[380756 470422]
 [ 57007 122564]]


### Under-Sampling

In [None]:
y = train.click
X = train.drop('click', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

In [None]:
X = pd.concat([X_train, y_train], axis=1)

In [None]:
not_click = X[X.click==0]
click = X[X.click==1]

In [None]:
not_click_downsampled = resample(not_click,
                                replace = False, # sample without replacement
                                n_samples = len(click), # match minority n
                                random_state = 27) # reproducible results

In [None]:
downsampled = pd.concat([not_click_downsampled, click])

In [None]:
downsampled.click.value_counts()

1    538647
0    538647
Name: click, dtype: int64

In [None]:
y_train = downsampled.click
X_train = downsampled.drop('click', axis=1)

In [None]:
undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [None]:
undersampled_pred = undersampled.predict(X_test)

In [None]:
accuracy_score(y_test, undersampled_pred)

0.488814444641712

In [None]:
print(classification_report(y_test, undersampled_pred))

              precision    recall  f1-score   support

           0       0.87      0.45      0.59    851178
           1       0.21      0.68      0.32    179571

    accuracy                           0.49   1030749
   macro avg       0.54      0.56      0.45   1030749
weighted avg       0.75      0.49      0.54   1030749



In [None]:
print(confusion_matrix(y_test, undersampled_pred))

[[381394 469784]
 [ 57120 122451]]


### Validating and Checking Accuracy on Actual Test Data

In [None]:
test = pd.read_csv('test.gz',compression='gzip')

In [None]:
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,69f45779,0eb711ec,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,e8d44657,ecb851b2,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,10fb085b,1f0bc64f,1,0,22676,320,50,2616,0,35,100083,51
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,422d257a,542422a7,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,078c6b38,1f0bc64f,1,0,23160,320,50,2667,0,47,-1,221


In [None]:
# 'hour' is an integer in YYMMDDHH form: add the century, parse it and render it
# as 'YYYY-MM-DD-HH' (same format as the training frame), all vectorized.
test['hour'] = pd.to_datetime((test['hour'] + 2000000000).astype(str), format='%Y%m%d%H').dt.strftime('%Y-%m-%d-%H')

test = test.rename(columns={"hour": "date"})

# The training frame gets an 'hour_of_day' feature; build it here the same way so
# the test frame has the same feature columns as the one the models are fitted on.
test['hour_of_day'] = test['date'].str[-2:].astype('int64')

test.groupby('date')['id'].count().reset_index()

Unnamed: 0,date,id
0,2014-10-31-00,90990
1,2014-10-31-01,94948
2,2014-10-31-02,117478
3,2014-10-31-03,98818
4,2014-10-31-04,129317
5,2014-10-31-05,145333
6,2014-10-31-06,268602
7,2014-10-31-07,291074
8,2014-10-31-08,208737
9,2014-10-31-09,299162


In [None]:
test.shape

(4577464, 23)

In [None]:
test = convert_obj_to_int(test)

In [None]:
test.head(3)

Unnamed: 0,id,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,date_int,site_id_int,site_domain_int,site_category_int,app_id_int,app_domain_int,app_category_int,device_id_int,device_ip_int,device_model_int
0,1.000017e+19,1005,0,1,0,8330,320,50,761,3,175,100075,23,-8395007613602687299,-3800484223899672490,1318479553209310443,-8893414329068012023,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-7118180830746867152,-1586125472635000893
1,1.000018e+19,1005,0,1,0,22676,320,50,2616,0,35,100083,51,-8395007613602687299,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1020342899736387605,-8797371277426231385
2,1.000055e+19,1005,0,1,0,22676,320,50,2616,0,35,100083,51,-8395007613602687299,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1319519836399229941,6324588784001879413


In [None]:
test.drop('id', axis=1, inplace=True)
test.drop('date_int', axis=1, inplace=True)

test.head()

Unnamed: 0,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,site_id_int,site_domain_int,site_category_int,app_id_int,app_domain_int,app_category_int,device_id_int,device_ip_int,device_model_int
0,1005,0,1,0,8330,320,50,761,3,175,100075,23,-3800484223899672490,1318479553209310443,-8893414329068012023,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,-7118180830746867152,-1586125472635000893
1,1005,0,1,0,22676,320,50,2616,0,35,100083,51,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1020342899736387605,-8797371277426231385
2,1005,0,1,0,22676,320,50,2616,0,35,100083,51,563252818748676209,2678970633790439129,936290421697444965,-4829673787941248262,-613020616740582092,-1508161623420999242,-6325914545430670019,1319519836399229941,6324588784001879413
3,1005,0,1,0,18648,320,50,1092,3,809,100156,61,-3041040940942331937,-1758382057870279444,1995042437856922505,-3539984906138104239,8575684353713020006,6742733635879456869,-6325914545430670019,7837753533344330423,6092412861452464846
4,1005,0,1,0,23160,320,50,2667,0,47,-1,221,-3041040940942331937,-1758382057870279444,1995042437856922505,3958632988940458928,1194798377035643008,169897843225842710,-6325914545430670019,-3822529733723338898,6324588784001879413


In [None]:
pred_y_1 = clf_1.predict(test)

In [None]:
print(np.unique(pred_y_1))

[0 1]


In [None]:
print(pred_y_1)

[1 1 1 ... 1 0 0]


In [None]:
sampleSubmission = pd.read_csv('sampleSubmission.gz',compression='gzip')

In [None]:
sampleSubmission.head()

Unnamed: 0,id,click
0,10000174058809263569,0.5
1,10000182526920855428,0.5
2,10000554139829213984,0.5
3,10001094637809798845,0.5
4,10001377041558670745,0.5


In [None]:
# The sample submission's 'click' column holds a probability (0.5 in every row),
# so write P(click = 1) instead of hard 0/1 labels. Column 1 of predict_proba
# corresponds to clf_1.classes_[1].
# NOTE(review): assumes the rows of `test` are still in sampleSubmission's id order - confirm.
sampleSubmission['click'] = clf_1.predict_proba(test)[:, 1]

In [None]:
sampleSubmission.to_csv('sampleSubmission.csv', mode = 'w', index=False)