In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

In [3]:
# Load the three raw inputs: training log, campaign metadata, and test log.
t, campaign, v = (
    pd.read_csv('raw_data/train.csv'),
    pd.read_csv('raw_data/campaign_data.csv'),
    pd.read_csv('raw_data/test.csv'),
)

In [4]:
# Preview the first rows of the training frame.
t.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_open,is_click
0,42_14051,14051,42,01-09-2017 19:55,0,0
1,52_134438,134438,52,02-11-2017 12:53,0,0
2,33_181789,181789,33,24-07-2017 15:15,0,0
3,44_231448,231448,44,05-09-2017 11:36,0,0
4,29_185580,185580,29,01-07-2017 18:01,0,0


In [5]:
# Column dtypes of the training frame (send_date is still a plain string here).
t.dtypes

id             object
user_id         int64
campaign_id     int64
send_date      object
is_open         int64
is_click        int64
dtype: object

### Individual features

In [6]:
# Parse the day-first timestamp strings of the training log and keep the
# day-of-month as a separate feature column.
train_sent_at = pd.to_datetime(t['send_date'], format='%d-%m-%Y %H:%M')
t['send_date'] = train_sent_at
t['day'] = train_sent_at.dt.day

In [7]:
# Same timestamp parsing and day-of-month extraction for the test log.
test_sent_at = pd.to_datetime(v['send_date'], format='%d-%m-%Y %H:%M')
v['send_date'] = test_sent_at
v['day'] = test_sent_at.dt.day

In [8]:
# Count how many rows each user_id has across train and test combined.
train_user_ids = list(t.user_id)
test_user_ids = list(v.user_id)
user_frequencies = nltk.FreqDist(train_user_ids + test_user_ids)

In [9]:
# Frequency-encode each training row with its user's combined row count.
t['frequency_encoding'] = t['user_id'].map(lambda user_id: user_frequencies[user_id])

In [10]:
# Frequency-encode each test row with its user's combined row count.
v['frequency_encoding'] = v['user_id'].map(lambda user_id: user_frequencies[user_id])

In [11]:
# Character lengths of each campaign's subject line and e-mail body.
campaign['subject_len'] = campaign['subject'].map(len)
campaign['body_len'] = campaign['email_body'].map(len)

In [12]:
# Number of campaigns per communication type.
campaign.communication_type.value_counts()

Hackathon          13
Corporate          12
Newsletter          9
Conference          8
Upcoming Events     7
Others              2
Webinar             1
Name: communication_type, dtype: int64

In [13]:
# One-hot encode communication_type; drop_first omits the first category
# level, so that level is represented by all dummy columns being zero.
c = pd.get_dummies(
    campaign,
    columns=['communication_type'],
    drop_first=True,
)

### Merging the campaign data with train and test

In [14]:
# Attach the campaign-level features to every train/test row via a left join
# on campaign_id.
campaign_by_id = c.set_index('campaign_id')
mails_train = t.join(campaign_by_id, on='campaign_id', how='left')
mails_test = v.join(campaign_by_id, on='campaign_id', how='left')

In [15]:
# Preview the merged training frame.
mails_train.head()

Unnamed: 0,id,user_id,campaign_id,send_date,is_open,is_click,day,frequency_encoding,total_links,no_of_internal_links,...,subject,email_url,subject_len,body_len,communication_type_Corporate,communication_type_Hackathon,communication_type_Newsletter,communication_type_Others,communication_type_Upcoming Events,communication_type_Webinar
0,42_14051,14051,42,2017-09-01 19:55:00,0,0,1,16,88,79,...,[September] Exciting days ahead with DataHack ...,http://r.newsletters.analyticsvidhya.com/7v3rd...,88,1383,0,0,1,0,0,0
1,52_134438,134438,52,2017-11-02 12:53:00,0,0,2,11,67,62,...,[Newsletter] Stage for DataHack Summit 2017 is...,http://r.newsletters.analyticsvidhya.com/7vtb2...,111,971,0,0,1,0,0,0
2,33_181789,181789,33,2017-07-24 15:15:00,0,0,24,13,7,3,...,"[Delhi NCR] Fireside Chat with DJ Patil, Forme...",http://r.newsletters.analyticsvidhya.com/7uvlg...,71,739,0,0,0,1,0,0
3,44_231448,231448,44,2017-09-05 11:36:00,0,0,5,11,60,56,...,"[September] Data Science Hackathons, Meetups a...",http://r.newsletters.analyticsvidhya.com/7veam...,73,273,0,0,0,0,1,0
4,29_185580,185580,29,2017-07-01 18:01:00,0,0,1,7,67,61,...,Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...,55,1499,0,0,1,0,0,0


In [16]:
# Open / not-open counts broken down by day of month.
pd.crosstab(mails_train['is_open'], mails_train['day'])

day,1,2,3,5,6,7,9,11,14,18,19,21,24,26,28,29,30
is_open,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,259238,156652,63760,104475,120064,6635,2258,3336,7182,6739,59289,10453,27668,12033,77155,3418,46
1,29078,9277,9655,12011,5077,924,1229,1986,2649,1965,10335,3777,7727,2173,4203,703,21


## Modeling

In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.metrics import roc_auc_score,accuracy_score,precision_score,recall_score,roc_curve,auc
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression

### is_open model

In [None]:
# List the columns of the merged training frame.
mails_train.columns

In [None]:
# Feature columns for the is_open model (body_len is not included here).
feats = [
    'day',
    'subject_len',
    # one-hot communication type dummies
    'communication_type_Corporate',
    'communication_type_Hackathon',
    'communication_type_Newsletter',
    'communication_type_Others',
    'communication_type_Upcoming Events',
    'communication_type_Webinar',
    'frequency_encoding',
    # campaign layout counts
    'total_links',
    'no_of_internal_links',
    'no_of_images',
    'no_of_sections',
]

In [None]:
# Design matrix and target for the is_open model.
X, Y = mails_train[feats], mails_train['is_open']

In [None]:
# 70/30 hold-out split of the is_open data with a fixed seed.
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=166,
)

In [None]:
# Random forest with class weights balanced against the label frequencies.
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=1,
)
rf.fit(trainX, trainY)
model = rf  # fit() returns the estimator itself

In [None]:
# Training-set metrics for the estimator currently bound to `model`.
# AUC is a ranking metric, so it is computed from the predicted probability of
# class 1; feeding it hard 0/1 labels collapses the ROC curve to one point.
# Accuracy, precision and recall still use the hard labels.
preds = model.predict(trainX)
scores = model.predict_proba(trainX)[:, 1]
auc_m = roc_auc_score(trainY, scores)
accuracy = accuracy_score(trainY, preds)
precision = precision_score(trainY, preds)
recall = recall_score(trainY, preds)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

In [None]:
# Hold-out metrics for the estimator currently bound to `model`.
# AUC is computed from the predicted probability of class 1 rather than from
# hard 0/1 labels; accuracy, precision and recall use the hard labels.
preds = model.predict(testX)
scores = model.predict_proba(testX)[:, 1]
auc_m = roc_auc_score(testY, scores)
accuracy = accuracy_score(testY, preds)
precision = precision_score(testY, preds)
recall = recall_score(testY, preds)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

In [None]:
# Pair every feature name with its importance from the fitted model and rank
# the pairs from most to least important.
zipped = sorted(
    zip(trainX.columns, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
fi = zipped

In [None]:
# Display the ranked (feature, importance) pairs.
fi

In [None]:
# 5-fold stratified cross-validation of the balanced random forest on is_open.
# The fold indices are row positions, so rows are selected positionally with
# .iloc instead of a label-based indexer.
# AUC is computed from class-1 probabilities; the other metrics use hard labels.
# Note: the loop rebinds trainX/testX/trainY/testY and model, so any cell run
# afterwards sees the values of the last fold.
skf = StratifiedKFold(Y,n_folds=5,shuffle=True)
for train_index, test_index in skf:
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainY, testY = Y.iloc[train_index], Y.iloc[test_index]
    rf = RandomForestClassifier(class_weight='balanced',random_state = None,n_estimators=100)
    model = rf.fit(trainX, trainY)
    preds = model.predict(testX)
    scores = model.predict_proba(testX)[:, 1]
    auc_m = roc_auc_score(testY, scores)
    accuracy = accuracy_score(testY, preds)
    precision = precision_score(testY, preds)
    recall = recall_score(testY, preds)
    print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

In [None]:
# Gradient boosting with per-row weights: rows whose label is not 0 count 9x.
# trainX/trainY are whatever the most recently executed cell bound them to.
Sample_weight = [9 if i != 0 else 1 for i in trainY]
gbm = GradientBoostingClassifier(random_state=1)
gbm.fit(trainX, trainY, sample_weight=Sample_weight)
model = gbm  # fit() returns the estimator itself

In [None]:
# Evaluation of the estimator currently bound to `model` on testX/testY.
# AUC is computed from the predicted probability of class 1 rather than from
# hard 0/1 labels; accuracy, precision and recall use the hard labels.
preds = model.predict(testX)
scores = model.predict_proba(testX)[:, 1]
auc_m = roc_auc_score(testY, scores)
accuracy = accuracy_score(testY, preds)
precision = precision_score(testY, preds)
recall = recall_score(testY, preds)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

In [None]:
# 3-fold stratified cross-validation of the weighted gradient boosting model
# on is_open. The fold indices are row positions, so rows are selected
# positionally with .iloc instead of a label-based indexer.
# AUC is computed from class-1 probabilities; the other metrics use hard labels.
# Note: the loop rebinds trainX/testX/trainY/testY and model on every fold.
skf = StratifiedKFold(Y,n_folds=3,shuffle=True)
for train_index, test_index in skf:
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainY, testY = Y.iloc[train_index], Y.iloc[test_index]
    # Rows whose label is not 0 are weighted 9x.
    Sample_weight = [1 if i==0 else 9 for i in trainY]
    gbm = GradientBoostingClassifier(random_state =1)
    model = gbm.fit(trainX, trainY,sample_weight=Sample_weight)
    preds = model.predict(testX)
    scores = model.predict_proba(testX)[:, 1]
    auc_m = roc_auc_score(testY, scores)
    accuracy = accuracy_score(testY, preds)
    precision = precision_score(testY, preds)
    recall = recall_score(testY, preds)
    print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

### is_click model

In [18]:
# NOTE(review): despite the name, this is an alias of the full training frame;
# no filter on is_open is applied and no copy is made.
mails_opened = mails_train

In [19]:
# Class balance of the click target.
mails_opened.is_click.value_counts()

0    1010409
1      12782
Name: is_click, dtype: int64

In [21]:
# Feature columns for the is_click model (includes body_len).
feats1 = [
    'day',
    'subject_len',
    # one-hot communication type dummies
    'communication_type_Corporate',
    'communication_type_Hackathon',
    'communication_type_Newsletter',
    'communication_type_Others',
    'communication_type_Upcoming Events',
    'communication_type_Webinar',
    'frequency_encoding',
    # campaign content / layout
    'body_len',
    'total_links',
    'no_of_internal_links',
    'no_of_images',
    'no_of_sections',
]

In [22]:
# Design matrix and target for the is_click model.
X1, Y1 = mails_opened[feats1], mails_opened['is_click']

In [23]:
# 70/30 hold-out split of the is_click data with a fixed seed; this rebinds
# the trainX/testX/trainY/testY names used by the cells that follow.
trainX, testX, trainY, testY = train_test_split(
    X1,
    Y1,
    test_size=0.3,
    random_state=166,
)

In [31]:
# Random forest with explicit class weights (class 0 -> 2, class 1 -> 27).
rf = RandomForestClassifier(
    class_weight={0: 2, 1: 27},
    random_state=1,
)
rf.fit(trainX, trainY)
model = rf  # fit() returns the estimator itself

In [32]:
# Training-set metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(trainX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(trainY, preds),
    accuracy_score(trainY, preds),
    precision_score(trainY, preds),
    recall_score(trainY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.515989304989 0.983996827848 0.10193099082 0.0359897172237


In [33]:
# Hold-out metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(testX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(testY, preds),
    accuracy_score(testY, preds),
    precision_score(testY, preds),
    recall_score(testY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.51210315737 0.983698095505 0.0785868781543 0.0284224250326


In [None]:
# 5-fold stratified cross-validation of the balanced random forest on the
# is_click target. The folds are built from X1/Y1 (the click features and
# labels), not from the is_open X/Y.
# The fold indices are row positions, so rows are selected positionally with
# .iloc instead of a label-based indexer.
# AUC is computed from class-1 probabilities; the other metrics use hard labels.
# Note: the loop rebinds trainX/testX/trainY/testY and model, so any cell run
# afterwards sees the values of the last fold.
skf = StratifiedKFold(Y1,n_folds=5,shuffle=True)
for train_index, test_index in skf:
    trainX, testX = X1.iloc[train_index], X1.iloc[test_index]
    trainY, testY = Y1.iloc[train_index], Y1.iloc[test_index]
    rf = RandomForestClassifier(class_weight='balanced',random_state = None,n_estimators=100)
    model = rf.fit(trainX, trainY)
    preds = model.predict(testX)
    scores = model.predict_proba(testX)[:, 1]
    auc_m = roc_auc_score(testY, scores)
    accuracy = accuracy_score(testY, preds)
    precision = precision_score(testY, preds)
    recall = recall_score(testY, preds)
    print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

In [37]:
# Gradient boosting with per-row weights: rows whose label is not 0 count 70x.
# trainX/trainY are whatever the most recently executed cell bound them to.
Sample_weight = [70 if i != 0 else 1 for i in trainY]
gbm = GradientBoostingClassifier(random_state=1)
gbm.fit(trainX, trainY, sample_weight=Sample_weight)
model = gbm  # fit() returns the estimator itself

In [38]:
# Training-set metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(trainX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(trainY, preds),
    accuracy_score(trainY, preds),
    precision_score(trainY, preds),
    recall_score(trainY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.594269200649 0.787041367823 0.0235470576133 0.396557505309


In [39]:
# Hold-out metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(testX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(testY, preds),
    accuracy_score(testY, preds),
    precision_score(testY, preds),
    recall_score(testY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.591960959768 0.786752585044 0.0232705667559 0.392177314211


In [44]:
# Logistic regression with per-row weights: rows whose label is not 0 count 50x.
Sample_weight = [50 if i != 0 else 1 for i in trainY]
lr = LogisticRegression(random_state=1)
lr.fit(trainX, trainY, sample_weight=Sample_weight)
model = lr  # fit() returns the estimator itself

In [45]:
# Training-set metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(trainX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(trainY, preds),
    accuracy_score(trainY, preds),
    precision_score(trainY, preds),
    recall_score(trainY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.55449442756 0.935811390986 0.0365948286651 0.163406728512


In [46]:
# Hold-out metrics for the estimator currently bound to `model`.
# NOTE(review): the first figure is roc_auc_score applied to the hard 0/1
# labels from predict(), not to predict_proba() scores.
preds = model.predict(testX)
auc_m, accuracy, precision, recall = (
    roc_auc_score(testY, preds),
    accuracy_score(testY, preds),
    precision_score(testY, preds),
    recall_score(testY, preds),
)
print('%s %s %s %s' % (auc_m, accuracy, precision, recall))

0.555184067968 0.935307110419 0.0366664738881 0.165319426336


## Scoring

In [None]:
# Create both target columns on the test frame, initialised to 0
# (targets are assigned left to right: is_open first, then is_click).
mails_test['is_open'] = mails_test['is_click'] = 0

In [None]:
#mails_test['is_open'] = model.predict(mails_test[feats])

In [None]:
# Predict clicks for the test rows from the is_click feature set.
# Uses `model`, the estimator bound by the most recent fit cell (in
# top-to-bottom order, the weighted LogisticRegression trained on feats1);
# no other click estimator name is bound anywhere in this notebook.
# NOTE(review): confirm this is the estimator intended for the submission.
mails_test['is_click'] = model.predict(mails_test[feats1])

In [None]:
#pd.crosstab(mails_test['is_click'],mails_test['is_open'])

In [None]:
#mails_test['is_click'] = mails_test[['is_open','is_click']].apply(lambda x:0 if x[0]==0 else x[1],axis=1)
#df['Value'] = df.apply(lambda row: my_test(row[a], row[c]), axis=1)

In [None]:
#pd.crosstab(mails_test['is_click'],mails_test['is_open'])

In [None]:
# Write the submission file: one row per test id with its predicted is_click.
submission = mails_test[['id', 'is_click']]
submission.to_csv('sub3.csv', index=False)