In [43]:
%matplotlib inline

import os
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import model_selection
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

SEED = 12345

# Read in final data and transform categorical

In [10]:
def data_reader():
    """
    read data into notebook 
    
    """ 
    train_binary = pd.read_csv("train_session_updated.csv") #, index_col = 0
    if 'Unnamed: 0' in train_binary.columns:
        del train_binary['Unnamed: 0']
    drop_feature = ['user_id', 'total_secs_elapsed', 'date_account_created','timestamp_first_active','date_first_booking','country_destination']
    train_binary.drop(drop_feature, axis=1, inplace=True)
    
    return train_binary


In [11]:
train_binary = data_reader()

In [12]:
train_binary.head()

Unnamed: 0,ajax_refresh_subtotal,dashboard,edit,header_userpic,personalize,similar_listings,total_actions,obs_count,unique_action,unique_device,...,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,isNDF
0,2,4,0,2,4,3,15,3.713572,13,2,...,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,False
1,0,2,0,2,0,0,4,6.672033,25,2,...,facebook,25,en,direct,direct,linked,iOS,iPhone,Mobile Safari,False
2,21,3,0,2,26,12,64,6.194405,20,1,...,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,False
3,12,2,18,2,12,17,63,5.181784,38,2,...,basic,0,en,other,other,untracked,Web,Other/Unknown,-unknown-,True
4,1,4,0,1,4,1,11,3.295837,11,2,...,basic,0,en,direct,direct,untracked,Web,iPhone,Mobile Safari,True


In [14]:
train_binary.shape

(57898, 23)

In [15]:
categorical = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
                'affiliate_provider','first_affiliate_tracked','signup_app','first_device_type',
               'first_browser']    

In [16]:
# Convert data type as 'category'
for i in categorical:
    train_binary[i] = train_binary[i].astype('category')

In [17]:
# Create dummy variables
train_binary_dummy = pd.get_dummies(train_binary, columns = categorical)
train_binary_dummy.head()

Unnamed: 0,ajax_refresh_subtotal,dashboard,edit,header_userpic,personalize,similar_listings,total_actions,obs_count,unique_action,unique_device,...,first_browser_RockMelt,first_browser_Safari,first_browser_SeaMonkey,first_browser_Silk,first_browser_SiteKiosk,first_browser_Sogou Explorer,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,2,4,0,2,4,3,15,3.713572,13,2,...,0,1,0,0,0,0,0,0,0,0
1,0,2,0,2,0,0,4,6.672033,25,2,...,0,0,0,0,0,0,0,0,0,0
2,21,3,0,2,26,12,64,6.194405,20,1,...,0,1,0,0,0,0,0,0,0,0
3,12,2,18,2,12,17,63,5.181784,38,2,...,0,0,0,0,0,0,0,0,0,0
4,1,4,0,1,4,1,11,3.295837,11,2,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Split data into response and predictors
y = train_binary_dummy['isNDF']
x = train_binary_dummy.drop('isNDF', axis=1)

In [32]:
x.shape

(57898, 130)

In [45]:
x.columns

Index(['ajax_refresh_subtotal', 'dashboard', 'edit', 'header_userpic',
       'personalize', 'similar_listings', 'total_actions', 'obs_count',
       'unique_action', 'unique_device',
       ...
       'first_browser_RockMelt', 'first_browser_Safari',
       'first_browser_SeaMonkey', 'first_browser_Silk',
       'first_browser_SiteKiosk', 'first_browser_Sogou Explorer',
       'first_browser_TenFourFox', 'first_browser_TheWorld Browser',
       'first_browser_Yandex.Browser', 'first_browser_wOSBrowser'],
      dtype='object', length=130)

# Fit Logistic Model

In [168]:
def fit_logistic(x_train, y_train, fold=5, seed=SEED, penalty='l2', C=1.0):
    """Logistic regression for classification"""
    
    kfold = model_selection.KFold(n_splits=fold, random_state = seed)
    model = LogisticRegression(penalty=penalty, C=C)
    results_auc = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring='roc_auc')
    results_accuracy = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    print("%s-fold cross validation average %s: %.4f" % (str(fold), 'roc_auc', results_auc.mean()))
    print("%s-fold cross validation average %s: %.4f" % (str(fold), 'accuracy', results_accuracy.mean()))
    model.fit(x_train, y_train)
    
    return(model)

In [81]:
logistic_base = fit_logistic(x, y, fold=5, seed=SEED)
logistic_base

5-fold cross validation average roc_auc: 0.7226
5-fold cross validation average accuracy: 0.6749


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
#CV misclass rate
scores = cross_val_score(logistic_base, x, y, cv=5)
avg_missclass = sum(1-i for i in scores)/5
print('5-fold cross validation missclass rate:', round(avg_missclass, 4))

5-fold cross validation missclass rate: 0.3253


# Select Features
Feature selection is a process where you automatically select those features in your data that contribute most to the prediction variable or output in which you are interested.

Having too many irrelevant features in your data can decrease the accuracy of the models. Three benefits of performing feature selection before modeling your data are:

- Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
- Improves Accuracy: Less misleading data means modeling accuracy improves.
- Reduces Training Time: Less data means that algorithms train faster.

Feature selection methods provided by the scikit-learn Python library include Recursive Feature Elimination.

## Recursive Feature Elimination 

The Recursive Feature Elimination (or RFE) works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.



In [164]:
# Recursive Feature Elimination

n_features = 50
rfe = RFE(logistic_base, n_features)
rfe = rfe.fit(x, y)
important_feats = x.iloc[:, rfe.support_].columns

In [165]:
# summarize the selection of the attributes
print("Num Features:", str(n_features))
print()
print("Selected Features: \n", important_feats)
print()
# rank = 1 means most important
print("Feature Ranking: \n")
print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), columns)))

Num Features: 50

Selected Features: 
 Index(['gender_-unknown-', 'signup_method_facebook', 'signup_method_google',
       'signup_flow_12', 'signup_flow_21', 'language_da', 'language_en',
       'language_es', 'language_id', 'language_it', 'language_ja',
       'language_ko', 'language_nl', 'language_pl', 'language_pt',
       'language_ru', 'language_tr', 'affiliate_channel_api',
       'affiliate_channel_content', 'affiliate_provider_craigslist',
       'affiliate_provider_daum', 'affiliate_provider_facebook-open-graph',
       'affiliate_provider_meetup', 'affiliate_provider_padmapper',
       'affiliate_provider_yandex', 'first_affiliate_tracked_local ops',
       'signup_app_Moweb', 'first_device_type_Android Phone',
       'first_device_type_iPhone', 'first_browser_-unknown-',
       'first_browser_AOL Explorer', 'first_browser_Android Browser',
       'first_browser_Apple Mail', 'first_browser_Avant Browser',
       'first_browser_BlackBerry Browser', 'first_browser_Chromium',


In [166]:
# Refit model using important features
keep = important_feats.tolist()
x_important = x[keep]

logistic_feat_select = fit_logistic(x_important, y, fold=5, seed=SEED)
logistic_feat_select

5-fold cross validation average roc_auc: 0.6669
5-fold cross validation average accuracy: 0.6547


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Feature selection using L1 regularization (Lasso)

In [170]:
C = [10, 1, .1, .001] #try various regularization strengths

for c in C:
    logistic_lasso = fit_logistic(x, y, fold=5, seed=SEED, penalty='l1', C=c)
    print(logistic_lasso)
    
#best is c = 1 (default)
logistic_lasso = fit_logistic(x, y, fold=5, seed=SEED, penalty='l1')

5-fold cross validation average roc_auc: 0.7223
5-fold cross validation average accuracy: 0.6746
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
5-fold cross validation average roc_auc: 0.7228
5-fold cross validation average accuracy: 0.6747
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
5-fold cross validation average roc_auc: 0.7227
5-fold cross validation average accuracy: 0.6746
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_

In [87]:
#CV misclass rate
scores = cross_val_score(logistic_lasso, x, y, cv=5)
avg_missclass = sum(1-i for i in scores)/5
print('5-fold cross validation missclass rate:', round(avg_missclass, 4))

5-fold cross validation missclass rate: 0.3253


In [144]:
#Look at coefficients

coefficients = pd.concat([pd.DataFrame(x.columns),pd.DataFrame(np.transpose(logistic_lasso.coef_))], axis = 1)
coefficients.columns = ['var', 'coeff']
coefficients

features = coefficients.loc[coefficients['coeff'] != 0.0]
features.shape #87 features of influence
features.sort_values(by='coeff', ascending=False)

Unnamed: 0,var,coeff
51,affiliate_channel_content,1.297577
12,gender_-unknown-,1.219412
17,signup_method_facebook,1.133045
18,signup_method_google,0.852497
65,affiliate_provider_facebook-open-graph,0.673315
43,language_pl,0.616245
44,language_pt,0.571504
36,language_id,0.555125
97,first_browser_Android Browser,0.499438
83,signup_app_Moweb,0.451059
