In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [55]:
# Read the train and the test data 
#The preprocessing file needs to be run first, it creates the csv file used here
train_users = pd.read_csv('train_users_merge_scale.csv')
#test_users = pd.read_csv('test_users.csv')


# Extracting labels from the train data
train_users_labels = train_users.loc[:,'country_destination']
print (train_users_labels.head(n=5))

# Extracting attributes from the train data
train_users_attrs = train_users.iloc[:,:-1]
print(train_users_attrs.head(n=5))

train_users = train_users_attrs
print(train_users.columns)
train_users = train_users.drop(['Blackberry'], axis=1)
train_users = train_users.drop(['Opera Phone'], axis=1)
labels_df = pd.DataFrame(train_users_labels)

0    0
1    0
2    1
3    2
4    1
Name: country_destination, dtype: int64
   timestamp_first_active    gender       age  signup_method  signup_flow  \
0               -4.380020 -0.927300 -0.163283      -1.596552    -0.427798   
1               -4.357961  1.058047  0.287705      -1.596552    -0.427798   
2               -4.348661 -0.927300  2.317149       0.628333    -0.035009   
3               -4.303076 -0.927300  0.738692      -1.596552    -0.427798   
4               -4.283949 -0.927300  0.625945       0.628333    -0.427798   

   language  affiliate_channel  affiliate_provider  first_affiliate_tracked  \
0 -0.141579          -0.582242           -0.468760                -0.798954   
1 -0.141579           2.556797            0.251719                -0.798954   
2 -0.141579          -0.582242           -0.468760                -0.798954   
3 -0.141579          -0.582242           -0.468760                -0.798954   
4 -0.141579          -0.582242           -0.468760                -

In [56]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):

    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


#def ndcg_score(ground_truth, predictions, k=5):
def ndcg_score(te_labels, predict, k):
   
    lb = LabelBinarizer()
    lb.fit(range(12 + 1))
    T = lb.transform(te_labels)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predict):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        if best == 0:
            best = 0.000000001
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)


In [57]:
def folds_to_split(data,targets,train,test):
    data_tr = pd.DataFrame(data).iloc[train]
    data_te = pd.DataFrame(data).iloc[test]
    labels_tr = pd.DataFrame(targets).iloc[train]
    labels_te = pd.DataFrame(targets).iloc[test]
    return [data_tr, data_te, labels_tr, labels_te]

In [44]:
# Logistic Regression : One Versus Rest
# Use validation set to find which solver to use 

from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model, cross_validation

fold_results_ovr = pd.DataFrame()
test_sets = []
test_set_labels = []
train_sets = []
train_set_labels = []
    
def ten_fold_oneVsRest(data, labels):
    
    for train, test in cross_validation.KFold(len(data), n_folds=10, shuffle=True, random_state=20160202):
        [tr_data, te_data,
         tr_target, te_target] = folds_to_split(data, labels,train,test)
        
        train_sets.append(tr_data)
        train_set_labels.append(tr_target)
        test_sets.append(te_data)
        test_set_labels.append(te_target)
        
    foldnum = 0
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    for x in range(0, 10):
        foldnum+=1
        [bnb_train, bnb_validation, bnb_train_labels, bnb_validation_labels] = cross_validation.train_test_split(
        train_sets[x], 
        train_set_labels[x], 
        test_size=0.11, 
        random_state=20160121)
        
        for sol in solvers:
            oneVsRest = OneVsRestClassifier(linear_model.LogisticRegression(solver=sol))
            #print(tr_target)
            oneVsRest.fit(bnb_train.values, bnb_train_labels[bnb_train_labels.columns.values[0]].values)
            #print(oneVsRest.estimators_)
        
            fold_results_ovr.loc[foldnum, 'Accuracy_' + sol] = oneVsRest.score(bnb_validation, bnb_validation_labels)
        
            predictions = oneVsRest.predict_proba(bnb_validation)
            score = ndcg_score(bnb_validation_labels.as_matrix(), predictions, 5)
            fold_results_ovr.loc[foldnum, 'ndcg_' + sol]  = score
            #print(score)
                
    #Now let's look at the results:
    print ("------OneVsRestClassifier accuracy and ndcg values------")
    print (fold_results_ovr)
    print ("------OneVsRestClassifier mean accuracy, ndcg values------")
    print(fold_results_ovr.mean())
    
ten_fold_oneVsRest(train_users, labels_df)

------OneVsRestClassifier accuracy and ndcg values------
    Accuracy_newton-cg  ndcg_newton-cg  Accuracy_lbfgs  ndcg_lbfgs  \
1             0.607278        0.815000        0.607278    0.815000   
2             0.603587        0.813433        0.603587    0.813436   
3             0.604344        0.812489        0.604344    0.812489   
4             0.602073        0.812163        0.602073    0.812145   
5             0.604581        0.813422        0.604581    0.813422   
6             0.613099        0.817676        0.613099    0.817676   
7             0.610212        0.816933        0.610212    0.816914   
8             0.610117        0.816962        0.610117    0.816962   
9             0.608272        0.815800        0.608272    0.815800   
10            0.608461        0.816193        0.608461    0.816191   

    Accuracy_liblinear  ndcg_liblinear  
1             0.607278        0.815000  
2             0.603587        0.813433  
3             0.604344        0.812489  
4       

In [46]:
fold_results_ovr = pd.DataFrame()
foldnum = 0
for x in range(0, 10):
    foldnum+=1
    oneVsRest = OneVsRestClassifier(linear_model.LogisticRegression(solver='lbfgs', penalty='l2'))
    oneVsRest.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_ovr.loc[foldnum, 'Accuracy'] = oneVsOne.score(test_sets[x].values,
                                                          test_set_labels[x].values.ravel())
    
    predictions = oneVsRest.predict_proba(test_sets[x].values)
    score = ndcg_score(test_set_labels[x].as_matrix(), predictions, 5)
    fold_results_ovr.loc[foldnum, 'ndcg']  = score
    
print ("------OneVsRestClassifier accuracy and ndcg values------")
print (fold_results_ovr)
print ("------OneVsRestClassifier mean accuracy, ndcg values------")
print(fold_results_ovr.mean())

------OneVsRestClassifier accuracy and ndcg values------
    Accuracy      ndcg
1   0.608076  0.816181
2   0.609885  0.816683
3   0.604685  0.813442
4   0.606840  0.815432
5   0.606606  0.815835
6   0.604872  0.814452
7   0.607121  0.813901
8   0.602670  0.813187
9   0.610588  0.818077
10  0.611525  0.816938
------OneVsRestClassifier mean accuracy, ndcg values------
Accuracy    0.607287
ndcg        0.815413
dtype: float64


In [45]:
# liblinear and lbfgs perform  nearly the same...

fold_results_ovr = pd.DataFrame()
foldnum = 0
for x in range(0, 10):
    foldnum+=1
    oneVsRest = OneVsRestClassifier(linear_model.LogisticRegression(solver='liblinear', penalty='l2'))
    oneVsRest.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_ovr.loc[foldnum, 'Accuracy'] = oneVsOne.score(test_sets[x].values,
                                                          test_set_labels[x].values.ravel())
    
    predictions = oneVsRest.predict_proba(test_sets[x].values)
    score = ndcg_score(test_set_labels[x].as_matrix(), predictions, 5)
    fold_results_ovr.loc[foldnum, 'ndcg']  = score
    
print ("------OneVsRestClassifier accuracy and ndcg values------")
print (fold_results_ovr)
print ("------OneVsRestClassifier mean accuracy, ndcg values------")
print(fold_results_ovr.mean())

------OneVsRestClassifier accuracy and ndcg values------
    Accuracy      ndcg
1   0.608076  0.816163
2   0.609885  0.816683
3   0.604685  0.813442
4   0.606840  0.815413
5   0.606606  0.815870
6   0.604872  0.814452
7   0.607121  0.813901
8   0.602670  0.813169
9   0.610588  0.818077
10  0.611525  0.816955
------OneVsRestClassifier mean accuracy, ndcg values------
Accuracy    0.607287
ndcg        0.815412
dtype: float64


In [48]:
# liblinear does better with l1 penalty...

fold_results_ovr = pd.DataFrame()
foldnum = 0
for x in range(0, 10):
    foldnum+=1
    oneVsRest = OneVsRestClassifier(linear_model.LogisticRegression(solver='liblinear', penalty='l1'))
    oneVsRest.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_ovr.loc[foldnum, 'Accuracy'] = oneVsOne.score(test_sets[x].values,
                                                          test_set_labels[x].values.ravel())
    
    predictions = oneVsRest.predict_proba(test_sets[x].values)
    score = ndcg_score(test_set_labels[x].as_matrix(), predictions, 5)
    fold_results_ovr.loc[foldnum, 'ndcg']  = score
    
print ("------OneVsRestClassifier accuracy and ndcg values------")
print (fold_results_ovr)
print ("------OneVsRestClassifier mean accuracy, ndcg values------")
print(fold_results_ovr.mean())

------OneVsRestClassifier accuracy and ndcg values------
    Accuracy      ndcg
1   0.608076  0.816199
2   0.609885  0.816708
3   0.604685  0.813442
4   0.606840  0.815416
5   0.606606  0.815858
6   0.604872  0.814483
7   0.607121  0.813844
8   0.602670  0.813170
9   0.610588  0.818057
10  0.611525  0.816993
------OneVsRestClassifier mean accuracy, ndcg values------
Accuracy    0.607287
ndcg        0.815417
dtype: float64


In [49]:
# Logistic Regression : One Versus One
# Use validation set to find which solver to use 

from sklearn.multiclass import OneVsOneClassifier
fold_results_ovo = pd.DataFrame()
test_sets = []
test_set_labels = []
train_sets = []
train_set_labels = []

def ten_fold_oneVsOne(data, labels):
    foldnum = 0
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    
    for train, test in cross_validation.KFold(len(data), n_folds=10, shuffle=True):
        [tr_data, te_data,
         tr_target, te_target] = folds_to_split(data, labels,train,test)
        
        train_sets.append(tr_data)
        train_set_labels.append(tr_target)
        test_sets.append(te_data)
        test_set_labels.append(te_target)
    
    foldnum = 0
    for x in range(0, 10):
        foldnum+=1
        [bnb_train, bnb_validation, bnb_train_labels, bnb_validation_labels] = cross_validation.train_test_split(
        train_sets[x], 
        train_set_labels[x], 
        test_size=0.11, 
        random_state=20160121)
        for sol in solvers:
            oneVsOne = OneVsOneClassifier(linear_model.LogisticRegression(solver=sol))
            oneVsOne.fit(bnb_train.values, bnb_train_labels[bnb_train_labels.columns.values[0]].values)
        
            columnname = "Accuracy" + "_" + sol
            fold_results_ovo.loc[foldnum, columnname] = oneVsOne.score(bnb_validation, bnb_validation_labels)
    
    #Now let's look at the results:
    print ("-----------OneVsOneClassifier results for accuracy-----------")
    print (fold_results_ovo)
    
    print ("--------------OneVsOneClassifier mean accuracy----------------")
    print(fold_results_ovo.mean())
    
ten_fold_oneVsOne(train_users, labels_df)

-----------OneVsOneClassifier results for accuracy-----------
    Accuracy_newton-cg  Accuracy_lbfgs  Accuracy_liblinear
1             0.604770        0.604770            0.604770
2             0.609218        0.609218            0.609218
3             0.608556        0.608556            0.608556
4             0.605622        0.605622            0.605622
5             0.612152        0.612152            0.612152
6             0.605338        0.605338            0.605338
7             0.602499        0.602499            0.602499
8             0.610969        0.611016            0.610969
9             0.608840        0.608840            0.608840
10            0.605858        0.605858            0.605858
--------------OneVsOneClassifier mean accuracy----------------
Accuracy_newton-cg    0.607382
Accuracy_lbfgs        0.607387
Accuracy_liblinear    0.607382
dtype: float64




In [51]:
# lbfgs does best every time...

fold_results_ovo = pd.DataFrame()
foldnum = 0
for x in range(0, 10):
    
    foldnum+=1
    oneVsOne = OneVsOneClassifier(linear_model.LogisticRegression(solver='lbfgs'))
    oneVsOne.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_ovo.loc[foldnum, 'Accuracy'] = oneVsOne.score(test_sets[x].values,
                                                          test_set_labels[x].values.ravel())
    

print ("-----------OneVsOneClassifier results for accuracy-----------")
print (fold_results_ovo)
print ("--------------OneVsOneClassifier mean accuracy----------------")
print(fold_results_ovo.mean())

-----------OneVsOneClassifier results for accuracy-----------
    Accuracy
1   0.602080
2   0.607309
3   0.608855
4   0.607871
5   0.605247
6   0.600609
7   0.608199
8   0.612087
9   0.609932
10  0.610354
--------------OneVsOneClassifier mean accuracy----------------
Accuracy    0.607254
dtype: float64


In [58]:
# Use ADA boost with logistsic regression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn import linear_model, cross_validation

test_sets = []
test_set_labels = []
train_sets = []
train_set_labels = []
fold_results_ada = pd.DataFrame()
def ten_fold_ada_logistic(data, labels):
    foldnum = 0
    fold_results_ovr_est = pd.DataFrame()
    for train, test in cross_validation.KFold(len(data), n_folds=10, shuffle=True, random_state=20160202):
        
        [tr_data, te_data,
         tr_target, te_target] = folds_to_split(data, labels,train,test)
        train_sets.append(tr_data)
        train_set_labels.append(tr_target)
        test_sets.append(te_data)
        test_set_labels.append(te_target)
    
    solvers = ['newton-cg', 'lbfgs']
    estimators = [100, 200, 250]
    foldnum = 0
    for x in range(0, 10):
        foldnum+=1
        [bnb_train, bnb_validation, bnb_train_labels, bnb_validation_labels] = cross_validation.train_test_split(
            train_sets[x], 
            train_set_labels[x], 
            test_size=0.11, 
            random_state=20160121)
        for sol in solvers:
            for est in estimators:
                
                [tr_data, te_data,
                 tr_target, te_target] = folds_to_split(data, labels,train,test)
                ada = AdaBoostClassifier(learning_rate=0.3, n_estimators=est, 
                                     base_estimator=linear_model.LogisticRegression(solver=sol), 
                                     algorithm='SAMME')  
                #print(tr_target)
                ada.fit(bnb_train.values, bnb_train_labels[bnb_train_labels.columns.values[0]].values)
                #print(oneVsRest.estimators_)
        
                fold_results_ada.loc[foldnum, 'Accuracy_' + sol + '_' + str(est)] = ada.score(bnb_validation, 
                                                                                            bnb_validation_labels)
        
                predictions = ada.predict_proba(bnb_validation)
                score = ndcg_score(bnb_validation_labels.as_matrix(), predictions, 5)
                fold_results_ada.loc[foldnum, 'ndcg_' + sol + '_' + str(est)]  = score
                print(str(score) + ":" + " solver " + sol + ", estimators " + str(est))
            
    
    #Now let's look at the results:
    print ("------Ada accuracy and ndcg values------")
    print (fold_results_ada)
    print ("------Ada mean accuracy and ndcg values------")
    print(fold_results_ada.mean())
    
ten_fold_ada_logistic(train_users, labels_df)

0.806944035724: solver newton-cg, estimators 100
0.806926570731: solver newton-cg, estimators 200
0.806926570731: solver newton-cg, estimators 250
0.806926570731: solver lbfgs, estimators 100
0.806909105737: solver lbfgs, estimators 200
0.806926570731: solver lbfgs, estimators 250
0.8052099633: solver newton-cg, estimators 100
0.8052099633: solver newton-cg, estimators 200
0.8052099633: solver newton-cg, estimators 250
0.8052099633: solver lbfgs, estimators 100
0.805244893287: solver lbfgs, estimators 200
0.805244893287: solver lbfgs, estimators 250
0.803373399797: solver newton-cg, estimators 100
0.80339086479: solver newton-cg, estimators 200
0.80339086479: solver newton-cg, estimators 250
0.80339086479: solver lbfgs, estimators 100
0.803408329784: solver lbfgs, estimators 200
0.803408329784: solver lbfgs, estimators 250
0.804220364724: solver newton-cg, estimators 100
0.804237829718: solver newton-cg, estimators 200
0.804237829718: solver newton-cg, estimators 250
0.804150504749: so

In [27]:
# The best value is for newton-cg, estimators: 250

foldnum = 0
for x in range(0, 10):
    # one fourth of the training data is for validation
    foldnum+=1
    ada = AdaBoostClassifier(learning_rate=0.3, n_estimators=250, 
                                     base_estimator=linear_model.LogisticRegression(solver='newton-cg'), 
                                     algorithm='SAMME') 
    
    ada.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_ada.loc[foldnum, 'Accuracy'] = ada.score(test_sets[x].values,
                                                          test_set_labels[x].values.ravel())
        
    predictions = ada.predict_proba(test_sets[x].values)
    score = ndcg_score(test_set_labels[x].as_matrix(), predictions, 5)
    fold_results_ada.loc[foldnum, 'ndcg']  = score
    

print ("------Ada accuracy and ndcg values------")
print (fold_results_ada)
print ("------Ada mean accuracy and ndcg values------")
print(fold_results_ada.mean())

------Ada accuracy and ndcg values------
    Accuracy      ndcg
1   0.598426  0.808153
2   0.604825  0.809260
3   0.600141  0.805607
4   0.601265  0.807822
5   0.604170  0.807697
6   0.598688  0.806826
7   0.605107  0.805142
8   0.593769  0.804849
9   0.604029  0.810024
10  0.604966  0.809372
------Ada mean accuracy and ndcg values------
Accuracy    0.601539
ndcg        0.807475
dtype: float64


In [68]:
# Perceptron

from sklearn.multiclass import OneVsRestClassifier
fold_results_perc = pd.DataFrame()
test_sets = []
test_set_labels = []
train_sets = []
train_set_labels = []

def ten_fold_oneVsRest_Perc(data, labels):
    foldnum = 0
    iters = [20, 50, 100, 200, 500]
    pen = ['l1', 'l2', 'None']
    
    for train, test in cross_validation.KFold(len(data), n_folds=10, shuffle=True):
        [tr_data, te_data,
         tr_target, te_target] = folds_to_split(data, labels,train,test)
        
        train_sets.append(tr_data)
        train_set_labels.append(tr_target)
        test_sets.append(te_data)
        test_set_labels.append(te_target)
    
    foldnum = 0
    for x in range(0, 10):
        foldnum+=1
        [bnb_train, bnb_validation, bnb_train_labels, bnb_validation_labels] = cross_validation.train_test_split(
        train_sets[x], 
        train_set_labels[x], 
        test_size=0.11, 
        random_state=20160121)
        for i in iters:
            for p in pen: 
                oneVsRest = OneVsRestClassifier(linear_model.Perceptron(penalty=p, n_iter=i))
                oneVsRest.fit(bnb_train.values, bnb_train_labels[bnb_train_labels.columns.values[0]].values)
        
                columnname = "Accuracy_pen" + p + "_numIter" + str(i)
                score = oneVsRest.score(bnb_validation, bnb_validation_labels)
                fold_results_perc.loc[foldnum, columnname] = score
                print(columnname + ": " + str(score))
    
    #Now let's look at the results:
    print ("-----------OneVsRestClassifier Perceptron results for accuracy-----------")
    print (fold_results_perc)
    
    print ("--------------OneVsRestClassifier Perceptron mean accuracy----------------")
    print(fold_results_perc.mean())
    
ten_fold_oneVsRest_Perc(train_users, labels_df)

Accuracy_penl1_numIter20: 0.409331819042
Accuracy_penl2_numIter20: 0.389977285633
Accuracy_penNone_numIter20: 0.428165814878
Accuracy_penl1_numIter50: 0.307117168276
Accuracy_penl2_numIter50: 0.453340904789
Accuracy_penNone_numIter50: 0.396886238879
Accuracy_penl1_numIter100: 0.529765284876
Accuracy_penl2_numIter100: 0.523566155593
Accuracy_penNone_numIter100: 0.515710770396
Accuracy_penl1_numIter200: 0.451779292069
Accuracy_penl2_numIter200: 0.442551580541
Accuracy_penNone_numIter200: 0.437488169601
Accuracy_penl1_numIter500: 0.599375354912
Accuracy_penl2_numIter500: 0.536816202915
Accuracy_penNone_numIter500: 0.557259133068
Accuracy_penl1_numIter20: 0.526310808253
Accuracy_penl2_numIter20: 0.558820745788
Accuracy_penNone_numIter20: 0.53653227333
Accuracy_penl1_numIter50: 0.474399015711
Accuracy_penl2_numIter50: 0.502082150293
Accuracy_penNone_numIter50: 0.485377626349
Accuracy_penl1_numIter100: 0.561612720045
Accuracy_penl2_numIter100: 0.523660798789
Accuracy_penNone_numIter100: 0.51

In [69]:
# The best value is for penalty = None, iterations: 200 : 0.500142

fold_results_perc = pd.DataFrame()
foldnum = 0
for x in range(0, 10):
    # one fourth of the training data is for validation
    foldnum+=1
    oneVsRest = OneVsRestClassifier(linear_model.Perceptron(penalty='None', n_iter=200))
    
    oneVsRest.fit(train_sets[x].values, train_set_labels[x].values.ravel())
    
    fold_results_perc.loc[foldnum, 'Accuracy'] = oneVsRest.score(test_sets[x].values,
                                                                 test_set_labels[x].values.ravel())
    

print ("------Perceptron accuracy values------")
print (fold_results_perc)
print ("------Perceptron mean accuracy values------")
print(fold_results_perc.mean())

------Perceptron accuracy values------
    Accuracy
1   0.498079
2   0.497072
3   0.511361
4   0.510658
5   0.499930
6   0.487187
7   0.480159
8   0.432748
9   0.483439
10  0.316936
------Perceptron mean accuracy values------
Accuracy    0.471757
dtype: float64


In [None]:
# Not completing, takes forever to run

from sklearn import svm
    
def ten_fold_svm(data, labels):
    foldnum = 0
    fold_results_ovr_est = pd.DataFrame()
    fold_results_ada = pd.DataFrame()
    for train, test in cross_validation.KFold(len(data), n_folds=10, shuffle=True, random_state=20160202):
        foldnum+=1
        [tr_data, te_data,
         tr_target, te_target] = folds_to_split(data, labels,train,test)
        vec_mach = svm.SVC(kernel='rbf', random_state=20160202)
        #print(tr_target)
        vec_mach.fit(tr_data.values, tr_target[tr_target.columns.values[0]].values)
        
        fold_results_ada.loc[foldnum, 'Accuracy'] = vec_mach.score(te_data, te_target)
        
        predictions = vec_mach.predict_proba(te_data)
        score = ndcg_score(te_target.as_matrix(), predictions, 5)
        fold_results_ada.loc[foldnum, 'ndcg']  = score
        print(score)
            
    
    #Now let's look at the results:
    print ("------svm accuracy values------")
    print (fold_results_ovr)
    print ("------svm mean accuracy values------")
    print(fold_results_ovr.mean())
    
ten_fold_svm(train_users, labels_df)