## 3.6. Nearest Centroid

In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [14]:
#imports used by the helper functions defined in the cells below
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [13]:
#Metrics on Train
#Cross-validation should be applied only to the training data
def crossvalid(data_train, target_train, est, nsplit):
    """Return the mean stratified k-fold cross-validated accuracy of ``est``.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training features; converted to a NumPy array through ``.values``.
    target_train : pandas.Series or single-column pandas.DataFrame
        Training labels; flattened to a 1-D array with one entry per row of
        ``data_train``.
    est : estimator object
        Passed through to ``cross_val_score`` as ``estimator``.
    nsplit : int
        Number of stratified folds.

    Returns
    -------
    The mean of the per-fold accuracy scores.
    """
    features = data_train.values
    n_rows = data_train.shape[0]
    labels = np.reshape(target_train.values, (n_rows,))

    folds = StratifiedKFold(n_splits=nsplit, shuffle=True, random_state=42)
    # Hand the splitter itself to cross_val_score. The earlier hard-coded cv=5
    # silently ignored both nsplit and the shuffled, seeded folds built above.
    fold_accuracy = cross_val_score(estimator=est, X=features, y=labels, cv=folds, scoring='accuracy')
    return fold_accuracy.mean()

In [12]:
#Metrics on Test
def confusion_matrix_report(y_true, y_pred):
    """Render the confusion matrix of ``y_true`` vs ``y_pred`` as an aligned text table.

    The table has a "Prediction" banner, a header row with one column per
    label, and one row per label holding that row of the matrix returned by
    ``confusion_matrix``. Labels come from ``unique_labels(y_true, y_pred)``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    str
        The table, one line per row, terminated by a newline.
    """
    cm = confusion_matrix(y_true, y_pred)
    labels = unique_labels(y_true, y_pred)

    # Size every column to the widest label OR count (minimum 5). The previous
    # fixed width overflowed on six-digit counts and, with no separator after
    # the row label, printed things like "0213726".
    cell_texts = [str(label) for label in labels] + [str(count) for count in cm.ravel()]
    column_width = max([len(text) for text in cell_texts] + [5])

    # Width of the value area: the columns plus the single spaces between them.
    table_width = column_width * len(labels) + max(len(labels) - 1, 0)
    indent = " " * column_width + " "  # blank row-label cell plus separator

    lines = [
        indent + "{:_^{}}".format("Prediction", table_width),
        indent + " ".join("{:>{}}".format(str(label), column_width) for label in labels),
    ]
    for i, row_label in enumerate(labels):
        counts = " ".join("{:{}d}".format(int(cm[i, j]), column_width) for j in range(len(labels)))
        lines.append("{:>{}}".format(str(row_label), column_width) + " " + counts)
    return "\n".join(lines) + "\n"

def loggloss(target_test, model, data_test):
    """Return ``log_loss`` of the model's predicted probabilities on the test data.

    ``model`` must expose ``predict_proba``; its output for ``data_test`` is
    scored against ``target_test``.
    """
    return log_loss(target_test, model.predict_proba(data_test))
    
def AUC(target_test, model, data_test):
    """Return ``roc_auc_score`` for the model on the test data.

    The score passed to ``roc_auc_score`` is the second column (index 1) of
    ``model.predict_proba(data_test)`` -- presumably the churn class; confirm
    against ``model.classes_``.
    """
    class_probabilities = model.predict_proba(data_test)
    return roc_auc_score(target_test, class_probabilities[:, 1])
    
def analytics(target_test, model, data_test):
    """Print test-set evaluation metrics for a fitted classifier.

    Always prints the confusion matrix, accuracy score and classification
    report. Log loss and AUC are printed only when ``model`` exposes
    ``predict_proba``.

    Parameters
    ----------
    target_test : array-like
        True labels of the test data.
    model : fitted classifier
        Must provide ``predict``; ``predict_proba`` is optional.
    data_test : array-like or DataFrame
        Test features passed to the model.
    """
    y_pred = model.predict(data_test)
    print("Confusion Matrix:")
    print(confusion_matrix_report(target_test, y_pred))
    print("Accuracy Score:")
    print(accuracy_score(target_test, y_pred))
    print("Classification Report:")
    print(classification_report(target_test, y_pred))
    # Decide by capability, not by comparing against the notebook-global
    # `nearest_centroid`: that name does not exist until a later cell runs
    # (NameError), and the comparison only ever excluded one specific instance.
    if hasattr(model, "predict_proba"):
        print("Log Loss:")
        print(loggloss(target_test, model, data_test))
        print("AUC Score:")
        print(AUC(target_test, model, data_test))

In [5]:
# Load the train and test splits (train first, then test) from CSVs in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('final_train_data_StdSc.csv', 'final_test_data_StdSc.csv')
)

In [6]:
# Labels for training.
train_target = train_data['is_churn']
# Feature frame: everything except msno, the is_churn target and the 'Unnamed: 0' column.
train_data_all = train_data.drop(labels=['msno', 'is_churn', 'Unnamed: 0'], axis=1)

In [7]:
# Split the test frame into labels and features.
# Guarded because `test_data` is overwritten here: on a second run of this cell the
# columns are already gone and the unguarded version raised KeyError.
if 'is_churn' in test_data.columns:
    test_target=test_data['is_churn']
    test_data=test_data.drop(['msno','is_churn','Unnamed: 0'],axis=1)#features only: no msno, is_churn or 'Unnamed: 0'

### 3.6.1. Nearest Centroid-All Selected Variables

In [10]:
# Model 1 training inputs: the full selected-variable list (35 columns).
data_indepen_m1 = train_data_all[[
    'number_of_days_201702_listened', 'is_auto_renew', 'total_cancel', 'active_days',
    'avg_actual_amount_paid', 'most_fq_payment_method_id', 'num_75_mean',
    'number_of_days_listened', 'num_25_201702_sum', 'num_50_201702_sum',
    'num_100_201702_sum', 'num_25_201702_mean', 'num_50_201702_mean',
    'num_totalsec_lasttwo_mean', 'total_churn', 'registered_via_3',
    'registered_via_4', 'registered_via_7', 'bd', 'num_totalsec_sum',
    'num_100_med', 'num_25_max', 'num_unq_max', 'num_totalsec_max',
    'num_totalsec_min', 'num_985_201702_sum', 'num_75_201702_mean',
    'num_unq_201702_mean', 'num_unq_201701_sum', 'num_25_201612_mean',
    'num_100_201612_mean', 'num_75_lasttwo_mean', 'num_50_lastthree_mean',
    'proportion_songs_above_50', 'registered_via_9',
]]

In [11]:
# Model 1 test inputs: must list the same columns, in the same order, as the training selection.
test_data_m1 = test_data[[
    'number_of_days_201702_listened', 'is_auto_renew', 'total_cancel', 'active_days',
    'avg_actual_amount_paid', 'most_fq_payment_method_id', 'num_75_mean',
    'number_of_days_listened', 'num_25_201702_sum', 'num_50_201702_sum',
    'num_100_201702_sum', 'num_25_201702_mean', 'num_50_201702_mean',
    'num_totalsec_lasttwo_mean', 'total_churn', 'registered_via_3',
    'registered_via_4', 'registered_via_7', 'bd', 'num_totalsec_sum',
    'num_100_med', 'num_25_max', 'num_unq_max', 'num_totalsec_max',
    'num_totalsec_min', 'num_985_201702_sum', 'num_75_201702_mean',
    'num_unq_201702_mean', 'num_unq_201701_sum', 'num_25_201612_mean',
    'num_100_201612_mean', 'num_75_lasttwo_mean', 'num_50_lastthree_mean',
    'proportion_songs_above_50', 'registered_via_9',
]]

In [12]:
# Fit on the model 1 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m1, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m1)

In [13]:
# Print test-set metrics for the model 1 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m1)

Confusion Matrix:
      Prediction
         0     1
    0213726 31824
    1 3451 13842

Accuracy Score:
0.865794409591
Classification Report:
             precision    recall  f1-score   support

          0       0.98      0.87      0.92    245550
          1       0.30      0.80      0.44     17293

avg / total       0.94      0.87      0.89    262843



### 3.6.2. Nearest Centroid 4 Ranked Features

In [61]:
# Columns used by model 2; the same list selects both the train and test frames.
cols = [
    'number_of_days_201702_listened',
    'is_auto_renew',
    'total_cancel',
    'active_days',
    'avg_actual_amount_paid',
]
test_data_m2 = test_data[cols]
data_indepen_m2 = train_data_all[cols]

In [23]:
# Fit on the model 2 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m2, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m2)

In [25]:
# Print test-set metrics for the model 2 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m2)

Confusion Matrix:
      Prediction
         0     1
    0218822 26728
    1 2435 14858

Accuracy Score:
0.889047834639
Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.89      0.94    245550
          1       0.36      0.86      0.50     17293

avg / total       0.95      0.89      0.91    262843



### 3.6.3. Nearest Centroid 2,3 and 4th Ranked Features

In [63]:
# Columns used by model 4 (18 features); the same list selects both frames.
cols_4 = [
    'number_of_days_201702_listened', 'is_auto_renew', 'total_cancel',
    'active_days', 'avg_actual_amount_paid', 'most_fq_payment_method_id',
    'num_75_mean', 'number_of_days_listened', 'num_25_201702_sum',
    'num_50_201702_sum', 'num_100_201702_sum', 'num_25_201702_mean',
    'num_50_201702_mean', 'num_totalsec_lasttwo_mean', 'total_churn',
    'registered_via_3', 'registered_via_4', 'registered_via_7',
]
test_data_m4 = test_data[cols_4]
data_indepen_m4 = train_data_all[cols_4]

In [31]:
# Fit on the model 4 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m4, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m4)

In [32]:
# Print test-set metrics for the model 4 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m4)

Confusion Matrix:
      Prediction
         0     1
    0217018 28532
    1 3446 13847

Accuracy Score:
0.878338019274
Classification Report:
             precision    recall  f1-score   support

          0       0.98      0.88      0.93    245550
          1       0.33      0.80      0.46     17293

avg / total       0.94      0.88      0.90    262843



### 3.6.4. Nearest Centroid F Classification

In [16]:
# Columns used by model 5 (section heading: F Classification); same list for train and test.
cols_5 = [
    'num_25_201702_sum', 'num_50_201702_sum', 'num_75_201702_sum',
    'num_985_201702_sum', 'num_100_201702_sum', 'num_unq_201702_sum',
    'num_totalsec_201702_sum', 'number_of_days_201702_listened',
    'number_of_days_lasttwo_listened', 'number_of_days_lastthree_listened',
    'total_churn', 'is_auto_renew', 'total_cancel', 'active_days',
    'avg_plan_list_price', 'avg_actual_amount_paid', 'gender_2',
    'registered_via_3', 'registered_via_4', 'registered_via_7',
]
test_data_m5 = test_data[cols_5]
data_indepen_m5 = train_data_all[cols_5]

In [20]:
# Fit on the model 5 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m5, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m5)

In [21]:
# Print test-set metrics for the model 5 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m5)

Confusion Matrix:
      Prediction
         0     1
    0216445 29105
    1 3702 13591

Accuracy Score:
0.875184045229
Classification Report:
             precision    recall  f1-score   support

          0       0.98      0.88      0.93    245550
          1       0.32      0.79      0.45     17293

avg / total       0.94      0.88      0.90    262843



### 3.6.5. Nearest Centroid PCA

In [19]:
# Columns used by model 6 (section heading: PCA); same list for train and test.
cols_6 = [
    'num_100_mean', 'num_100_lastthree_mean', 'num_totalsec_mean',
    'num_totalsec_lastthree_mean', 'num_100_med', 'num_100_lasttwo_mean',
    'num_totalsec_med', 'num_totalsec_lasttwo_mean', 'num_75_lastthree_mean',
    'num_50_lastthree_mean', 'num_100_lastthree_sum', 'num_100_sum',
    'num_totalsec_lastthree_sum', 'num_75_mean', 'num_totalsec_sum',
    'num_above_50_sum', 'num_100_201701_mean', 'num_100_201612_mean',
    'num_50_mean', 'num_75_lasttwo_mean',
]
test_data_m6 = test_data[cols_6]
data_indepen_m6 = train_data_all[cols_6]

In [20]:
# Fit on the model 6 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m6, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m6)

In [21]:
# Print test-set metrics for the model 6 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m6)

Confusion Matrix:
      Prediction
         0     1
    094121 151429
    1 5453 11840

Accuracy Score:
0.403134190372
Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.38      0.55    245550
          1       0.07      0.68      0.13     17293

avg / total       0.89      0.40      0.52    262843



### 3.6.6. Nearest Centroid Mutual Info

In [22]:
# Columns used by model 7 (section heading: Mutual Info); same list for train and test.
cols_7 = [
    'num_50_201702_sum', 'num_75_201702_sum', 'num_100_201702_sum',
    'num_unq_201702_sum', 'num_totalsec_201702_sum', 'num_25_201702_mean',
    'num_50_201702_mean', 'num_75_201702_mean', 'num_100_201702_mean',
    'num_unq_201702_mean', 'number_of_days_201702_listened',
    'number_of_days_lasttwo_listened', 'most_fq_payment_method_id',
    'total_churn', 'is_auto_renew', 'total_cancel', 'active_days',
    'avg_plan_list_price', 'avg_actual_amount_paid', 'registered_via_7',
]
test_data_m7 = test_data[cols_7]
data_indepen_m7 = train_data_all[cols_7]

In [23]:
# Fit on the model 7 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m7, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m7)

In [24]:
# Print test-set metrics for the model 7 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m7)

Confusion Matrix:
      Prediction
         0     1
    0217186 28364
    1 3457 13836

Accuracy Score:
0.878935334021
Classification Report:
             precision    recall  f1-score   support

          0       0.98      0.88      0.93    245550
          1       0.33      0.80      0.47     17293

avg / total       0.94      0.88      0.90    262843



### 3.6.7. Nearest Centroid RFE

In [25]:
# Columns used by model 8 (section heading: RFE); same list for train and test.
cols_8 = [
    'number_of_days_listened', 'num_totalsec_201702_sum',
    'num_totalsec_201702_mean', 'number_of_days_201702_listened',
    'num_unq_201701_sum', 'number_of_days_201701_listened',
    'num_unq_lasttwo_sum', 'num_100_lasttwo_mean', 'num_totalsec_lasttwo_mean',
    'number_of_days_lasttwo_listened', 'number_of_days_lastthree_listened',
    'most_fq_payment_method_id', 'is_auto_renew', 'total_cancel', 'active_days',
    'avg_plan_list_price', 'avg_actual_amount_paid', 'registered_via_3',
    'registered_via_4', 'registered_via_9',
]
test_data_m8 = test_data[cols_8]
data_indepen_m8 = train_data_all[cols_8]

In [26]:
# Fit on the model 8 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m8, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m8)

In [27]:
# Print test-set metrics for the model 8 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m8)

Confusion Matrix:
      Prediction
         0     1
    0217312 28238
    1 3273 14020

Accuracy Score:
0.880114745304
Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.89      0.93    245550
          1       0.33      0.81      0.47     17293

avg / total       0.94      0.88      0.90    262843



### 3.6.8. Nearest Centroid with Random Forest

In [28]:
# Columns used by model 9 (section heading: Random Forest); same list for train and test.
cols_9 = [
    'active_days', 'is_auto_renew', 'avg_actual_amount_paid', 'total_cancel',
    'number_of_days_201702_listened', 'avg_plan_list_price',
    'most_fq_payment_method_id', 'num_totalsec_min', 'number_of_days_listened',
    'bd', 'num_25_max', 'num_25_201702_sum', 'num_unq_max',
    'num_25_201612_mean', 'num_totalsec_max', 'proportion_songs_above_50',
    'num_75_mean', 'num_unq_201702_sum', 'num_25_201702_mean',
    'num_50_201702_mean',
]
test_data_m9 = test_data[cols_9]
data_indepen_m9 = train_data_all[cols_9]

In [29]:
# Fit on the model 9 training columns; fit() returns the estimator, so the call is chained.
nearest_centroid = NearestCentroid().fit(data_indepen_m9, train_target)
# analytics() calls predict() itself; `predictions` is not read again in the visible cells.
predictions = nearest_centroid.predict(test_data_m9)

In [30]:
# Print test-set metrics for the model 9 classifier fitted in the previous cell.
analytics(target_test=test_target, model=nearest_centroid, data_test=test_data_m9)

Confusion Matrix:
      Prediction
         0     1
    0220105 25445
    1 3187 14106

Accuracy Score:
0.891068052031
Classification Report:
             precision    recall  f1-score   support

          0       0.99      0.90      0.94    245550
          1       0.36      0.82      0.50     17293

avg / total       0.94      0.89      0.91    262843

