In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, plot_roc_curve, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [2]:
churn_df = pd.read_csv('./Data/bigml_59c28831336c6604c800002a.csv')

In [3]:
churn_df.columns = churn_df.columns.str.title() 

In [4]:
churn_df.columns = churn_df.columns.str.replace(' ','')

In [5]:
churn_df.head()

Unnamed: 0,State,AccountLength,AreaCode,PhoneNumber,InternationalPlan,VoiceMailPlan,NumberVmailMessages,TotalDayMinutes,TotalDayCalls,TotalDayCharge,...,TotalEveCalls,TotalEveCharge,TotalNightMinutes,TotalNightCalls,TotalNightCharge,TotalIntlMinutes,TotalIntlCalls,TotalIntlCharge,CustomerServiceCalls,Churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
X = churn_df.drop(['PhoneNumber', 'Churn', 'InternationalPlan', 'VoiceMailPlan', 'State', 'AreaCode'], axis =1)
y = churn_df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=105151)

In [7]:
ss = StandardScaler()
X_train_s = ss.fit_transform(X_train)
X_test_s = ss.transform(X_test)

In [8]:
knn = KNeighborsClassifier()
knn_log_loss = -cross_val_score(knn, X_train_s, y_train, scoring="neg_log_loss").mean()
knn_log_loss

1.5883746935919456

In [9]:
knn_2 = KNeighborsClassifier(n_neighbors = 50)
knn_log_loss2 = -cross_val_score(knn_2, X_train_s, y_train, scoring="neg_log_loss").mean()
knn_log_loss2

0.3324693850655061

In [10]:
knn_3 = KNeighborsClassifier(n_neighbors = 50, metric="manhattan")
knn_log_loss3 = -cross_val_score(knn_3, X_train_s, y_train, scoring="neg_log_loss").mean()
knn_log_loss3

0.34571502741034765

In [11]:
knn_4 = KNeighborsClassifier(n_neighbors = 75, metric="manhattan")
knn_log_loss4 = -cross_val_score(knn_4, X_train_s, y_train, scoring="neg_log_loss").mean()
knn_log_loss4

0.3282636577890778

In [12]:
forest = RandomForestClassifier()
forest.fit(X_train_s, y_train)
tree = -cross_val_score(forest, X_train_s, y_train, scoring="neg_log_loss").mean()
tree

0.33582107040300213

In [13]:
treeees = forest.feature_importances_

In [18]:
tree_feat_import = {name: score 
                   for name, score 
                       in zip(X_train.columns, forest.feature_importances_)
}
tree_feat_import

{'AccountLength': 0.03853741726736299,
 'NumberVmailMessages': 0.039597775949360724,
 'TotalDayMinutes': 0.15390041261878004,
 'TotalDayCalls': 0.04347265681343122,
 'TotalDayCharge': 0.1458235909413062,
 'TotalEveMinutes': 0.07790769189311605,
 'TotalEveCalls': 0.04055860034407498,
 'TotalEveCharge': 0.07613157431784906,
 'TotalNightMinutes': 0.047396682899746086,
 'TotalNightCalls': 0.04218640575675038,
 'TotalNightCharge': 0.04774983296279101,
 'TotalIntlMinutes': 0.04650637077766495,
 'TotalIntlCalls': 0.02977570011082398,
 'TotalIntlCharge': 0.04684238345867602,
 'CustomerServiceCalls': 0.12361290388826644}

In [19]:
tree_values = []
tree_keys = []
for name in tree_feat_import:
    if tree_feat_import[name] > .05:
        tree_keys.append(name)
        tree_values.append(tree_feat_import[name])

In [20]:
tree_keys

['TotalDayMinutes',
 'TotalDayCharge',
 'TotalEveMinutes',
 'TotalEveCharge',
 'CustomerServiceCalls']