In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('seaborn-whitegrid')
%matplotlib inline
import pickle
from collections import defaultdict

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pd.set_option('max_rows', 10)
pd.set_option('max_columns', 30)
pd.set_option('max_colwidth', -1)
# pd.set_option('display.width', 150)

In [2]:
# import warnings
# # warnings.filterwarnings("ignore")

In [5]:
# # Change format of charts to .svg
%config InlineBackend.figure_format = 'svg'

In [3]:
%xmode

Exception reporting mode: Plain


In [15]:
df = pd.read_csv('data/clean_data_encoded_for_regression.csv')

X = df[['gender', 'senior', 'partner', 'dependents', 'tenure', 'phone_service',
       'multiple_lines', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'paperless_billing', 'monthly_charges', 'total_charges',
       'avg_monthly_charges', 'internet_service-fiber_optic',
       'internet_service-no', 'contract-one_year', 'contract-two_year',
       'payment_method-credit_card_auto', 'payment_method-electronic_check',
       'payment_method-mailed_check']]

y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=40)

In [18]:
def quick_test(model, X, y):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)
    model.fit(xtrain, ytrain)
    return model.score(xtrain,ytrain),model.score(xtest, ytest)

def quick_test_a_few_times(model, X, y, n=10):
    return (pd.DataFrame([quick_test(model, X, y) for j in range(n)],
                         columns=['Train Acc','Test Acc'])).mean()

linearsvc = LinearSVC()
# Do the test 10 times with a LinearSVC and get the average score
print(quick_test_a_few_times(linearsvc, X_train, y_train))

Train Acc    0.664119
Test Acc     0.669731
dtype: float64


In [19]:
decisiontree = DecisionTreeClassifier()
quick_test_a_few_times(decisiontree, X_train, y_train)

Train Acc    0.998574
Test Acc     0.727864
dtype: float64

In [20]:
randomforest = RandomForestClassifier()
quick_test_a_few_times(randomforest, X_train, y_train)

Train Acc    0.981104
Test Acc     0.794059
dtype: float64

In [21]:
gradboost = GradientBoostingClassifier()
quick_test_a_few_times(gradboost, X_train, y_train)

Train Acc    0.846103
Test Acc     0.806223
dtype: float64

In [22]:
logreg = LogisticRegression()
quick_test_a_few_times(logreg, X_train, y_train)

Train Acc    0.811495
Test Acc     0.808628
dtype: float64

In [23]:
linearsvc = LinearSVC()
quick_test_a_few_times(linearsvc, X_train, y_train)

Train Acc    0.635729
Test Acc     0.637765
dtype: float64

#### Using a kernel can do better:

In [65]:
model = make_pipeline(StandardScaler(), SVC())
model.fit(X_train, y_train)
pred = model.predict(X_test)

quick_test_a_few_times(model, X_train, y_train)

Train Acc    0.83391
Test Acc     0.80198
dtype: float64

In [70]:
decisiontree = DecisionTreeClassifier()
decisiontree.fit(X_train, y_train)
coeffs = decisiontree.feature_importances_ # it's a percentage

list_of_coeffs = list(sorted(zip(coeffs, X.columns)))

In [71]:
import pprint
pprint.pprint(list_of_coeffs)

[(0.0012168986905327964, 'phone_service'),
 (0.0076606922626349105, 'streaming_movies'),
 (0.008747912285878073, 'internet_service-no'),
 (0.00927705788162022, 'payment_method-mailed_check'),
 (0.010001372110225597, 'contract-two_year'),
 (0.010195375741206061, 'payment_method-credit_card_auto'),
 (0.010575487520069019, 'online_backup'),
 (0.01081794950111941, 'tech_support'),
 (0.011230398175992709, 'device_protection'),
 (0.011578179834434047, 'dependents'),
 (0.012249369989251028, 'contract-one_year'),
 (0.013363744074293795, 'online_security'),
 (0.013752511429743354, 'streaming_tv'),
 (0.015882834539806247, 'senior'),
 (0.01603643664716522, 'partner'),
 (0.016388494005221448, 'multiple_lines'),
 (0.01808830427107981, 'paperless_billing'),
 (0.024198036503276527, 'gender'),
 (0.04240045131079471, 'payment_method-electronic_check'),
 (0.09384601872053809, 'internet_service-fiber_optic'),
 (0.1318047098051532, 'monthly_charges'),
 (0.14380464972645415, 'total_charges'),
 (0.159990834

In [72]:
randomforest = RandomForestClassifier()
randomforest.fit(X_train, y_train)
coeffs = randomforest.feature_importances_

list_of_coeffs = list(sorted(zip(coeffs, X.columns)))
pprint.pprint(list_of_coeffs)

[(0.005866661481227731, 'phone_service'),
 (0.010861471069343234, 'payment_method-credit_card_auto'),
 (0.010978763432789496, 'payment_method-mailed_check'),
 (0.012061918826816976, 'streaming_movies'),
 (0.012923148086394132, 'internet_service-no'),
 (0.015416559086757004, 'device_protection'),
 (0.016420046728377845, 'dependents'),
 (0.0164386606008635, 'multiple_lines'),
 (0.01688470555390193, 'senior'),
 (0.017442117811329733, 'streaming_tv'),
 (0.01770883329409794, 'tech_support'),
 (0.0178021811937957, 'online_backup'),
 (0.02059063619546222, 'online_security'),
 (0.0209875141429226, 'contract-one_year'),
 (0.021881273708861847, 'partner'),
 (0.02266208755969698, 'gender'),
 (0.02747772548215748, 'paperless_billing'),
 (0.03216403982206648, 'contract-two_year'),
 (0.04281275043490627, 'internet_service-fiber_optic'),
 (0.045427035470715825, 'payment_method-electronic_check'),
 (0.13740279137472816, 'monthly_charges'),
 (0.1428683656919248, 'avg_monthly_charges'),
 (0.147747149756

In [20]:
gradboost = GradientBoostingClassifier()
gradboost.fit(xtrain, ytrain)
gradboost.feature_importances_ # values are between 0 and 1, not normalized

array([ 0.51210253,  0.24789747])

In [21]:
s2 = LinearSVC()
s2.fit(xtrain, ytrain)
s2.coef_ # the absolute value dictates importance

array([[-0.17885376, -0.48240484]])

In [22]:
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
logreg.coef_ # the abosulute value dictates importance

array([[-0.33875448, -0.88451115]])