In [1]:
import numpy as np
import pandas as pd
import pandasql as psql

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import scipy.sparse

# Import DataFrame

In [2]:
# Location of the churn CSV export, kept in one named place so it is easy to
# spot and change.
DATA_PATH = "~/Downloads/bigml_59c28831336c6604c800002a.csv"

# Read the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

# Label Encode Explanatory Data

In [3]:
# Label-encode the categorical feature columns in place.
#
# 'state' has to be encoded here too: the feature matrix built further down
# drops only 'churn' and 'phone number', and the saved feature-importance
# listing includes 'state', so the model was trained with it as a numeric
# column. Assuming 'state' is read from the CSV as strings (TODO confirm),
# leaving this step commented out makes the fit fail on a fresh kernel.
#
# Each column gets its own LabelEncoder so its fitted classes_ are not
# overwritten by the next column. 'voice mail plan' is processed last so that
# `le` ends up fitted on that column, as it was before.
encoders = {}
for column in ['state', 'international plan', 'voice mail plan']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le  # retained so integer codes can be mapped back to labels

# 'area code' is left untouched -- presumably already numeric in this CSV
# (TODO confirm); tree-based models can consume it as-is.

# Create Additional Columns to Better Explain the Data

In [4]:
# Account length divided by 30 and rounded up to a whole number.
df['account months'] = np.ceil(df['account length'].div(30)).astype(int)

# Each total is the left-to-right sum of its day / eve / night / intl parts.
# Dict order is the order in which the new columns are appended to df.
usage_totals = {
    'total minutes': ['total day minutes', 'total eve minutes',
                      'total night minutes', 'total intl minutes'],
    'total calls': ['total day calls', 'total eve calls',
                    'total night calls', 'total intl calls'],
    'total cost': ['total day charge', 'total eve charge',
                   'total night charge', 'total intl charge'],
}
for total_col, part_cols in usage_totals.items():
    running = df[part_cols[0]]
    for part in part_cols[1:]:
        running = running + df[part]
    df[total_col] = running

# Total charge divided by account length.
# NOTE(review): an 'account length' of 0 would produce inf here -- confirm
# the data never contains one.
df['cost per day'] = df['total cost'].div(df['account length'])

# Separate Data into Testing/Training Sets and Fit to Random Forest

In [5]:
# Target is 'churn'; features are every other column except 'phone number'.
y = df['churn']
X = df.drop(['churn', 'phone number'], axis=1)

# First split: training rows vs. held-out rows (default proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Second split: divide the held-out rows in half, giving a first test set and
# a final test set that is only looked at once, at the end.
X_test1, X_test_final, y_test1, y_test_final = train_test_split(
    X_test, y_test, train_size=.5, random_state=42
)

# Fit the forest on the training rows only; the fitted estimator is the
# cell's displayed value.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

# Check Training Data

In [9]:
# Score the model on the rows it was fitted on; predict once and reuse the
# result for both reports.
train_pred = rfc.predict(X_train)
print(confusion_matrix(y_train, train_pred))
print(classification_report(y_train, train_pred))

[[2141    0]
 [   0  358]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      2141
        True       1.00      1.00      1.00       358

    accuracy                           1.00      2499
   macro avg       1.00      1.00      1.00      2499
weighted avg       1.00      1.00      1.00      2499



# Check on First Set of Test Data

In [6]:
# Score the model on the first half of the held-out rows; predict once and
# reuse the result for both reports.
test1_pred = rfc.predict(X_test1)
print(confusion_matrix(y_test1, test1_pred))
print(classification_report(y_test1, test1_pred))

[[341   0]
 [ 10  66]]
              precision    recall  f1-score   support

       False       0.97      1.00      0.99       341
        True       1.00      0.87      0.93        76

    accuracy                           0.98       417
   macro avg       0.99      0.93      0.96       417
weighted avg       0.98      0.98      0.98       417



# Feature Importances

In [7]:
# Pair every importance score with its column name, then list the pairs from
# the highest score to the lowest.
importance_pairs = list(zip(rfc.feature_importances_, X_test1.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.16095446947871725, 'total cost'),
 (0.12105447646453169, 'customer service calls'),
 (0.0760297229093458, 'international plan'),
 (0.07570496505678491, 'total day charge'),
 (0.07496895280985413, 'total minutes'),
 (0.06811203974080061, 'total day minutes'),
 (0.034304847892416085, 'total intl charge'),
 (0.03379182621686666, 'number vmail messages'),
 (0.031838743765609696, 'total intl minutes'),
 (0.02994052630750491, 'total eve charge'),
 (0.029784848470596183, 'total eve minutes'),
 (0.02885703475017382, 'voice mail plan'),
 (0.02865920243493262, 'total intl calls'),
 (0.025034282523753854, 'total night charge'),
 (0.02252788247226434, 'total day calls'),
 (0.021725108465508416, 'total night minutes'),
 (0.02170544395070272, 'total calls'),
 (0.021084315681280826, 'cost per day'),
 (0.020637658568022747, 'account length'),
 (0.020637447610111007, 'total night calls'),
 (0.02012095106614988, 'total eve calls'),
 (0.01924520576530958, 'state'),
 (0.008165917731806775, 'account mo

# Double Check on Second Set of Test Data

In [8]:
# Score the model on the final held-out rows; predict once and reuse the
# result for both reports.
final_pred = rfc.predict(X_test_final)
print(confusion_matrix(y_test_final, final_pred))
print(classification_report(y_test_final, final_pred))

[[368   0]
 [  7  42]]
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       368
        True       1.00      0.86      0.92        49

    accuracy                           0.98       417
   macro avg       0.99      0.93      0.96       417
weighted avg       0.98      0.98      0.98       417

