In [1]:
import numpy as np
import pandas as pd
import pandasql as psql

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import scipy.sparse

from imblearn.over_sampling import SMOTE



# Import DataFrame

In [2]:
# Raw customer CSV, read from the current user's Downloads folder.
DATA_PATH = "~/Downloads/bigml_59c28831336c6604c800002a.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


# Label Encode Explanatory Data

In [4]:
# One encoder instance is refit for each column, so once this cell has run
# `le` only remembers the classes of the last column it saw.
le = LabelEncoder()

# Replace the two plan-flag columns with integer codes, in place.
for plan_col in ('international plan', 'voice mail plan'):
    df[plan_col] = le.fit_transform(df[plan_col])

# Encodings that were tried and left disabled:
# df['area code'] = le.fit_transform(df['area code'])
# df['state'] = le.fit_transform(df['state'])

In [5]:
# NOTE(review): disabled experiment - one-hot encodes `state` and appends the
# sparse result to `df`. The modelling cell drops `state` instead, so this
# looks safe to delete; confirm before removing.
# ohe = OneHotEncoder()
# state = ohe.fit_transform(df[['state']])
# df = pd.concat((df,pd.DataFrame.sparse.from_spmatrix(state)),axis=1)

# Create Additional Columns to Better Explain the Data

In [6]:
# df['account months'] = np.ceil(df['account length']/30).astype(int)

# Each engineered total is the sum of its day / evening / night / international
# components, added left to right.
column_groups = {
    'total minutes': ('total day minutes', 'total eve minutes', 'total night minutes', 'total intl minutes'),
    'total calls': ('total day calls', 'total eve calls', 'total night calls', 'total intl calls'),
    'total cost': ('total day charge', 'total eve charge', 'total night charge', 'total intl charge'),
}
for total_name, (day, eve, night, intl) in column_groups.items():
    df[total_name] = df[day] + df[eve] + df[night] + df[intl]

# Total charge divided by account length (presumably measured in days - TODO confirm).
# NOTE(review): an account length of 0 would produce inf here.
df['cost per day'] = df['total cost'].div(df['account length'])

# Split the Data into Train/Test Sets and Fit a Random Forest

In [7]:
# Target is the churn flag; features are every other column except the
# phone number, state and area code.
y = df['churn']
X = df.drop(columns=['churn', 'phone number', 'state', 'area code'])

# First split: hold out a test portion using train_test_split's default sizes.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Second split: cut the held-out portion in half so one half can be inspected
# while the other stays untouched for a final check.
X_test1, X_test_final, y_test1, y_test_final = train_test_split(
    X_test, y_test, train_size=.5, random_state=42
)

# Oversampler is configured but currently unused; the resampling path below is
# disabled. NOTE(review): newer imbalanced-learn releases name this method
# `fit_resample` - confirm before re-enabling.
sm = SMOTE(sampling_strategy=.9, random_state=42)
# Xsmote, ysmote = sm.fit_sample(X_train, y_train)
# rfc.fit(Xsmote,ysmote)

# Fit a 1000-tree forest on the un-resampled training data; the fitted
# estimator is the cell's displayed value.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

# Check Training Data

In [8]:
# Score the forest on the rows it was fitted on. Predict once and reuse the
# result for both reports - inference over the 1000-tree forest is the costly step.
# Training-set scores are optimistic; judge generalisation on the held-out sets.
train_pred = rfc.predict(X_train)
print(confusion_matrix(y_train, train_pred))
print(classification_report(y_train, train_pred))

[[2141    0]
 [   0  358]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      2141
        True       1.00      1.00      1.00       358

    accuracy                           1.00      2499
   macro avg       1.00      1.00      1.00      2499
weighted avg       1.00      1.00      1.00      2499



# Check on First Set of Test Data

In [9]:
# Evaluate on the first half of the held-out data. Predict once and reuse the
# result for both reports rather than running the forest twice.
test1_pred = rfc.predict(X_test1)
print(confusion_matrix(y_test1, test1_pred))
print(classification_report(y_test1, test1_pred))

[[341   0]
 [  9  67]]
              precision    recall  f1-score   support

       False       0.97      1.00      0.99       341
        True       1.00      0.88      0.94        76

    accuracy                           0.98       417
   macro avg       0.99      0.94      0.96       417
weighted avg       0.98      0.98      0.98       417



# Feature Importances

In [10]:
# Pair each feature importance with its column name, then rank the pairs from
# most to least important (ties fall back to the column name).
importance_pairs = zip(rfc.feature_importances_, X_test1.columns)
sorted(importance_pairs, reverse=True)

[(0.1823029422615225, 'total cost'),
 (0.12979155975569523, 'customer service calls'),
 (0.0774492818803392, 'international plan'),
 (0.06928989729363125, 'total day minutes'),
 (0.06675452101209381, 'total day charge'),
 (0.06661601213112456, 'total minutes'),
 (0.03883189671118281, 'total intl calls'),
 (0.038433262932340846, 'number vmail messages'),
 (0.035254975172740295, 'voice mail plan'),
 (0.03379591353488788, 'total intl charge'),
 (0.032956530517242406, 'total intl minutes'),
 (0.029171655411529464, 'total eve minutes'),
 (0.02879533196262102, 'total eve charge'),
 (0.023165180624009567, 'total night charge'),
 (0.022540116271725537, 'cost per day'),
 (0.022178658697108583, 'total night minutes'),
 (0.021662157328198966, 'total day calls'),
 (0.02053262119178274, 'total night calls'),
 (0.02046728175600491, 'total calls'),
 (0.02005823474491011, 'account length'),
 (0.01995196880930837, 'total eve calls')]

# Double Check on Second Set of Test Data

In [11]:
# Final check on the second half of the held-out data. Predict once and reuse
# the result for both reports rather than running the forest twice.
test_final_pred = rfc.predict(X_test_final)
print(confusion_matrix(y_test_final, test_final_pred))
print(classification_report(y_test_final, test_final_pred))

[[368   0]
 [  8  41]]
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       368
        True       1.00      0.84      0.91        49

    accuracy                           0.98       417
   macro avg       0.99      0.92      0.95       417
weighted avg       0.98      0.98      0.98       417



In [12]:
# Preview the frame again: the plan flags are now integer codes and the
# engineered total / cost-per-day columns have been appended.
df.head(n=5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn,total minutes,total calls,total cost,cost per day
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,11.01,10.0,3,2.7,1,False,717.2,303,75.56,0.590313
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,11.45,13.7,3,3.7,1,False,625.2,332,59.24,0.553645
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,7.32,12.2,5,3.29,0,False,539.4,333,62.29,0.454672
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,8.86,6.6,7,1.78,2,False,564.8,255,66.8,0.795238
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,8.41,10.1,3,2.73,3,False,512.0,359,52.09,0.694533
