In [None]:
# import the libraries
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pandas as pd



########## LOAD DATA SET AND PREP
# load the data set
import os

# NOTE(review): machine-specific absolute path -- this cell only runs where
# that file exists; consider moving it to a configurable data directory.
dataFile = os.path.normpath("C:/Users/n846490/Documents/Python Scripts/SurvivalAnalysis/ChurnModelFiles/CleanAttritionDataForML.csv")
# index_col=0 turns the first CSV column into the row index, so it is not
# counted among the column positions sliced below
dataset = pd.read_csv(dataFile, index_col=0)

# create the X and y values:
# the first 26 columns are the features, the 27th column is the target
X = dataset.iloc[:, 0:26].values
y = dataset.iloc[:, 26].values

# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the notebook runnable on pre-0.18 installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# 75% / 25% split; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
# The scaler is fitted on the training split only; the test split is
# transformed with the training statistics.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Random forest classifier: train on the training split, score the test split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# 100 trees, entropy split criterion, fixed seed so reruns build the same forest
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Predicted labels for the test set
y_pred = classifier.predict(X_test)

# Confusion matrix of true test labels (rows) against predicted labels (columns)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# calculate accuracy
# Correct predictions sit on the diagonal of the confusion matrix and every
# other cell is a misclassification. Using trace/sum instead of fixed
# [0,0] / [1,1] indices gives the same numbers for a 2x2 matrix and stays
# valid for any number of classes.
acc = cm.trace()
miss = cm.sum() - acc

# float() forces true division, so the ratio is not truncated to 0 when the
# notebook runs under Python 2 integer-division semantics
totAcc = float(acc) / (miss + acc)
totAcc

In [None]:
# Share of all test-set predictions that fall in cell [0, 0] of the confusion
# matrix (first true class predicted as the first class).
# NOTE(review): if this is meant to be the accuracy of a "dumb" classifier that
# always predicts the first class, the numerator should be the whole first row
# (cm[0].sum()) -- confirm the intent.
dumb = cm[0][0] / (acc + miss)
dumb

In [None]:
# Variable importance taken from the fitted forest.
# The feature names are the first 26 column labels, matching the slice used for X.
features = dataset.columns[:26]
importances = classifier.feature_importances_
# argsort is ascending, so `indices` runs from least to most important feature
indices = np.argsort(importances)

In [None]:
# Horizontal bar chart of the feature importances.
# Only one figure is created: an extra plt.figure(1) before this call would
# leave an empty figure behind that the inline backend renders as well.
plt.figure(figsize = (10,8))

plt.title('Feature Importances', size = 20)
# `indices` is sorted ascending, so the least important feature is drawn at
# position 0 (bottom) and the most important one at the top
plt.barh(range(len(indices)), importances[indices], color='skyblue', align='center')
plt.yticks(range(len(indices)), features[indices], size = 14)
plt.xticks(size = 14)
plt.xlabel('Relative Importance', fontsize = 14)
# finish with show() like the ROC cell, so the cell output is the figure and
# not the repr of the last Text object
plt.show()

In [None]:
# get the roc curve and auc
from sklearn import metrics

# scores for the ROC curve: second column of predict_proba, i.e. the
# probability assigned to the second entry of classifier.classes_
preds = classifier.predict_proba(X_test)[:,1]
# the third return value (the thresholds) is not needed here
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# collect the curve in a data frame
rocDf = pd.DataFrame(dict(fpr=fpr, tpr= tpr))

# A bare expression is only echoed when it is the last line of a cell, so the
# AUC and the number of curve points are printed explicitly instead of being
# silently discarded.
print('AUC = %0.4f' % roc_auc)
print('ROC curve points = %d' % rocDf.shape[0])

# plot the curve from the data frame
plt.figure(figsize = (10,8))
plt.title('Receiver Operating Characteristic', fontsize = 20)
plt.plot(rocDf.fpr, rocDf.tpr, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right', fontsize = 12)
# dashed diagonal from (0,0) to (1,1) as the chance-level reference line
plt.plot([0,1],[0,1],'r--')
# small margin around the unit square so the curve does not touch the frame
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate', fontsize = 14)
plt.xlabel('False Positive Rate', fontsize = 14)
plt.xticks(size = 12)
plt.yticks(size = 12)
plt.show()

In [None]:
# get some tree information
from sklearn.tree import export_graphviz
import pydotplus

# Query the current working directory (the value is not displayed, because
# this is not the last expression of the cell) ...
os.getcwd()
# ... then switch to the output folder, so that relative paths used after this
# cell resolve against it.
outputDir = os.path.normpath('C:/Users/n846490/Documents/DataScience/OutputsMapsEtc')
os.chdir(outputDir)

In [None]:
# check the key variable:
# number of non-null LeftBank values for each distinct CheckingOpen value
dataset.groupby('CheckingOpen')[['LeftBank']].count()

In [None]:
# summary statistics of the CheckingOpen column
dataset.loc[:, 'CheckingOpen'].describe()