# Model the data

In [None]:
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
import os
import pandas as pd
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
%matplotlib inline

# Base folder that holds the input CSV and receives the model outputs.
# It can be overridden with the WL_DATA_DIR environment variable so the notebook
# is not tied to one machine; the default is the original location.
# os.path.join(..., '') guarantees the trailing separator that the
# `path + 'file'` concatenations in the rest of the notebook rely on.
path = os.path.join(os.environ.get('WL_DATA_DIR', '/Users/brianna/Documents/WL_DBdeets/'), '')

## Build a model and study the results

In [None]:
# This example is written for a generic classifier model (here, using a RandomForestClassifier)

### Classifier

In [None]:
# Model configuration 1: feature columns, target column and estimator.
# The configuration cells below rebind these same names, so the configuration
# that was executed last is the one used by the rest of the notebook.
classifierVersion = 1
prediction = 'RetentionStatus2Levels'  # alternative target: 'RetentionStatus'
features = [
    'NumberSignedUpFromCompany',
    'MeetingsTotalWeek0_i',
    'MeetingsTotalWeek1_0diff',
    'PortionOfOrganizerWeek0',
    'PortionOfOrganizerWeek1_0diff',
    'AveNumAttendeesWeek0',
    'AveNumAttendeesWeek1_0diff',
    'completed_WL_actionWeek0',
    'completed_WL_actionWeek1_0diff',
    'EmailCorporateVsPrivate',
]
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=5,
)

In [None]:
# Model configuration 2: feature columns, target column and estimator.
# Running this cell replaces whatever configuration was set before it.
classifierVersion = 2
prediction = 'RetentionStatus'  # alternative target: 'WeeksVisitedOutOf12'
features = [
    'MeetingsTotalWeek0_i',
    'MeetingsTotalWeek1_0diff',
    'PortionOfOrganizerWeek0_i',
    'PortionOfOrganizerWeek1_0diff',
    'AveNumAttendeesWeek0_i',
    'AveNumAttendeesWeek1_0diff',
    'NumberSignedUpFromCompany',
    'sharedEmail',
    'EmailCorporateVsPrivate',
]
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=5,
)


In [None]:
# Model configuration 3: feature columns, target column and estimator.
# Running this cell replaces whatever configuration was set before it.
classifierVersion = 3
prediction = 'RetentionStatus2Levels'  # alternative target: 'WeeksVisitedOutOf12'
features = [
    'NumberSignedUpFromCompany',
    'added_meetingWeek0',
    'added_meetingWeek1_0diff',
    'added_agenda_itemWeek0',
    'added_agenda_itemWeek1_0diff',
    'assigned_action_itemWeek0',
    'assigned_action_itemWeek1_0diff',
]
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=5,
)


In [None]:
# Model configuration 4: feature columns, target column and estimator.
# Running this cell replaces whatever configuration was set before it.
classifierVersion = 4
prediction = 'RetentionStatus2Levels'  # alternative target: 'WeeksVisitedOutOf12'
features = [
    'NumberSignedUpFromCompany',
    'added_meetingWeek0',
    'added_meetingWeek1_0diff',
    'added_agenda_itemWeek0',
    'added_agenda_itemWeek1_0diff',
]
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=5,
)


In [None]:
# Model configuration 5: feature columns, target column and estimator.
# This is the last configuration cell, so it is the one in effect when the
# notebook is run from top to bottom.
classifierVersion = 5
prediction = 'RetentionStatus2Levels'  # alternative target: 'WeeksVisitedOutOf12'
features = [
    'NumberSignedUpFromCompany',
    'added_meetingWeek0',
    'added_agenda_itemWeek0',
]
classifier = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=5,
)


# Read the data

In [None]:
# Load the modelling table from the data folder.
# The alternative input file is '06_DataFinal.csv' (swap the name to use it).
modelData = pd.read_csv(
    filepath_or_buffer=path + '06_DataFinal_withLogTransform.csv'
)

In [None]:
# Keep only the rows that have a value for every model feature and for the
# target, then report how many rows are left.
# (The count is of rows, not columns, and dropna is now computed only once.)
columns = features + [prediction]
modelData = modelData.dropna(subset=columns)
print('Rows remaining after rows with NA\'s removed: ' + str(len(modelData)))
modelData[columns].head()

In [None]:
# classifierVersion = 2
# surveyVersion = 1
# features = ["genderBoolean", "ageCategory", "medianHouseholdIncome", "fulltimeMedianIncome"]
# prediction = "incomeCategory"
# classifier = RandomForestClassifier(n_estimators=500, max_depth=20, min_samples_leaf=5)

In [None]:
# Text labels describing this run; they are reused in figure titles and in the
# names of the files written further down.
predictionLabel = "{prediction}".format(
    prediction=prediction,
)
modelLabel = "{model}".format(
    model=classifier.__class__.__name__,  # class name of the estimator in use
)
versionLabel = "version-{version:02d}".format(
    version=classifierVersion,  # zero-padded to two digits
)

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and therefore every score below)
# is the same each time the notebook is re-run.
X_train, X_test, Y_train, Y_test = train_test_split(
    modelData[features],
    modelData[prediction].values,
    test_size=0.20,
    random_state=0,
)
# Fractions of the full table that ended up in each part.
train = len(X_train)/float(len(modelData))
test = len(X_test)/float(len(modelData))
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("The model data have been split into train data ({train:.2%}) and test data ({test:.2%})".format(train=train, test=test))

In [None]:
# Adjust run-time settings on the configured estimator, then train it.
classifier.verbose = True
classifier.n_jobs = -1 # do as many jobs as there's room for on the computer
# Seed the forest so that repeated runs give the same model; set here so that it
# applies to whichever configuration cell defined `classifier`.
classifier.random_state = 0
classifier.fit(X_train, Y_train)

In [None]:
# Predicted class for every row of the held-out test set.
Y_pred = classifier.predict(X=X_test)

In [None]:
# Predicted class probabilities for every row of the held-out test set.
Y_proba = classifier.predict_proba(X=X_test)

### Scores

Definitions:

- *True Positives* are samples predicted as ``1`` that are actually ``1``
- *False Positives* are samples predicted as ``1`` that are actually ``0``
- *True Negatives* are samples predicted as ``0`` that are actually ``0``
- *False Negatives* are samples predicted as ``0`` that are actually ``1``


Meaning of the different scores:

$$ {\rm accuracy} \equiv \frac{\rm correct~labels}{\rm total~samples} $$

$$ {\rm precision} \equiv \frac{\rm true~positives}{\rm true~positives + false~positives} $$

$$ {\rm recall} \equiv \frac{\rm true~positives}{\rm true~positives + false~negatives} $$

$$ F_1 \equiv 2 \frac{\rm precision \cdot recall}{\rm precision + recall} $$

The **accuracy**, **precision**, **recall**, and **f1-score** all range from 0 to 1, with 1 being optimal.

In [None]:
# Score reported by the estimator itself on the test set
# (same as accuracy in random forest).
classifier_score = classifier.score(X_test, Y_test)
# Call form of print: valid on both Python 2 and Python 3.
print(classifier_score)

In [None]:
# Accuracy of the test-set predictions.
classifier_accuracy_score = accuracy_score(Y_test, Y_pred)
# Call form of print: valid on both Python 2 and Python 3.
print(classifier_accuracy_score)

In [None]:
# Precision of the test-set predictions, using the "weighted" average over classes.
classifier_precision_score = precision_score(Y_test, Y_pred, average="weighted")
# Call form of print: valid on both Python 2 and Python 3.
print(classifier_precision_score)

In [None]:
# Recall of the test-set predictions, using the "weighted" average over classes.
classifier_recall_score = recall_score(Y_test, Y_pred, average="weighted")
# Call form of print: valid on both Python 2 and Python 3.
print(classifier_recall_score)

In [None]:
# F1 score of the test-set predictions, using the "weighted" average over classes.
classifier_f1_score = f1_score(Y_test, Y_pred, average="weighted")
# Call form of print: valid on both Python 2 and Python 3.
print(classifier_f1_score)

In [None]:
# Define the labels used for the classification (without escaping dollar signs):
# classifierLabels = {}
# for (key, value) in incomeRanges.items():
#     classifierLabels[key] = r"${low:,}-${high:,}".format(low=value[0], high=value[2])

In [None]:
# Build and print scikit-learn's text classification report for the test set.
# No target_names are passed (classifierLabels is not used here), so the report
# falls back to its default class naming.
classifier_classification_report = classification_report(
    y_true=Y_test,
    y_pred=Y_pred,
)
print(classifier_classification_report)

### Feature importance

In [None]:
# Table of feature importances, most important feature first.
if isinstance(classifier, RandomForestClassifier):
    # Build the table in one step instead of filling it cell by cell through
    # chained assignment (df.feature[i] = ...), which pandas does not guarantee
    # to write back into the frame.
    df = pd.DataFrame(
        {'feature': features, 'importance': classifier.feature_importances_},
        columns=['feature', 'importance'],
    )
    for (feature, importance) in zip(features, classifier.feature_importances_):
        # Single-argument print(...) gives the same text on Python 2 and Python 3.
        print("{0} {1}".format(feature, importance))
else:
    # Importances are only read for random forests; keep an empty placeholder
    # table with one row per feature otherwise.
    df = pd.DataFrame(columns=['feature', 'importance'], index=range(len(features)))

# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
dfsort = df.sort_values(by='importance', ascending=False)
dfsort.head(10)

In [None]:
# Bar chart of the sorted feature importances, one bar per feature.
plt.figure(figsize=(10, 6))

ax = sns.barplot(data=dfsort, x='feature', y='importance');
ax.set_xlabel('Feature')
ax.set_ylabel('Importance of each Feature')

# Feature names are long, so turn the tick labels vertical.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

In [None]:
# Define a function to plot a matrix as a labelled heat map:
def plotFeatureImportance(ax, cm, title='Feature importance', cmap=plt.cm.Blues):
    """Draw the matrix ``cm`` on ``ax`` as a heat map with a colorbar.

    Despite its name, the axes are labelled 'True label' / 'Predicted label',
    i.e. the layout of a confusion-matrix plot.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    cm : 2-D array-like
        Matrix handed to ``ax.imshow``.
    title : str
        Title of the axes.
    cmap : matplotlib colormap
        Colormap handed to ``ax.imshow``.

    Notes
    -----
    Tick labels are read from the notebook-level dict ``classifierLabels``.
    When that dict is empty, each row/column is labelled with its index
    instead of ending up with no ticks at all.
    """
    # Plot the matrix:
    image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Work out the tick labels, falling back to row indices when none are defined:
    tick_labels = list(classifierLabels.values())
    if not tick_labels:
        tick_labels = [str(index) for index in range(len(cm))]
    tick_marks = np.arange(len(tick_labels))
    # Positions and labels are set in separate calls. The label list is no longer
    # passed as the second positional argument of set_xticks/set_yticks, which
    # older Matplotlib interprets as the `minor` flag.
    ax.set_xticks(tick_marks)
    for label in ax.set_xticklabels(tick_labels):
        label.update({'rotation':90})
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(tick_labels)
    # Add a colorbar in a narrow axes attached to the right-hand side:
    divider = make_axes_locatable(ax)
    colorbar_ax = divider.append_axes("right", size="10%", pad=0.05)
    ax.figure.colorbar(image, cax=colorbar_ax)

# Create the figure:
# NOTE(review): `cm` (passed below) and `classifierLabels` (read inside
# plotFeatureImportance) are only assigned in cells further down this notebook,
# so on a fresh kernel run from top to bottom this cell appears to raise a
# NameError - confirm, and consider moving it after the confusion-matrix cells.
fig, axes = plt.subplots(1, 2, figsize=(15, 15))
#fig.suptitle("Model: {version}, {model}".format(version=versionLabel, model=modelLabel), fontsize=12)
for ax in axes:
    ax.set_aspect(1)

# Only the first of the two axes is drawn on; the second one stays empty.
plotFeatureImportance(axes[0], cm)

# Save the figure:
# NOTE(review): this is the same output name ("confusion-matrices-<version>.pdf")
# that the confusion-matrix cell later in the notebook writes to, so that cell
# would overwrite this file - confirm whether a separate name was intended.
prefix = "-{version}".format(version=versionLabel)
fileName = "confusion-matrices" + prefix + ".pdf"
outputDirectory = path+"model_images"
outputPath = os.path.join(outputDirectory, fileName)
fig.savefig(outputPath, bbox_inches="tight")

### Save information into a text file

In [None]:
# Write a plain-text summary of this model run (labels, scores, classification
# report and, for random forests, the feature importances).
prefix = "-{version}".format(version=versionLabel)
fileName = "model" + prefix + ".txt"
outputDirectory = path+"model_images"
# open() does not create missing folders, so make sure the output folder exists.
if not os.path.isdir(outputDirectory):
    os.makedirs(outputDirectory)
outputPath = os.path.join(outputDirectory, fileName)
with open(outputPath, "w") as f:
    f.write("Version: {version}\n".format(version=versionLabel))
    f.write("Model:   {model}\n".format(model=modelLabel))
    f.write("\n")
    # Scores computed in the "Scores" section, rounded to two decimals.
    f.write("Classifier: {score:.2f}\n".format(score=classifier_score))
    f.write("Accuracy:   {score:.2f}\n".format(score=classifier_accuracy_score))
    f.write("Precision:  {score:.2f}\n".format(score=classifier_precision_score))
    f.write("Recall:     {score:.2f}\n".format(score=classifier_recall_score))
    f.write("F1:         {score:.2f}\n".format(score=classifier_f1_score))
    f.write("\n")
    f.write("Score {score}\n".format(score=classifier_classification_report))
    f.write("\n")
    # Feature importances are only read from random forests.
    if isinstance(classifier, RandomForestClassifier):
        f.write("Feature importance\n")
        for (feature, importance) in zip(features, classifier.feature_importances_):
            f.write("{feature}\t {importance:.2f}\n".format(feature=feature, importance=importance))
        f.write("\n")

### Confusion matrix

In [None]:
# Define the tick labels used when plotting the confusion matrix.
# The previous version built dollar-range labels from `incomeRanges`, a name that
# is not defined anywhere in this notebook (NameError for
# prediction == "RetentionStatus"), and left the labels empty for any other target.
# The labels are now the classes that occur in the test targets or in the
# predictions, in sorted order and keyed by position 0..n-1, which is the order
# scikit-learn documents for confusion_matrix when no `labels` are given.
classifierLabels = {}
for (position, label) in enumerate(sorted(set(Y_test).union(Y_pred))):
    classifierLabels[position] = str(label)

In [None]:
# Show two decimals when printing arrays, then build the raw confusion matrix
# of the test-set predictions.
np.set_printoptions(precision=2)
cm = confusion_matrix(Y_test, Y_pred)
print('Confusion matrix, without normalization')
print(cm)

# Divide every row by its own total (i.e. by the number of samples in each class)
# so that each row of the normalized matrix sums to one.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / row_totals
print('Normalized confusion matrix')
print(cm_normalized)

In [None]:
# Define a function to plot the confusion matrix:
def plot_confusion_matrix(ax, cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix ``cm`` on ``ax`` as a heat map with a colorbar.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    cm : 2-D array-like
        Matrix handed to ``ax.imshow`` (raw counts or normalized values).
    title : str
        Title of the axes.
    cmap : matplotlib colormap
        Colormap handed to ``ax.imshow``.

    Notes
    -----
    Tick labels are read from the notebook-level dict ``classifierLabels``.
    When that dict is empty, each row/column is labelled with its index
    instead of ending up with no ticks at all.
    """
    # Plot the confusion matrix:
    image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Work out the tick labels, falling back to row indices when none are defined:
    tick_labels = list(classifierLabels.values())
    if not tick_labels:
        tick_labels = [str(index) for index in range(len(cm))]
    tick_marks = np.arange(len(tick_labels))
    # Positions and labels are set in separate calls. The label list is no longer
    # passed as the second positional argument of set_xticks/set_yticks, which
    # older Matplotlib interprets as the `minor` flag.
    ax.set_xticks(tick_marks)
    for label in ax.set_xticklabels(tick_labels):
        label.update({'rotation':90})
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(tick_labels)
    # Add a colorbar in a narrow axes attached to the right-hand side:
    divider = make_axes_locatable(ax)
    colorbar_ax = divider.append_axes("right", size="10%", pad=0.05)
    ax.figure.colorbar(image, cax=colorbar_ax)

# Create the figure: raw counts on the left, row-normalized values on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle("Model: {version}, {model}".format(version=versionLabel, model=modelLabel), fontsize=12)
for ax in axes:
    ax.set_aspect(1)

plot_confusion_matrix(axes[0], cm)
plot_confusion_matrix(axes[1], cm_normalized, title='Normalized confusion matrix')

# Save the figure:
prefix = "-{version}".format(version=versionLabel)
fileName = "confusion-matrices" + prefix + ".pdf"
outputDirectory = path+"model_images"
# Make sure the output folder exists before writing into it.
if not os.path.isdir(outputDirectory):
    os.makedirs(outputDirectory)
outputPath = os.path.join(outputDirectory, fileName)
fig.savefig(outputPath, bbox_inches="tight")