# Classification model
## Terms
features = columns of variables used to predict the target<br>
X = features (DataFrame, 1+ columns)<br>
y = target/predicted (Series, 1 column)<br>
X_train, y_train = the rows (retailers) of X and y used for training<br>
X_test, y_test = the rows (retailers) of X and y used for testing<br>
X_train = training predictors<br>


## Core pipeline
### Import data -> undersample -> rescale -> cross-validate -> evaluate

In [None]:
# import libraries
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

## Import data

In [None]:
# read the reshaped dataset from 'Reshaping.ipynb'
# (header = 0: the first row of the CSV file supplies the column names)
df = pd.read_csv('Reshaped_dataset.csv', header = 0)

In [None]:
# check variables: preview the first rows of the dataset and its column names
df.head()

In [None]:
# descriptives of non-string variables
# (summary statistics such as count, mean, std, min, quartiles and max)
df.describe()

In [None]:
# size of the dataset as (number of rows, number of columns)
df.shape

In [None]:
# move 'CustomAttribute1' out of the columns and use it as the row index
# NOTE(review): presumably this is the retailer identifier — confirm
df_ready = df.set_index('CustomAttribute1')

In [None]:
# Build the target series and the feature table for the train-test-split.
# Only 'anyRevenue_trial' is removed from the features at this point; every
# other column of df_ready, including the future ones, is still part of X.
y = df_ready.loc[:, 'anyRevenue_future']
X = df_ready.drop('anyRevenue_trial', axis = 1)

## Do train-test-split

In [None]:
# train-test-split: one third of the rows is held out for testing and the split
# is stratified on y; random_state = 42 because it's the meaning of life
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 1/3,
    stratify = y,
    random_state = 42,
)

## Rescale features (min = 0, max = 1)

In [None]:
# Keep an unscaled copy of the test rows, together with the true target, as the
# baseline for later comparison. reset_index() turns the row index back into a
# regular column.
baseline_test = X_test.assign(anyRevenue_future = y_test).reset_index()

In [None]:
# remove future variables from list of features and rescale between 0 and 1
from sklearn.preprocessing import MinMaxScaler

# Columns describing the future outcome; they must not be used as predictors.
future_columns = ['anyRevenue_future', 'Revenue_future']
X_train = X_train.drop(columns = future_columns)
X_test = X_test.drop(columns = future_columns)

# Learn each feature's min/max from the TRAINING rows only and apply that same
# mapping to the test rows. Re-fitting the scaler on the test set would scale
# the two sets differently, so the model would see test features on a different
# scale from the one it was trained on. As a consequence, test values can fall
# slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       index = X_train.index, columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                      index = X_test.index, columns = X_test.columns)

# Hyperparameter tuning for Random Forest

Find the best hyperparameters using only the training set (randomized search with 4-fold cross-validation).

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Seed shared by the forest and the search (same value as the train-test-split)
RSEED = 42

# Candidate values the randomized search samples from
param_grid = dict(
    n_estimators = [200, 500, 1000],
    max_features = np.arange(1, 11),
    max_depth = list(range(4, 9)),
    criterion = ['gini', 'entropy'],
    bootstrap = [True, False],
)

# Base model whose hyperparameters are tuned
estimator = RandomForestClassifier(random_state = RSEED)

# Randomized search: 10 sampled combinations, each scored by accuracy with
# 4-fold cross-validation, using all available cores
rfCV = RandomizedSearchCV(
    estimator,
    param_grid,
    n_iter = 10,
    cv = 4,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 1,
    random_state = RSEED,
)

# Run the search on the training data only
rfCV.fit(X_train, y_train)

# Best forest found by the search
clf = rfCV.best_estimator_

In [None]:
# score of the tuned forest on the held-out test set
# (for scikit-learn classifiers, score() reports mean accuracy)
clf.score(X_test,y_test)

## Display feature importance

Display the feature importances computed by the tuned Random Forest.

In [None]:
# Importance of each feature according to the fitted forest, most important first
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# Names and count must come from X_train (the columns the forest was fitted
# on). X still contains the future columns, so its column positions and count
# do not line up with `importances`.
feature_names = X_train.columns
n_features = X_train.shape[1]

# Print the feature ranking
print("Feature ranking:")

for rank, idx in enumerate(indices, start = 1):
    print("%d. feature %d (%f)" % (rank, idx, importances[idx]))

# Plot the feature importances of the forest
fig=plt.figure(figsize=(12,5))
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r")
# one tick per bar, labelled with the matching feature name
plt.xticks(range(n_features), feature_names[indices], rotation = 60, fontsize=12)
plt.xlim([-1, n_features])
plt.yticks(np.arange(0, (max(importances)+0.4), step = 0.1))
plt.ylim(-0.04,0.2)
plt.show()

## Results

In [None]:
# CONFUSION MATRIX on test data

# Imported under an alias because the result below is stored in a variable
# named `confusion_matrix`; without the alias that variable would replace the
# scikit-learn function for every cell that runs afterwards.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

y_pred = clf.predict(X_test)
confusion_matrix = compute_confusion_matrix(y_test, y_pred)

# Set the font scale BEFORE the figure is created: matplotlib reads these
# settings when artists are created, so setting it afterwards only affects
# later figures (or a re-run of this cell).
sns.set(font_scale=3)

fig, ax = plt.subplots(figsize=(8,7))
# fmt='g' prints the counts as plain numbers; draw on `ax` explicitly
sns.heatmap(confusion_matrix, annot=True, cmap="Reds", cbar = False, fmt='g', ax = ax)

# passing a list is fine, no need to convert to tuples
ax.set_xticklabels(['None','Some'])
ax.set_yticklabels(['None','Some'])
plt.xlabel("Predicted", size = 30)
plt.ylabel("Actual", size = 30)
print(confusion_matrix)

In [None]:
# print precision and recall scores etc.
# (text report comparing the test-set predictions y_pred with the true y_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# perform cross-validation over 4 different combinations of training data
# (cv = 4: the training set is split into 4 folds and one score is produced per fold)
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# NOTE(review): clf's hyperparameters were selected on this same training set,
# so these scores are probably optimistic — confirm this is acceptable
scores = cross_val_score(clf, X_train, y_train, cv = 4)

In [None]:
# print scores of each cross-validation (one value per fold)
print(scores)

## ROC curve

In [None]:
# calculate ROC curve: encode the class labels as integers first
from sklearn import preprocessing

# Use ONE encoder for both arrays so that a label always maps to the same
# integer. Fitting a separate encoder on y_pred would encode a single predicted
# class as 0 whatever that class is. Fitting on clf.classes_ also keeps the
# integer codes in the same order as the columns of clf.predict_proba.
label_encoder = preprocessing.LabelEncoder().fit(clf.classes_)
y_testBool = label_encoder.transform(y_test)
y_predBool = label_encoder.transform(y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# back to the default font scale for this figure
sns.set(font_scale=1)

# Predicted probability of the second class in clf.classes_ (encoded as 1)
positive_proba = clf.predict_proba(X_test)[:,1]

# Compute the AUC from the probabilities, not from the hard 0/1 predictions, so
# that the number in the legend is the area under the curve actually plotted.
# (The variable keeps its historical name so other cells can still refer to it.)
logit_roc_auc = roc_auc_score(y_testBool, positive_proba)
fpr, tpr, thresholds = roc_curve(y_testBool, positive_proba)

plt.figure()
# the model is a random forest, so label it as such
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % logit_roc_auc)
# diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# save before show(), which may clear the current figure
plt.savefig('Log_ROC')
plt.show()

## Plots of model impact on proportion of profitable retailers and average revenue

### Baseline proportion of profitable retailers

In [None]:
# Inputs for the baseline pie chart (slices are plotted counter-clockwise)
explode = (0, 0.1)  # pull out the 2nd slice only
labelsB = ['Unprofitable', 'Profitable']

# number of distinct CustomAttribute1 values in each anyRevenue_future group
retailers_per_group = baseline_test.groupby('anyRevenue_future')['CustomAttribute1']
sizesB = retailers_per_group.nunique().tolist()

baseline_test.shape

In [None]:
# Baseline pie chart: share of each anyRevenue_future group in the test set
fig1, ax1 = plt.subplots(figsize = (8, 7))
ax1.pie(
    sizesB,
    labels = labelsB,
    explode = explode,
    autopct = '%1.0f%%',  # whole-number percentages
    startangle = 90,
    shadow = True,
)
# equal aspect ratio so the pie is drawn as a circle
ax1.axis('equal')
plt.show()

### Proportion of profitable retailers after applying model

In [None]:
# Attach the model's predictions to a copy of the baseline test rows and keep
# only the rows predicted as 'some'
with_predictions = baseline_test.assign(predicted = y_pred)
predicted_some = with_predictions['predicted'] == 'some'
# reset_index moves the old positional index into a regular column
test_filtered = with_predictions.loc[predicted_some].reset_index(level = 0)

In [None]:
# Pie chart for the model-selected rows (slices are plotted counter-clockwise)
explode = (0, 0.1)  # pull out the 2nd slice only
labels = ['Unprofitable', 'Profitable']

# number of distinct CustomAttribute1 values in each anyRevenue_future group
selected_per_group = test_filtered.groupby('anyRevenue_future')['CustomAttribute1']
sizes = selected_per_group.nunique().tolist()

fig2, ax1 = plt.subplots(figsize = (8, 7))
ax1.pie(
    sizes,
    labels = labels,
    explode = explode,
    autopct = '%1.0f%%',  # whole-number percentages
    startangle = 90,
    shadow = True,
)
# equal aspect ratio so the pie is drawn as a circle
ax1.axis('equal')
plt.show()

## Plot average revenue per retailer

In [None]:
# Mean future revenue per retailer: all test rows (baseline) versus the rows
# the model selected
bars = ('Baseline', 'Model')
baselineRevenue = baseline_test['Revenue_future'].mean()
modelRevenue = test_filtered['Revenue_future'].mean()
height = [baselineRevenue, modelRevenue]
y_pos = np.arange(len(bars))

# Single axes on a white background
fig, ax = plt.subplots(figsize = (8, 7))
ax.set_facecolor('white')

# One bar per group
ax.bar(y_pos, height, color = ['b', 'r'], edgecolor = 'black')

# Frame: all four spines visible, bottom and left drawn in black
for side in ('right', 'top'):
    ax.spines[side].set_visible(True)
for side in ('bottom', 'left'):
    ax.spines[side].set_color('black')

# Axis ticks and labels
plt.xticks(y_pos, bars)
ax.tick_params(axis = "y", which = "both", direction = "out", left = True, right = False)
ax.set_ylabel('£ per retailer')
ax.set_yticks(np.arange(100, 1800, step = 200))

# Write the rounded-down value above each bar; the horizontal offset grows
# slightly with the bar position
for position, (x_center, value) in enumerate(zip(y_pos, height)):
    ax.text(x_center - 0.14 - (position * 0.04), value + 50, '£' + str(int(value)))

# Show graphic
plt.show()