In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from mechlearn import split_and_scale as ss
from mechlearn import roc

In [None]:
# Load the feature matrix and the target; the target CSV is flattened to a 1-D array.
X = pd.read_csv('../Resources/X.csv')
y = np.ravel(pd.read_csv('../Resources/y.csv').to_numpy())
features = X.columns  # column labels of X, reused by later cells

In [None]:
# Display the column labels of X.
features

In [None]:
# Number of feature columns in X.
len(features)

## Results without any Dimensionality Reduction
To start, we consider the accuracy and the area under the ROC curve of the random forest classifier trained and tested on the full dataset (all features). This will serve as a reference for how subsequent models perform. 

In [None]:
# Baseline: fit a random forest on every feature and score it on the held-out part.
# NOTE(review): ss is assumed to return (train X, test X, train y, test y), already
# scaled — confirm against mechlearn.split_and_scale.
_Xs, Xs_, _y, y_ = ss(X, y)
rfc_1 = RFC().fit(_Xs, _y)
importances = rfc_1.feature_importances_  # kept for the importance plot in the next cell
acc_1 = rfc_1.score(Xs_, y_)
# Use the '%' presentation type rather than round(...)*100, which can print
# floating-point residue such as 95.82000000000001.
print(f'Random Forest (Using Whole Dataset)\nAccuracy: {acc_1:.2%}')

Using the 'feature_importances_' attribute of the RFC model, we can see which features contribute most to the splits made by the trees and are therefore most useful (or important) to the model.

In [None]:
# Horizontal bar chart of the forest's importances, one bar per feature column.
ticks = np.arange(len(features))
fig, ax = plt.subplots(figsize=(15, 15))
ax.barh(ticks, importances)
ax.set_yticks(ticks, labels=features)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature')
ax.set_title('Random forest feature importances (all features)')
plt.show()

Somewhat unsurprisingly, features that represent the customer's use of the credit, like 'Total_Trans_Ct' and 'Total_Trans_Amt', have the highest importance to the model. Conversely, demographic features like 'Customer_Age', 'Education_Level' and 'Income_Category' show lower importance to the model. While this could be partially explained by saying that customer use is a better indicator of attrition, it's also important to consider two things:
1. Many of the demographic features are encoded, meaning that an original feature like 'Gender' is represented by multiple features in the data, 'Gender_M' and 'Gender_F'. Because of this, the total importance of the original feature could be higher than what is shown for each individual encoded component feature.
1. 'Education_Level', 'Marital_Status' and 'Income_Category', all of which are demographics, contain 'Unknown' values, so part of the information in these features is missing. 

## 1. Using Only Demographics

In [None]:
# Keep just the demographics columns (age, education_level etc.): positions 0-1 and 14-32 of X.
Xr = X.iloc[:, np.r_[0:2, 14:33]]
featuresr = Xr.columns.tolist()  # labels of the selected columns, as a plain list

In [None]:
# Fit and score a forest on the demographics-only columns.
# Note: this rebinds _y and y_ to the labels of this new split.
_Xrs, Xrs_, _y, y_ = ss(Xr, y)
rfc_2 = RFC()
rfc_2.fit(_Xrs, _y)
rfc_2.score(Xrs_, y_)

In [None]:
# Number of columns kept in the demographics-only subset.
len(featuresr)

In [None]:
# Save the demographics-only subset; index=False leaves the row index out of the CSV.
Xr.to_csv('../Resources/X-dem.csv', index=False)

## 2. Using PCA

In [None]:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Split first, then fit the scaler and the PCA on the training rows only;
# the test rows are only transformed.
_Xd, Xd_, _yd, yd_ = tts(X, y)
ssd = SS()
_Xds = ssd.fit_transform(_Xd)
Xds_ = ssd.transform(Xd_)
pca = PCA(n_components=21)
_Xdsp = pca.fit_transform(_Xds)
Xdsp_ = pca.transform(Xds_)
# Share of variance captured by each retained component.
pca.explained_variance_ratio_

In [None]:
# Number of rows in components_, i.e. how many components the fitted PCA holds.
len(pca.components_)

In [None]:
# Train a forest on the PCA-projected training split and score it on the matching test split.
rfc_3 = RFC()
rfc_3.fit(_Xdsp, _yd)
# Score against yd_, the labels produced by the same tts(...) call as Xdsp_.
# The earlier y_ comes from a different split (ss(Xr, y)), so its rows do not
# line up with Xdsp_.
rfc_3.score(Xdsp_, yd_)

In [None]:
# `Xdsp` was never defined (only the train/test arrays _Xdsp and Xdsp_ exist, and
# NumPy arrays have no to_csv). Build it by projecting the full dataset with the
# scaler and PCA fitted above, then save it.
# NOTE(review): exporting the full dataset is assumed by analogy with X-dem.csv — confirm.
Xdsp = pd.DataFrame(
    pca.transform(ssd.transform(X)),
    columns=[f'PC{i + 1}' for i in range(pca.n_components_)],
)
Xdsp.to_csv('../Resources/X-pca.csv', index=False)

In [None]:
# Show the parameter settings of rfc_1, which was constructed with a bare RFC().
rfc_1.get_params()

In [None]:
def unit_vector(dimension, number_of_dimensions):
    """Build a list of zeros with a single 1 at index ``dimension``.

    The list has ``dimension`` zeros before the 1 and
    ``number_of_dimensions - dimension - 1`` zeros after it.

    Returns:
        A 1-tuple whose only element is that list, so callers read the
        vector with ``[0]``.
    """
    leading = [0] * dimension
    trailing = [0] * (number_of_dimensions - dimension - 1)
    return (leading + [1] + trailing,)

# Quick look at unit_vector: length of the vector, the entry at the chosen
# index, and the full return value, each on its own line.
print(
    len(unit_vector(5, 10)[0]),
    unit_vector(5, 10)[0][5],
    unit_vector(5, 10),
    sep='\n',
)

In [None]:
# Map each component basis vector back through the fitted PCA; row k of T is the
# image of the k-th unit vector in the standardized feature space the PCA was fitted on.
# The vector length must equal the number of fitted components: the PCA above was
# built with n_components=21, so the previously hard-coded 23 caused a shape mismatch.
n_comp = pca.n_components_
T = np.array([pca.inverse_transform(unit_vector(k, n_comp))[0] for k in range(n_comp)])
print(T)

In [None]:
# Dimensions of the matrix T built in the previous cell.
T.shape