### LSE Data Analytics Online Career Accelerator 
# Course 301: Advanced Analytics for Organisational Impact

This Jupyter Notebook is based on a video demonstration by your course convenor, Dr James Abdey. It shows how to create random forests to predict the likelihood of a new customer ‘churning’, that is, leaving a specific service provider.

# 1. Prepare your workstation

In [None]:
#  Import all the necessary packages.
import pandas as pd
import numpy as np
import statsmodels.api as sm 
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report 
from sklearn import svm 

# Read the data file with Pandas.
df = pd.read_csv('customer_data.csv')

# Sense-check the data.
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
df.info()
df.head()

# 2. Prepare the data

In [None]:
# Update all the details of the education column.
# Each pair is (substring to look for in 'Edu', consolidated label).
# The pairs are applied in this order, one after the other, so a row that
# was relabelled by an earlier pair is re-tested against the later ones.
edu_relabels = (
    ('basic', 'pre-school'),
    ('university', 'uni'),
    ('high', 'high-school'),
    ('professional', 'masters'),
    ('illiterate', 'other'),
    ('unknown', 'other'),
)

for edu_pattern, edu_label in edu_relabels:
    # A single .loc call selects rows and column together, so the write
    # lands on df itself. The chained form df['Edu'][mask] = ... assigns
    # through an intermediate object: it raises SettingWithCopyWarning and
    # does not update df when pandas Copy-on-Write is active.
    # na=False treats missing 'Edu' values as "no match" so they are left
    # untouched instead of producing an invalid (NaN-containing) mask.
    edu_mask = df['Edu'].str.contains(edu_pattern, na=False)
    df.loc[edu_mask, 'Edu'] = edu_label

df['Edu'].unique()

# 3. Create dummy variables

In [None]:
# Categorical columns that are converted to dummy (indicator) variables.
cat_vars = [
    'Occupation', 'Status', 'Edu', 'House', 'Loan',
    'Comm', 'Month', 'DOW', 'Last_out',
]

# Work through the categorical columns one at a time.
for var in cat_vars:
    # Build the indicator columns for this variable; each new column is
    # named '<var>_<category>' because of the prefix.
    cat_list = pd.get_dummies(df[var], prefix=var)
    # Append the indicator columns and carry the widened DataFrame forward
    # (df1 and df refer to the same joined result).
    df = df1 = df.join(cat_list)

# List every column name now present in the DataFrame.
df_vars = df.columns.tolist()

# Keep every column except the original categorical ones.
to_keep = [col for col in df_vars if col not in cat_vars]

# Define the final DataFrame from the retained columns.
df_fin = df[to_keep]

# Show the retained column names.
df_fin.columns.values

# 4. Balance the data

In [None]:
# Replace any missing values with 0 before sampling.
df_fin = df_fin.fillna(0)

# Select only the necessary columns and variables:
nec_cols = [
    'Status_divorced', 'Status_married', 'Status_single', 'Status_unknown',
    'Edu_high-school', 'Edu_masters', 'Edu_other', 'Edu_pre-school',
    'Edu_uni',
    'House_no', 'House_unknown', 'House_yes',
    'Loan_no', 'Loan_unknown', 'Loan_yes',
    'DOW_fri', 'DOW_mon',
]

# Independent variables: the selected indicator columns.
X = df_fin[nec_cols]
# Dependent variable: the 'Target' column, kept as a one-column DataFrame.
is_target = df_fin.columns == 'Target'
y = df_fin.loc[:, is_target]

# Hold back 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# [4a] Apply SMOTE as the target variable is not balanced.
os = SMOTE(random_state=0)

# Remember the feature names for the oversampled DataFrame.
columns = X_train.columns

# Oversample the training rows only; the test rows are not resampled.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)

# Wrap the oversampled features and target in DataFrames.
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Target'])

# Report the size of the oversampled data and show the target values.
print("length of oversampled data is ", len(os_data_X))
os_data_y

In [None]:
# Count the rows in each Target class of the oversampled data;
# equal counts mean the classes are balanced.
os_data_y.loc[:, 'Target'].value_counts()

# 5. Build and fit the model

In [None]:
# Import the RandomForestClassifier class.
from sklearn.ensemble import RandomForestClassifier

# Create a forest object based on the RandomForestClassifier.
# max_features='sqrt' is what the old 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# raises an error, so the explicit value is used instead.
forest = RandomForestClassifier(n_estimators=200, criterion='gini',
                                min_samples_split=2, min_samples_leaf=2,
                                max_features='sqrt', bootstrap=True, n_jobs=-1,
                                random_state=42)

# Train and predict the model.
# np.ravel flattens the target to the 1-D shape scikit-learn expects; passing
# a one-column DataFrame triggers a DataConversionWarning.
# NOTE(review): the model is fitted on X_train/y_train; the SMOTE output
# (os_data_X, os_data_y) is not used here - confirm whether that is intended.
forest.fit(X_train, np.ravel(y_train))
y_pred = forest.predict(X_test)

# Import scikit-learn metrics module for accuracy calculation.
from sklearn import metrics

# Model accuracy, how often is the model correct?
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

# 6. Visualise the model

In [None]:
# Import the necessary packages:
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import export_graphviz

# Create a single-panel, high-resolution figure to draw on.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)

# Draw the first tree of the fitted forest on that panel, with filled nodes.
first_tree = forest.estimators_[0]
tree.plot_tree(first_tree, filled=True, ax=axes);

# Save the plot to an image file.
fig.savefig('rf_individualtree.png')

# 

# Create a confusion matrix

# Accuracy of model

# Determine feature significance