## Goldsmiths University of London
**Author....: Carlos Manuel de Oliveira Alves**<br>
**Student..: cdeol003**<br>
**Created..: 10/10/2022**

In [1]:
# Import all necessary libraries that we will use in this project
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Download the dataset for the project
data = data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'

!wget $data

In [None]:
# Preview the raw file with the shell: head prints the first 10 lines,
# i.e. the column names followed by the first 9 records
!head CreditScoring.csv

In [None]:
# Load the dataset into a pandas DataFrame
# (`data` is the URL defined above, so the CSV is read straight from the web)
df = pd.read_csv(data)

In [None]:
# Show the first 5 rows of our dataset
df.head()

In [None]:
# Preparing the data:

# The status column (like several others) is stored as numeric codes.
# Translating the codes into text labels makes their meaning easier to understand

In [None]:
# Normalise every column name to lower case, then preview the result
df.columns = [name.lower() for name in df.columns]
df.head()

In [None]:
# Count how many rows fall under each status code
df.status.value_counts()
# the counts also include the code 0 (an unknown status)

In [None]:
# Preview the status column translated to text with the map function:
# map looks every value of the Series up in the dictionary and returns
# the corresponding new value
# NOTE: the result is only displayed, it is NOT assigned back to df.status,
# so the column keeps its numeric codes (later cells compare status to 0 and 2)
df.status.map({1: 'ok', 2: 'default', 0: 'unk'}) # unk means unknown

In [None]:
# Check our dataframe again; status is still numeric here because the
# mapped Series from the previous cell was not stored
df.head()

In [None]:
# Decode the integer codes of home, marital, records and job into
# readable labels; in every mapping 0 stands for an unknown value
home_values = {
    0: 'unk', 1: 'rent', 2: 'owner', 3: 'private',
    4: 'ignore', 5: 'parents', 6: 'other',
}

marital_values = {
    0: 'unk', 1: 'single', 2: 'married', 3: 'widow',
    4: 'separated', 5: 'divorced',
}

records_values = {0: 'unk', 1: 'no', 2: 'yes'}

job_values = {0: 'unk', 1: 'fixed', 2: 'partime', 3: 'freelance', 4: 'others'}

# Apply each mapping to its column; codes missing from a mapping become NaN
column_labels = [
    ('home', home_values),
    ('marital', marital_values),
    ('records', records_values),
    ('job', job_values),
]
for column, labels in column_labels:
  df[column] = df[column].map(labels)

In [None]:
# Check the data frame after decoding home, marital, records and job
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

In [None]:
# Same summary, rounded with the round function for readability
df.describe().round()
# income, assets and debt show a very large max value (99999999);
# that number is a placeholder meaning the value is missing

In [None]:
# replace works like map but only touches the listed values:
# preview income with the placeholder 99999999 turned into NaN
# (the result is displayed only; df itself is updated in the next cell)
df.income.replace(to_replace=99999999, value=np.nan)

In [None]:
# Turn the placeholder 99999999 into NaN in every column that uses it
sentinel_columns = ['income', 'assets', 'debt']
df[sentinel_columns] = df[sentinel_columns].replace(99999999, np.nan)

In [None]:
# Check that the placeholder max value is gone from the dataframe
df.describe().round() 
# the max of income, assets and debt should no longer be 99999999

In [None]:
# Look at the status variable again: the code 0 marks a missing status
df.status.value_counts()
# rows with status 0 (unknown) are removed in the next cells

In [None]:
# Preview the data frame restricted to rows whose status is not 0
# (displayed only; the filter is applied for real in the next cell)
df[df.status != 0]

In [None]:
# Keep only the rows with a known status and renumber them from 0.
# drop=True discards the old index; without it reset_index() inserts the
# old index as a new 'index' column, which would later be fed to the model
# as a feature and would make this cell fail when run a second time
df = df[df.status != 0].reset_index(drop=True)
df

In [None]:
# Doing the train, validation and test split:

# Import train_test_split from scikit-learn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the rest is the full train set
# (random_state is fixed so the split is the same on every run)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Take 25% of the full train set for validation: 25% of 80% is 20% of all rows,
# so the final proportions are 60% train / 20% validation / 20% test
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [None]:
# Renumber the rows of each split from 0, discarding the shuffled index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Build the binary target of each split as a NumPy array:
# 1 when status is 2 (default), 0 otherwise
y_train, y_val, y_test = (
    (part.status == 2).astype('int').values
    for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target column from every split so it cannot be used as a feature
for frame in (df_train, df_val, df_test):
  del frame['status']

In [None]:
# Display the training features (the status column has been removed above)
df_train

In [None]:
# Applying Decision Trees:

# Create the assess risk function: a hand-written decision tree
def assess_risk(client):
  """Classify a client as 'default' or 'ok' with fixed, hand-made rules.

  Parameters
  ----------
  client : dict-like
      Must provide the keys 'records', 'job' and 'assets'.

  Returns
  -------
  str
      'default' or 'ok'.
  """
  # Branch 1: the client has a record -> the job type decides
  if client['records'] == 'yes':
    # The job mapping in this notebook spells the label 'partime';
    # 'parttime' is accepted too so the rule matches either spelling
    if client['job'] in ('partime', 'parttime'):
      return 'default'
    return 'ok'

  # Branch 2: no record -> more than 6000 in assets is 'ok'
  # (a missing/NaN assets value compares False and so yields 'default')
  if client['assets'] > 6000:
    return 'ok'
  return 'default'

In [None]:
# Convert the first row of the train dataframe into a dictionary
# (column name -> value) so it can be passed to assess_risk
xi = df_train.iloc[0].to_dict()
xi

In [None]:
# Apply the hand-written rules to this one customer
assess_risk(xi)

In [None]:
# Now we can train using the rules created above
# so we can learn these rules from the data using the decision tree algorithm
# and for that we are using the sklearn

# Import sklearn with the package Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Use dictionary vectorizer because we have categorical variables
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Import sklearn with the ROC AUC score
from sklearn.metrics import roc_auc_score

In [None]:
# Import the warnings module and silence every warning from here on
# NOTE(review): this blanket filter also hides deprecation warnings raised
# by the libraries used below
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Turn the train dataframe into a list of dictionaries (one per row);
# the list is converted into the feature matrix in the next cell
# fillna(0) replaces every NaN with 0 before the conversion
train_dicts = df_train.fillna(0).to_dict(orient='records')

In [None]:
# Create a DictVectorizer that returns a dense array (sparse=False)
dv = DictVectorizer(sparse=False)

# fit_transform learns the feature names from the train dictionaries
# and converts them to the feature matrix in one step
X_train = dv.fit_transform(train_dicts)

In [None]:
# Look at the feature names produced by the vectorizer
# (get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an array, hence list())
list(dv.get_feature_names_out())

In [None]:
# Create a decision tree with default settings (no depth limit)
dt = DecisionTreeClassifier()

# Train the model with the fit method
dt.fit(X_train, y_train)

In [None]:
# Test our model:

# Turn the validation dataframe into a list of dictionaries (NaN -> 0)
val_dicts = df_val.fillna(0).to_dict(orient='records')

# Only transform here: the vectorizer was already fitted on the train data
X_val = dv.transform(val_dicts)

In [None]:
# Apply the model to the validation matrix and keep column index 1 of
# predict_proba, i.e. the probability of the positive class (default)
y_pred = dt.predict_proba(X_val)[:, 1]

In [None]:
# Compute the ROC AUC score on the validation set
roc_auc_score(y_val, y_pred)

In [None]:
# Let's see the ROC AUC score for our train dataframe:

# Apply the model to X train and keep the probability of the positive class
# (column index 1)
y_pred = dt.predict_proba(X_train)[:, 1]

# Compute the ROC AUC score
roc_auc_score(y_train, y_pred)
# the author reports a score of 100% here; that looks excellent, but it means
# the tree memorised the training data (overfitting), which we don't want

In [None]:
# Retrain the tree so that it cannot simply memorise the training data;
# for that we limit the depth of the tree

# Create a decision tree with at most 3 levels
dt = DecisionTreeClassifier(max_depth=3)

# Train the model with the fit method
dt.fit(X_train, y_train)

In [None]:
# Apply the model to X train and keep the probability of the positive class
y_pred = dt.predict_proba(X_train)[:, 1]

# Compute the ROC AUC score
auc = roc_auc_score(y_train, y_pred)

# Print the result of the ROC AUC score for the train dataframe
print('Train:', auc)

# Apply the model to X validation and keep the probability of the positive class
y_pred = dt.predict_proba(X_val)[:, 1]

# Compute the ROC AUC score
auc = roc_auc_score(y_val, y_pred)

# Print the result of the ROC AUC score for the validation dataframe
print('Validation:', auc)

# in the author's run the validation score of this depth-limited tree is
# significantly better than that of the unrestricted tree: 73% compared to 65%

In [None]:
# Use sklearn tree that helps to visualise trees
from sklearn.tree import export_text

In [None]:
# Print the rules of the trained decision tree as text
# (without feature names the features are shown by position)
print(export_text(dt))

In [None]:
# Print the trained decision tree with readable feature names
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it, and list() turns its array into the list export_text expects)
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

In [None]:
# Decision Tree Learning Algorithm:

# Toy dataset for walking through the algorithm by hand:
# one [assets, status] row per customer
data = [
    [assets, status]
    for assets, status in (
        (8000, 'default'),
        (2000, 'default'),
        (0, 'default'),
        (5000, 'ok'),
        (5000, 'ok'),
        (4000, 'ok'),
        (9000, 'ok'),
        (3000, 'default'),
    )
]

In [None]:
# Convert the toy data to a dataframe (columns get default integer names)
df_example = pd.DataFrame(data)
df_example

In [None]:
# Rebuild the example dataframe, this time with proper column names
df_example = pd.DataFrame(data, columns=['assets','status'])
df_example

In [None]:
# Use the numerical column assets to train our decision tree by hand

# Sort the example dataframe by assets so candidate split points are easy to read
df_example.sort_values('assets')

# we want to come up with a rule of the form "assets > T" for some threshold T
# each candidate T cuts the dataframe into a left part (assets <= T)
# and a right part (assets > T)
# the largest value, 9000, is not a useful threshold because no row has
# assets above it, so the right part would be empty
# the remaining distinct values are the candidates: 2000, 3000, 4000, 5000 and 8000
# NOTE(review): the threshold list defined in the next cell also includes 0,
# which separates the single row with assets == 0 from all the others

In [None]:
# Candidate thresholds for the assets column
# (every distinct value in the toy data except the largest one, 9000)
Ts = [0, 2000, 3000, 4000, 5000, 8000]

In [None]:
# Import library display from Python
from IPython.display import display

In [None]:
# Cut the example dataframe at every candidate threshold so we can
# compare the resulting left and right parts and pick the best split
for T in Ts:

  # Show which threshold the following two tables belong to
  print(T)

  # Left part: assets less than or equal to the threshold
  df_left = df_example.loc[df_example['assets'] <= T]

  # Right part: assets strictly greater than the threshold
  df_right = df_example.loc[df_example['assets'] > T]

  # Show both parts, left first
  display(df_left)
  display(df_right)

  # Blank line between thresholds
  print()

In [None]:
# With many candidate splits we need a split evaluation criterion
# to decide which one is best

# Work through one example in detail: the threshold 4000
T = 4000

# Left part: assets less than or equal to the threshold
df_left = df_example.loc[df_example['assets'] <= T]

# Right part: assets strictly greater than the threshold
df_right = df_example.loc[df_example['assets'] > T]

# For each part (left first) show its rows, then the share of each status;
# normalize=True makes value_counts return ratios instead of absolute counts
for part in (df_left, df_right):
  display(part)
  print(part.status.value_counts(normalize=True))

In [None]:
# To judge how good a split is we can use the misclassification rate:
# the fraction of errors we make when we predict the majority class
# for every row on one side of the split.
# Left side (output above): 3 default and 1 ok. If we predict everyone
# as default we make one error out of 4 rows, so the misclassification
# rate on the left is 25%.
# Right side: 3 ok and 1 default. If we predict everyone as ok we again
# make one error out of 4, a misclassification rate of 25%.
# This lets us evaluate the quality of the split: for T = 4000 the rate is
# 25% on the left and 25% on the right, so the average is 25%, and we can
# say that using T = 4000 as the split gives an average misclassification
# rate of 25%.
# We don't have to take a plain average; we can take a weighted average.
# For example, with one row on the left and eight rows on the right it
# makes more sense to weight each side by its number of rows, so that
# the larger side counts for more.
# For simplicity we use the plain average here;
# in real life these algorithms use a weighted average.

In [None]:
# Repeat the evaluation above for every candidate threshold
for T in Ts:

  # Show which threshold the following output belongs to
  print(T)

  # Left part: assets less than or equal to the threshold
  df_left = df_example.loc[df_example['assets'] <= T]

  # Right part: assets strictly greater than the threshold
  df_right = df_example.loc[df_example['assets'] > T]

  # For each part (left first) show its rows, then the share of each status;
  # normalize=True makes value_counts return ratios instead of absolute counts
  for part in (df_left, df_right):
    display(part)
    print(part.status.value_counts(normalize=True))

  # Blank line between thresholds
  print()

In [None]:
# Extend the toy dataset with a second numerical feature, debt:
# each row is now [assets, debt, status]
data = [
    [8000, 3000, 'default'], [2000, 1000, 'default'],
    [0, 1000, 'default'], [5000, 1000, 'ok'],
    [5000, 1000, 'ok'], [4000, 1000, 'ok'],
    [9000, 500, 'ok'], [3000, 2000, 'default'],
]

# Rebuild the example dataframe with a name for every column
example_columns = ['assets', 'debt', 'status']
df_example = pd.DataFrame(data, columns=example_columns)
df_example

In [None]:
# Sort the example dataframe by debt to read off candidate thresholds
df_example.sort_values('debt')

In [None]:
# From the table above, debt can be split at 500, 1000 and 2000
# (every distinct debt value except the largest one, 3000)

# Map each numerical feature to its list of candidate thresholds
thresholds = {
    'assets': [0, 2000, 3000, 4000, 5000, 8000],
    'debt': [500, 1000, 2000]
}

In [None]:
# Evaluate every candidate split of every feature
for feature, Ts in thresholds.items():

  # Header: separator line followed by the feature name
  print('#######################')
  print(feature)

  # Try each candidate threshold of the current feature
  for T in Ts:

    # Show which threshold the following output belongs to
    print(T)

    # Left part: feature value less than or equal to the threshold;
    # right part: feature value strictly greater than the threshold
    feature_values = df_example[feature]
    df_left = df_example[feature_values <= T]
    df_right = df_example[feature_values > T]

    # For each part (left first) show its rows, then the share of each status;
    # normalize=True makes value_counts return ratios instead of absolute counts
    for part in (df_left, df_right):
      display(part)
      print(part.status.value_counts(normalize=True))

    # Blank line between thresholds
    print()

# Closing separator line after the last feature
print('#######################')

In [None]:
# Decision Tree Learning Algorithm:
# . Find the best split
# . Stop IF max_depth is reached
# . IF LEFT is sufficiently large
#        AND NOT pure:
#          REPEAT FOR LEFT
# . IF RIGHT is sufficiently large
#        AND NOT pure:
#          REPEAT FOR RIGHT

In [None]:
# Decision Tree Parameter Tuning:
# . Selecting max_depth
# . Selecting min_samples_leaf

In [None]:
# Iterate over different values of max depth
# None means no restriction: the tree grows as deep (as many layers) as possible
for d in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:

  # Train a decision tree limited to the current depth
  # (max_depth=d must be passed, otherwise every iteration trains the same
  # unrestricted tree and the comparison is meaningless)
  dt = DecisionTreeClassifier(max_depth=d)

  # Train the model with the fit method
  dt.fit(X_train, y_train)

  # Apply the model to X validation and keep the probability of the
  # positive class (column index 1)
  y_pred = dt.predict_proba(X_val)[:, 1]

  # Compute the ROC AUC score
  auc = roc_auc_score(y_val, y_pred)

  # Print the depth and the AUC rounded to 3 digits
  # %4s pads the depth to 4 characters so the output stays aligned
  print('%4s -> %.3f' % (d, auc))

# the author reports the best values at depths 4, 5 and 6 with about 76%,
# so our tree should have a depth of 4 to 6 layers

In [None]:
# For each of the best depth values (4, 5 and 6) try different
# min_samples_leaf values and record the validation AUC of every combination

# Collect one (max_depth, min_samples_leaf, auc) tuple per combination
scores = []

for d in [4, 5, 6]:
  for s in [1, 2, 5, 10, 15, 20, 100, 200, 500]:

    # Train a decision tree with the current depth and leaf-size limits
    dt = DecisionTreeClassifier(max_depth=d, min_samples_leaf=s)
    dt.fit(X_train, y_train)

    # Validation AUC from the probability of the positive class (column index 1)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)

    # Print the combination and its AUC rounded to 3 digits
    # (the width specifiers keep the output aligned), then store it
    result = (d, s, auc)
    print('(%4s, %3d) -> %.3f' % result)
    scores.append(result)

# Turn the collected results into a dataframe with named columns
columns = ['max_depth', 'min_samples_leaf', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
# Show the first 5 rows of the scores dataframe
df_scores.head()

In [None]:
# Show the 5 combinations with the highest AUC (sorted in descending order)
df_scores.sort_values(by='auc', ascending=False).head()

# in the author's run a depth of 6 comes out best
# once a limit is put on the size of the leaf

In [None]:
# Reshape the scores with pivot: one row per min_samples_leaf (the index),
# one column per max_depth, and the AUC in the cells
df_scores_pivot = df_scores.pivot(index='min_samples_leaf', columns=['max_depth'], values=['auc'])
df_scores_pivot.round(3)

# in the author's run min_samples_leaf = 15 gives the largest AUC, about 78%

In [None]:
# Visualise the pivoted scores as a heat map
# annot=True writes the value into every cell, formatted with 3 decimals
sns.heatmap(df_scores_pivot, annot=True, fmt='.3f')

# observations from the author's run:
# the worst values have the darkest background colour, about 68%
# the best values have the lightest background colour, about 78%
# the best score belongs to max_depth = 6 with min_samples_leaf = 15

In [None]:
# Train the final decision tree with the selected parameters
# max_depth=6 and min_samples_leaf=15
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)

# Train the model with the fit method
dt.fit(X_train, y_train)

In [None]:
# Import the package RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create a RandomForestClassifier made of 10 trees
# n_estimators is the number of trees (models) in the forest;
# random_state is fixed so the result is the same on every run
rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Train the model with the fit method
rf.fit(X_train, y_train)

In [None]:
# Apply the forest to X validation and keep the probability of the
# positive class (column index 1)
y_pred = rf.predict_proba(X_val)[:, 1]

In [None]:
# Compute the ROC AUC score of the forest on the validation set
roc_auc_score(y_val, y_pred)

In [None]:
# Log-probabilities predicted by the forest for the first validation row
# (indexing with [[0]] keeps the row two-dimensional)
rf.predict_log_proba(X_val[[0]])

In [None]:
# Iterate over forest sizes from 10 to 200 trees (in steps of 10) and
# check how the validation AUC changes as the number of trees grows

# Create list scores to store the results
scores = []

for n in range(10, 201, 10):

  # Create a RandomForestClassifier with n trees
  # (n_estimators=n must be passed, otherwise every iteration trains the
  # same 10-tree forest and all recorded scores are identical)
  rf = RandomForestClassifier(n_estimators=n, random_state=1)

  # Train the model with the fit method
  rf.fit(X_train, y_train)

  # Apply the model to X validation and keep the probability of the
  # positive class (column index 1)
  y_pred = rf.predict_proba(X_val)[:, 1]

  # Compute the ROC AUC score
  auc = roc_auc_score(y_val, y_pred)

  # Store the number of trees together with its AUC
  scores.append((n, auc))

In [None]:
# Create the scores dataframe (one row per n_estimators value) from the list
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])
df_scores

In [None]:
# Plot the validation AUC against the number of trees
plt.plot(df_scores.n_estimators, df_scores.auc)

In [None]:
# Create list scores to store the results
scores = []

# Try every combination of max depth and number of trees
for d in [5, 10, 15]:
  for n in range(10, 201, 10):

    # Create a RandomForestClassifier with n trees of depth at most d
    # (both loop variables must be passed, otherwise every iteration trains
    # the same default 10-tree forest and all recorded scores are identical)
    rf = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=1)

    # Train the model with the fit method
    rf.fit(X_train, y_train)

    # Apply the model to X validation and keep the probability of the
    # positive class (column index 1)
    y_pred = rf.predict_proba(X_val)[:, 1]

    # Compute the ROC AUC score
    auc = roc_auc_score(y_val, y_pred)

    # Store the depth and number of trees together with their AUC
    scores.append((d, n, auc))

In [None]:
# Column names matching the (max_depth, n_estimators, auc) tuples in scores
columns=['max_depth', 'n_estimators', 'auc']

# Create the scores dataframe from the list and show its first 5 rows
df_scores = pd.DataFrame(scores, columns=columns)
df_scores.head()

In [None]:
# Draw one AUC-versus-number-of-trees line per max depth
for d in [5, 10, 15]:

  # Keep only the scores obtained with the current max depth
  df_subset = df_scores.loc[df_scores['max_depth'] == d]

  # Plot this depth's line and label it for the legend
  plt.plot(
      df_subset['n_estimators'],
      df_subset['auc'],
      label='max_depth=%d' % d,
  )

# Display the legend of the plot
plt.legend()

In [None]:
# Fix the max depth at 10 for the following steps
# NOTE(review): presumably the best value read from the plot above — confirm
max_depth = 10