# ECON1611 - Assessment 2: Empirical Project in Python (Individual)

In [None]:
# Module imports
# import modules for use below
# (Note: Modules specific to particular machine learning examples are imported in the relevant code block)
# 'as' allows use of an abbreviated module name
import matplotlib.pyplot as plt     # Matplotlib for low-level plot details
import numpy as np                  # NumPy for fast numeric operations
import pandas as pd                 # Pandas for datasets
import seaborn as sns               # Seaborn for easier plotting

In [None]:
# Seed value shared by the cells below to allow reproducibility: it is passed as
# `random_state` to train_test_split and to the decision-tree estimators.
# NOTE: assigning this variable does not by itself seed NumPy or any other RNG.
seed = 12345

In [None]:
# from google.colab import files  # uncomment if using colab
# uploaded = files.upload()  # uncomment if using colab

In [None]:
# Read the data into a pandas dataframe for further analysis
# (fields are parsed as semicolon-separated, hence sep=';')
df = pd.read_csv('./bank-additional.csv', sep=';')

In [None]:
# View the first few rows to check all looks normal
df.head()

In [None]:
# Depending on your notebook settings not every column may be displayed;
# setting `display.max_columns` to None removes the column display limit.
pd.set_option('display.max_columns', None)
# include='all' summarises the text columns as well as the numeric ones
df.describe(include='all')

## Data Cleaning
Often the data provided is not in the final form we want to work with. Data cleaning refers to the process whereby we transform the initial data into the final form we want/need to work with.
The following steps do not form part of the assessment, but are a demonstration of some of the steps which may be involved in data cleaning.
In the following example this process includes:
- Removing unneeded data
- Transforming data - in this case, transforming categorical data into binary data (one hot encoding) 

**Notes:**

_-When working with Pandas axis=0 means a row operation and axis=1 means a column operation._

_- A subset of columns can be selected by passing a list of column names, e.g. `df[['var1', 'var2']]`._

### Remove unneeded data
Unneeded data unnecessarily complicates the machine learning pipeline. Here we drop entire columns which are unneeded

In [None]:
# Note - `inplace=True` affects the current dataframe directly
#  - otherwise we would need to assign the returned dataframe to a new variable

# df.drop(['contact', 'month', 'day_of_week', 'duration', 'campaign', ], axis=1, inplace=True)

### Encode data
Data encoding, or transformation, involves changing the form of the data

**Note:**

_sklearn has its own preprocessing classes (e.g. LabelBinarizer, OneHotEncoder) which can be used as part of a data processing pipeline to do this. For the sake of those unfamiliar with data processing in Python we have used simpler pandas methods here._

#### Binary Variables

In [None]:
# `y` is currently a text column. Encode it as an integer column named
# 'y_encoded': 1 where the value is exactly 'yes', 0 for anything else.
# This column will be our final target array.
df['y_encoded'] = (df['y'] == 'yes').astype('int64')
df.head(n=50)

In [None]:
# Apply the same yes -> 1 / anything else -> 0 encoding to the remaining
# yes/no columns, producing 'is_default', 'is_housing' and 'is_loan'.
for source_col in ('default', 'housing', 'loan'):
    df[f'is_{source_col}'] = (df[source_col] == 'yes').astype('int64')

## Treated Variable

In [None]:
# Binary indicator: was the person contacted in the PREVIOUS marketing campaign?
# (pdays = 999 if they weren't contacted at all, so 999 -> 0 and any other value -> 1)
# NOTE(review): `treated` is reassigned in the following cells, so this
# definition only survives if those cells are skipped.
df['treated'] = (df['pdays'] != 999).astype('int64')
df[['treated', 'pdays']].head()
df['treated'].describe()

In [None]:
# Binary indicator: was the person contacted in the current marketing campaign?
# campaign <= 0 -> 0, anything else -> 1 (a missing value compares False, so it maps to 1).
# In this data everyone was contacted.
# NOTE(review): this overwrites the `treated` column assigned in the previous cell.
df['treated'] = df['campaign'].le(0).map({True: 0, False: 1})
df[['treated', 'campaign']].head()
df['treated'].describe()

In [None]:
# Set `treated` to the values of the `duration` column.
# NOTE(review): despite the name this is a straight copy of `duration`, not a 0/1
# indicator, and it replaces the `treated` values assigned in the cells above.
# `duration` is not dropped in the cells visible here, so both columns carry the
# same data into the feature matrix - confirm this is intended.
df['treated'] = df['duration']
df[['treated', 'duration']].head()

df.head(n=10)

In [None]:
df['treated'].describe()

In [None]:
# Remove columns that are not needed for the analysis.
# `inplace=True` modifies `df` directly; without it the returned dataframe
# would have to be assigned to a variable.
df.drop(columns=['contact', 'month', 'day_of_week', 'campaign'], inplace=True)

#### Nominal data
Nominal data is data with generally several categories, and for which there is no 'innate' ordering of the categories. This kind of data is generally encoded by creating a number of 'dummy' columns containing binary (yes/no) data - one column for each 'category' in the original data

In [None]:
# 'marital' is nominal data: several categories with no meaningful ordering.
# pandas `get_dummies` creates one binary column per category, each named
# 'marital_<category>'.
marital_dummies = pd.get_dummies(df['marital'], prefix='marital')

# show the original column side by side with the new dummy columns
marital_preview = pd.concat([df['marital'], marital_dummies], axis='columns')
marital_preview.head()

In [None]:
# discard the dummy for the 'unknown' marital status
marital_dummies = marital_dummies.drop(columns='marital_unknown')
# append the remaining dummies to the main dataframe
df = pd.concat([df, marital_dummies], axis='columns')
df.head()

In [None]:
# repeat for 'job'
job_dummies = pd.get_dummies(df['job'], prefix='job')
job_dummies.drop('job_unknown', axis=1, inplace=True)
# Merge into `df` (not `X`): `X` is rebuilt from `df.copy()` when the features
# are split out below, so dummies stored only in `X` would be silently lost.
df = pd.concat([df, job_dummies], axis=1)

# and 'poutcome' (every category is kept for this column)
poutcome_dummies = pd.get_dummies(df['poutcome'], prefix='poutcome')
df = pd.concat([df, poutcome_dummies], axis=1)

# and 'education'
# Note: education is an ordinal value (the categories have an innate order)
# and would usually be encoded using an OrdinalEncoder
# - for simplicity here we are again using simple binary dummy values
education_dummies = pd.get_dummies(df['education'], prefix='education')
education_dummies.drop('education_unknown', axis=1, inplace=True)
df = pd.concat([df, education_dummies], axis=1)
df.head()

### Split out our X and Y data
We split the dataset into a target array (here the column currently called `y_encoded`) and a features matrix (all other columns). By convention these are named `y` and `X` respectively.

In [None]:
# Target array: an independent copy of the encoded target column, so that `y`
# is not simply a view into the existing dataframe.
y = df['y_encoded'].copy()
# Features matrix: everything except the raw and encoded target columns.
# `drop` without `inplace` returns a new dataframe, leaving `df` untouched.
X = df.drop(columns=['y_encoded', 'y'])

In [None]:
# Drop the 'original' variables from the features matrix - each of them is now
# represented by the encoded columns created above.
encoded_source_columns = [
    'job', 'education', 'marital', 'default',
    'housing', 'loan', 'pdays', 'poutcome',
]
X.drop(columns=encoded_source_columns, inplace=True)

In [None]:
# Review the final dataframe
print(X.info())

In [None]:
# Review the final target
y.info()

## Assignment Questions
### 1) Summarise and describe the data
#### a) Print the first 20 rows of data

In [None]:
X.head(n=20)

#### b) Describe the data e.g. mean, median, standard deviation of all the variables

In [None]:
X.describe()

#### c) Count the number of observations in each response category 

In [None]:
# Count the number of observations in each response category.
# NOTE: this is easiest done on the data before encoding, e.g. 'marital'.
marital = df['marital']

# Are there any missing values?
print('Missing values?')
print(marital.isna().to_numpy().any())

# How many observations fall into each category?
print('Count of unique values')
print(marital.value_counts())


### 2) Graphing
#### a) Basic scatter plot of two features against each other 

In [None]:
# Scatter plot of two features against each other
# https://seaborn.pydata.org/generated/seaborn.scatterplot.html
scatter_ax = sns.scatterplot(data=X, x="nr.employed", y="euribor3m")
scatter_ax.set(
    title="nr.employed x euribor3m",
    xlabel="nr.employed",
    ylabel="euribor3m",
)
plt.show()

#### b) Histogram

In [None]:
# Bar chart of how many observations fall into each target class, with every
# bar annotated as "<count> (<percentage>%)".
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x=y, ax=ax)

# `value_counts` orders by frequency, whereas the bars are drawn in class-label
# order. Sorting by index makes the labels line up with their bars even when
# the larger class does not have the smaller label.
abs_vals = y.value_counts().sort_index()
rel_vals = y.value_counts(normalize=True).sort_index() * 100
labels = [f'{count} ({pct:.0f}%)' for count, pct in zip(abs_vals, rel_vals)]

ax.bar_label(container=ax.containers[0], labels=labels)
ax.set_title('Target distribution', fontsize=16, pad=20)

plt.show()

### 3) From the 20 inputs, choose the set of controls you will use for your machine learning models. Justify why you have excluded some variables. (1 mark) 

**Note:**

_Because we are doing this after the data cleaning step, there are now more than 20 inputs - so remember to take into account variables which may now have been one-hot encoded into multiple dummy variables._

In [None]:
# uncomment below and choose which columns to exclude
# NOTE: every name listed must be a column of `X`, otherwise `X.drop` raises a KeyError.
# ('y' and 'y_encoded' have already been removed from `X`, so they must not be listed.)
excluded_columns = []
# excluded_columns = ['age', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
#        'euribor3m', 'nr.employed', 'is_default', 'is_housing', 'is_loan',
#        'treated', 'marital_divorced', 'marital_married', 'marital_single',
#        'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success',
#        'education_basic.4y', 'education_basic.6y', 'education_basic.9y',
#        'education_high.school', 'education_illiterate',
#        'education_professional.course', 'education_university.degree']

# Features matrix actually used for modelling: `X` minus the excluded columns
X_final = X.drop(excluded_columns, axis=1)
X_final

### 4) Build a classification tree in Python
#### a) split sample into train and test  

In [None]:
# Training and Test Data
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.3,
    random_state=seed,
)

# Report the dimensions of each of the four sets
print(f"X_train is {X_train.shape}")
print(f"X_test is {X_test.shape}")
print(f"y_train is {y_train.shape}")
print(f"y_test is {y_test.shape}")

# NOTE: an "inconsistent number of samples" error from train_test_split means
# `X_final` and `y` do not have the same number of rows - presumably because
# cells were re-run out of order; re-run the notebook from the top so both are
# derived from the same `df`.

#### b) Without doing any pruning of the tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a Gini-criterion decision tree on the training split, then predict the
# held-out test rows.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# NOTE(review): max_depth=20 caps the tree depth; a fully unpruned tree would
# use max_depth=None - confirm the cap is intended for this question.
dt_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=20,
    min_samples_split=2,
    random_state=seed,
).fit(X_train, y_train)
y_predict = dt_model.predict(X_test)

#### c) Draw the tree

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree on a 12x12 inch figure, with nodes colour-filled.
# NOTE: for a deep tree this may take some time to complete
plt.figure(figsize=(12, 12))
tree_ax = plt.gca()
plot_tree(dt_model, filled=True, ax=tree_ax)
tree_ax.set_title("Unpruned Tree")
plt.show()

### 6) Calculate feature importance for each feature

In [None]:
# Print each feature's importance to 4 decimal places, one "name: value" per line.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.feature_importances_
# See the following for an explanation, but note this is a regression example so uses MSE rather than GINI
# https://towardsdatascience.com/feature-importance-in-decision-trees-e9450120b445
for feature, weight in zip(dt_model.feature_names_in_, dt_model.feature_importances_):
    print(f"{feature}: {weight:.4f}")

### 7) Do GridsearchCV to find the optimal tree and draw the tree

In [None]:
# Importing the sklearn implementation
# (DecisionTreeClassifier is imported here as well, so this cell does not rely
# on an earlier cell having been run)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid, searched exhaustively (50 x 10 combinations):
# - max_depth: the integers 1..50
# - min_samples_split: floats 0.1..1.0, which scikit-learn interprets as a
#   fraction of the training samples
param_grid = {
    'max_depth': np.linspace(1, 50, 50, dtype='int16'),
    'min_samples_split': np.linspace(0.1, 1.0, 10),
}

gs = GridSearchCV(
    estimator=DecisionTreeClassifier(criterion='gini', random_state=seed),
    param_grid=param_grid)

# Fit one tree per parameter combination (cross-validated on the training data)
gs.fit(X=X_train, y=y_train)

# Predicting the test set
y_pred = gs.predict(X_test)

# Draw the best tree. Feature names are taken from the matrix the tree was
# fitted on, so they stay correct when columns have been excluded from `X`.
# They are passed as a plain list because some scikit-learn releases reject a
# pandas Index here.
plt.figure(figsize=(20, 20))
plot_tree(gs.best_estimator_, feature_names=list(X_train.columns), filled=True)
plt.show()

In [None]:
# Accuracy of the test set
gs.score(X=X_test, y=y_test)

In [None]:
gs.best_params_

In [None]:
# Print each feature's importance in the best tree found by the grid search,
# to 4 decimal places, one "name: value" per line.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.feature_importances_
# See the following for an explanation, but note this is a regression example so uses MSE rather than GINI
# https://towardsdatascience.com/feature-importance-in-decision-trees-e9450120b445
best_tree = gs.best_estimator_
for feature, weight in zip(best_tree.feature_names_in_, best_tree.feature_importances_):
    print(f"{feature}: {weight:.4f}")

### 9) Run a LASSO model

In [None]:
#LASSO
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [None]:
# Here we use a scikit-learn pipeline, incorporating a standard scaler, rather than just the Lasso estimator
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('model',Lasso())
])

In [None]:
# Grid search over the Lasso penalty `alpha` (0.1 up to, but not including, 10
# in steps of 0.1), scored by negative mean squared error with 2-fold
# cross-validation. 'model__alpha' targets the `alpha` parameter of the
# pipeline step named 'model'.
search = GridSearchCV(
    estimator=pipeline,
    param_grid={'model__alpha': np.arange(0.1, 10, 0.1)},
    scoring="neg_mean_squared_error",
    cv=2,
)

In [None]:
# Run the grid search on the training data; the result is assigned to `_` so
# the notebook does not print the fitted object
_ = search.fit(X_train,y_train)

In [None]:
search.best_params_

In [None]:
# Coefficients of the Lasso step ('model') of the best pipeline found by the search
coefficients = search.best_estimator_.named_steps['model'].coef_

In [None]:
# Use the absolute value of each coefficient as its importance
# (a value of 0 means the Lasso dropped that feature)
importance = np.abs(coefficients)

In [None]:
importance

In [None]:
# Features the LASSO kept (non-zero coefficient). Names are taken from
# `X_train`, the matrix the model was fitted on, so the boolean mask always
# matches in length even when columns have been excluded from `X`.
np.array(X_train.columns)[importance > 0]

In [None]:
# Features the LASSO dropped (coefficient shrunk to exactly zero). Names are
# taken from `X_train`, the matrix the model was fitted on, so the boolean mask
# always matches in length even when columns have been excluded from `X`.
np.array(X_train.columns)[importance == 0]