In [1]:
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Loading the preprocessed dataset and turning the target into integer codes.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably
# a row index that was written to the CSV - confirm whether it should be read
# with index_col=0 instead.

df = pd.read_csv('Data/preprocessed_dataset.csv')

# LabelEncoder numbers the classes in sorted order; codes 0 and 1 are then
# swapped (any other code would be left as it is).
# After the swap the column no longer matches `le`'s own encoding, so
# le.inverse_transform() needs the swap undone before it is used.
le = LabelEncoder()
label_codes = pd.Series(
    le.fit_transform(df["happiness_classification"]),
    index=df.index,
)
df["happiness_classification"] = label_codes.replace({0: 1, 1: 0})

df.head()

Unnamed: 0,happiness_classification,norm_imports,norm_exports,fear,anger,surprise,sadness,disgust,joy,anticipation,year,country
0,0,13.643921,11.979343,0.04401,0.02934,0.03423,0.031785,0.026895,0.056235,0.100244,2005,Mexico
1,0,13.469564,15.55257,0.060606,0.02852,0.033868,0.035651,0.024955,0.057041,0.112299,2005,Japan
2,1,6.874739,7.213294,0.072381,0.049524,0.04,0.038095,0.034286,0.097143,0.091429,2005,Belgium
3,0,6.624081,4.183923,0.05984,0.043883,0.019947,0.041223,0.031915,0.070479,0.093085,2005,Pakistan
4,1,12.099429,11.116681,0.070225,0.043539,0.036517,0.036517,0.02809,0.081461,0.102528,2005,France


In [3]:
# Choosing the model inputs and the target.
# Only the two trade columns are used as features; the encoded happiness
# class is the label to predict.

feature_columns = ['norm_imports', 'norm_exports']

X = df[feature_columns]
y = df['happiness_classification']

In [4]:
# Holding out 20% of the rows as a test set.
# random_state pins the shuffle so the same split is produced on every run.
# No `stratify` argument is passed, so class proportions are not forced to
# match between the two parts.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [5]:
# Candidate hyperparameter values for the grid search.
# Every combination of the three lists below is evaluated.

param_grid = dict(
    max_depth=[1, 3, 5, 7, 10],
    min_samples_split=[5, 10, 15, 20, 50],
    min_samples_leaf=[1, 3, 5, 7, 10],
)

In [6]:
# Setting up the base model and the search that tunes it.
# The tree uses the Gini criterion with a fixed random_state; candidates are
# compared by accuracy under 10-fold cross-validation.

tree_clf = DecisionTreeClassifier(
    criterion='gini',
    random_state=5,
)

grid_search = GridSearchCV(
    estimator=tree_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)

In [7]:
# Running the grid search on the training split only.
# Each hyperparameter combination is scored by cross-validation; the test
# split is not touched here.

grid_search.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=5),
             param_grid={'max_depth': [1, 3, 5, 7, 10],
                         'min_samples_leaf': [1, 3, 5, 7, 10],
                         'min_samples_split': [5, 10, 15, 20, 50]},
             scoring='accuracy')

In [8]:
# Reporting the winning hyperparameters and their score.
# best_score_ is the mean cross-validated accuracy of the best candidate
# (measured on the held-out folds), not accuracy on the data it was fitted to.

for label, value in (
    ("Best Hyperparameters:", grid_search.best_params_),
    ("Best Accuracy:", grid_search.best_score_),
):
    print(label, value)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 15}
Best Accuracy: 0.7446501295816365


In [9]:
# Retrieving the best model found by the grid search.
#
# grid_search was created with the default refit=True, so best_estimator_ has
# already been refit on all of X_train / y_train using the winning
# hyperparameters. Calling fit() on it again would only repeat that work, so
# the model is used as returned.

best_tree = grid_search.best_estimator_

# Display the selected estimator and its hyperparameters.
best_tree

DecisionTreeClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=15,
                       random_state=5)

In [10]:
# Scoring the selected tree on the held-out test split.
# For a classifier, score() is the fraction of correctly predicted labels.

accuracy_on_test = best_tree.score(X=X_test, y=y_test)

print(
    "Accuracy on Test Set:",
    accuracy_on_test,
)

Accuracy on Test Set: 0.7783783783783784


In [11]:
# Exporting best model for use in evaluation.
#
# The target directory is created first if it is missing, so the export also
# works on a checkout where 'Models/' does not exist yet. The path handed to
# joblib.dump() is unchanged.

model_path = 'Models/trading_tree_clf.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(best_tree, model_path)

['Models/trading_tree_clf.pkl']