# Creating classification graphs

In [1]:
import sys
import glob
import math
import json
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

### Accuracy score boxplot

In [2]:
# Display name -> directory holding that model's repeated_metrics.json.
# The first entry is the logistic regression classifier; it was previously
# mislabelled as "Linear Regression" on the plot.
repeated_metrics_dirs = {
    'Logistic Regression': 'logistic_regression',
    'Decision Trees': 'decision_tree_classifier',
    'Random Forest': 'random_forest_classifier',
    'XGBoost': 'xgboost_classifier',
}

# Collect the 'Validation accuracy score' column of every model, in the order
# the models are listed above.
accuracy_columns = []
for model_dir in repeated_metrics_dirs.values():
    # The context manager closes the file as soon as the JSON is parsed.
    with open(f'{model_dir}/repeated_metrics.json', 'r') as metrics_file:
        # The JSON's top-level keys become columns; transposing turns them into rows
        # so that each metric name becomes a column.
        repeated_metrics = pd.DataFrame(json.load(metrics_file)).transpose()
    accuracy_columns.append(repeated_metrics['Validation accuracy score'])

# One column per model. set_axis returns a relabelled frame; its `inplace`
# argument no longer exists in pandas 2.x, so the result is assigned instead.
validation_accuracy = pd.concat(accuracy_columns, axis=1).set_axis(
    list(repeated_metrics_dirs), axis=1
)

box_plot = px.box(
    validation_accuracy,
    title='Validation accuracy score of classification models trained over 1000 different seeds',
    labels={'value': 'Validation set accuracy', 'variable': 'Classification model'},
    orientation='h'
)
box_plot.update_layout(template='plotly_dark')
box_plot.write_image('../../../README-images/classification-box-plot.png', scale=20)
box_plot.show()

### Mean accuracy bar chart

In [3]:
# Display name -> directory holding that model's summary_metrics.json.
summary_metrics_dirs = {
    'Logistic Regression': 'logistic_regression',
    'Decision Trees': 'decision_tree_classifier',
    'Random Forest': 'random_forest_classifier',
    'XGBoost': 'xgboost_classifier',
}

# Mean validation accuracy per model, read from the 'mean' entry of the
# 'Validation accuracy score' section of each summary file.
validation_mean_dict = {}
for model_name, model_dir in summary_metrics_dirs.items():
    # The context manager closes the file as soon as the JSON is parsed.
    with open(f'{model_dir}/summary_metrics.json', 'r') as metrics_file:
        summary_metrics = json.load(metrics_file)
    validation_mean_dict[model_name] = summary_metrics['Validation accuracy score']['mean']

# Materialise the dict views as lists so plotly receives plain sequences.
model_names = list(validation_mean_dict.keys())
mean_scores = list(validation_mean_dict.values())

bar_chart = px.bar(
    x=mean_scores,
    y=model_names,
    title='Mean validation accuracy score of classification models trained over 1000 different seeds',
    labels={'x': 'Mean validation accuracy score', 'y': 'Classification model', 'color': 'Accuracy'},
    color=mean_scores,
    color_continuous_scale='solar',
    orientation='h',
    template='plotly_dark',
)
bar_chart.update_layout(yaxis={'categoryorder': 'total descending'})
# The x axis is restricted to [0.3, 0.4], so bars do not start at zero and
# differences between models appear magnified.
bar_chart.update_xaxes(range=[0.3, 0.4])
bar_chart.write_image('../../../README-images/classification-mean-accuracy-score.png', scale=20)
bar_chart.show()

### Mean F1 bar chart

In [4]:
# Display name -> directory holding that model's summary_metrics.json.
summary_metrics_dirs = {
    'Logistic Regression': 'logistic_regression',
    'Decision Trees': 'decision_tree_classifier',
    'Random Forest': 'random_forest_classifier',
    'XGBoost': 'xgboost_classifier',
}

# Mean validation F1 per model, read from the 'mean' entry of the
# 'Validation F1 score' section of each summary file.
validation_mean_dict = {}
for model_name, model_dir in summary_metrics_dirs.items():
    # The context manager closes the file as soon as the JSON is parsed.
    with open(f'{model_dir}/summary_metrics.json', 'r') as metrics_file:
        summary_metrics = json.load(metrics_file)
    validation_mean_dict[model_name] = summary_metrics['Validation F1 score']['mean']

# Materialise the dict views as lists so plotly receives plain sequences.
model_names = list(validation_mean_dict.keys())
mean_scores = list(validation_mean_dict.values())

bar_chart = px.bar(
    x=mean_scores,
    y=model_names,
    title='Mean validation F1 score of classification models trained over 1000 different seeds',
    labels={'x': 'Mean validation F1 score', 'y': 'Classification model', 'color': 'F1'},
    color=mean_scores,
    color_continuous_scale='solar',
    orientation='h',
    template='plotly_dark',
)
bar_chart.update_layout(yaxis={'categoryorder': 'total descending'})
# The x axis is restricted to [0.3, 0.38], so bars do not start at zero and
# differences between models appear magnified.
bar_chart.update_xaxes(range=[0.3, 0.38])
bar_chart.write_image('../../../README-images/classification-mean-f1-score.png', scale=20)
bar_chart.show()

### Test logistic regression model

In [5]:
# Features and target come from two separate CSV files. train_test_split pairs
# them by row position, so both files are assumed to list the same rows in the
# same order -- TODO confirm against the code that writes these files.
X = pd.read_csv('../../../project/dataframes/numerical_data.csv', index_col=0)
y = pd.read_csv('../../../project/dataframes/cleaned_dataset.csv', index_col=0)['category']

# Fixed seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 13

# Two-stage split: 20% is held out for test, then 25% of the remaining 80% is
# held out for validation, giving 60/20/20 train/validation/test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.25, random_state=SPLIT_SEED
)

model = LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_validation_pred = model.predict(X_validation)
y_test_pred = model.predict(X_test)

# Accuracy and macro-averaged F1 for each split.
accuracy_train = accuracy_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred, average='macro')
# The validation scores are computed but not included in the summary table below.
accuracy_validation = accuracy_score(y_validation, y_validation_pred)
f1_validation = f1_score(y_validation, y_validation_pred, average='macro')
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='macro')

# Named `metrics_by_split` rather than `dict`: rebinding the builtin would break
# every later `dict(...)` call in the same kernel session.
metrics_by_split = {
    'Train': {'accuracy': accuracy_train, 'F1': f1_train},
    'Test': {'accuracy': accuracy_test, 'F1': f1_test},
}
# Transpose so each split is a row and each metric a column.
df = pd.DataFrame(metrics_by_split).transpose()
df

Unnamed: 0,accuracy,F1
Train,0.449438,0.417647
Test,0.382022,0.346718


In [6]:
# Bar chart of the 'accuracy' column of `df`, one bar per row (split).
# The figure is named after the metric it shows; it was previously called
# `rmse_bar`, although no RMSE is computed in this notebook.
accuracy_bar = px.bar(
    df['accuracy'],
    title='Logistic Regression accuracy',
    labels={'index': '', 'value': 'Accuracy'},
    template='plotly_dark',
)
# Only a single series is plotted, so the legend is hidden.
accuracy_bar.update_layout(showlegend=False)
accuracy_bar.write_image('../../../README-images/train-test-accuracy.png', scale=20)
accuracy_bar.show()

# Same chart for the 'F1' column (previously named `r2_bar`).
f1_bar = px.bar(
    df['F1'],
    title='Logistic Regression F1 score',
    labels={'index': '', 'value': 'F1 score'},
    template='plotly_dark',
)
f1_bar.update_layout(showlegend=False)
f1_bar.write_image('../../../README-images/train-test-f1.png', scale=20)
f1_bar.show()