# Machine learning process

Imports

In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
import os
import pandas as pd

Open data file

In [2]:
# Dataset location, expressed relative to the notebook's working directory.
DATA_DIR_PATH = os.path.join('..', 'data')
DATA_FILENAME = 't-shirts.csv'
DATA_FILE_PATH = os.path.join(DATA_DIR_PATH, DATA_FILENAME)

# Read the whole CSV into a single DataFrame used by all later cells.
df = pd.read_csv(filepath_or_buffer=DATA_FILE_PATH)

In [3]:
# Integer-encode every text column of `df` in place (this includes the target
# column if it is textual). Each fitted encoder is kept in `label_encoders`,
# keyed by column name, so the integer codes can later be mapped back to the
# original labels with label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in df.columns:
    column_dtype = df[column].dtype
    # Newer pandas versions may load text as a dedicated string dtype instead of
    # 'object'; accept both so such columns are not silently left unencoded.
    is_text_column = (
        pd.api.types.is_object_dtype(column_dtype)
        or isinstance(column_dtype, pd.StringDtype)
    )
    if is_text_column:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [4]:
# Separate the features from the 'demand' target column.
X = df.drop(columns='demand')
Y = df['demand']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Define data preprocessing methods

In [5]:
# Preprocessing variants to compare. The "none" entry is a placeholder: the loop
# below never calls it and passes the raw features through unchanged instead.
preprocessing_methods = {
    "none": lambda X: X,
    "normalization": MinMaxScaler(),
    "standardization": StandardScaler()
}

# Maps method name -> (train features, test features) after preprocessing.
processed_data = {}
for method, transformer in preprocessing_methods.items():
    # Start from the untouched features; only real scalers replace them.
    X_train_transformed, X_test_transformed = X_train, X_test
    if method != "none":
        # Fit on the training split only, then reuse the fitted scaler on the test split.
        X_train_transformed = transformer.fit_transform(X_train)
        X_test_transformed = transformer.transform(X_test)
    processed_data[method] = (X_train_transformed, X_test_transformed)

Define classifiers and their parameters

In [6]:
# Base estimators to tune, keyed by the name used in `param_grids` and in the reports.
classifiers = {
    'naive_bayes': CategoricalNB(),
    # Seeded (same value as the train/test split) so that tie-breaking between
    # equally good splits is repeatable and the notebook reproduces on re-run.
    'decision_tree': DecisionTreeClassifier(random_state=42),
}

# Hyper-parameter grids searched for each classifier; keys match `classifiers`.
param_grids = {
    'naive_bayes': {
        'fit_prior': [True, False],
        # The explicit prior has three entries, so it assumes the target has
        # exactly three classes -- TODO confirm against the data.
        'class_prior': [None, [0.3, 0.3, 0.4]],
        'alpha': [0.01, 0.1, 1.0]
    },
    'decision_tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 15],
        'min_samples_split': [2, 5, 10]
    },
}

Run a 5-fold cross-validated grid search for every classifier with every preprocessing method,
and keep the best estimator for each combination (naive Bayes with standardization is skipped)

In [7]:
# Maps (preprocessing method, classifier name) -> best estimator found by the grid search.
best_models = {}
for preproc_method, (X_train_proc, X_test_proc) in processed_data.items():
    for clf_name, clf in classifiers.items():
        # This one combination is deliberately left out of the comparison.
        # NOTE(review): presumably because standardized features can be negative,
        # which CategoricalNB does not accept -- confirm. The recorded scores for
        # naive Bayes with normalization are also much lower than without
        # preprocessing, so that combination may deserve the same treatment.
        if preproc_method == "standardization" and clf_name == "naive_bayes":
            continue

        grid_search = GridSearchCV(
            estimator=clf,
            param_grid=param_grids[clf_name],
            cv=5,
            scoring='accuracy',
        )
        grid_search.fit(X_train_proc, Y_train)

        best_models[(preproc_method, clf_name)] = grid_search.best_estimator_
        print(f"Best parameters for {clf_name} with {preproc_method} preprocessing: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy: {grid_search.best_score_}")

Best parameters for naive_bayes with none preprocessing: {'alpha': 0.01, 'class_prior': None, 'fit_prior': True}
Best cross-validation accuracy: 0.8135
Best parameters for decision_tree with none preprocessing: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Best cross-validation accuracy: 0.9698125
Best parameters for naive_bayes with normalization preprocessing: {'alpha': 0.01, 'class_prior': None, 'fit_prior': True}
Best cross-validation accuracy: 0.6893125
Best parameters for decision_tree with normalization preprocessing: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Best cross-validation accuracy: 0.9698125
Best parameters for decision_tree with standardization preprocessing: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Best cross-validation accuracy: 0.96975


Compare the best models on the held-out test set using accuracy and macro-averaged precision, recall and F1

In [8]:
# Score every tuned model on the test split that was preprocessed the same way
# as the data the model was trained on.
for (preproc_method, clf_name), model in best_models.items():
    # processed_data holds (train, test) pairs; only the test half is needed here.
    X_test_proc = processed_data[preproc_method][1]
    y_pred = model.predict(X_test_proc)

    # Precision, recall and F1 are averaged over classes with equal weight ("macro").
    macro_average = {'average': 'macro'}
    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, **macro_average)
    recall = recall_score(Y_test, y_pred, **macro_average)
    f1 = f1_score(Y_test, y_pred, **macro_average)

    print(f"Metrics for {clf_name} with {preproc_method} preprocessing:")
    print(f"\tAccuracy: {accuracy}")
    print(f"\tPrecision: {precision}")
    print(f"\tRecall: {recall}")
    print(f"\tf1: {f1}")

Metrics for naive_bayes with none preprocessing:
	Accuracy: 0.823
	Precision: 0.8261190506350203
	Recall: 0.7190346630461844
	f1: 0.7469323387753914
Metrics for decision_tree with none preprocessing:
	Accuracy: 0.972
	Precision: 0.9659443832312778
	Recall: 0.9593296714758556
	f1: 0.9625697038081192
Metrics for naive_bayes with normalization preprocessing:
	Accuracy: 0.69975
	Precision: 0.7370584988817893
	Recall: 0.6271518950763207
	f1: 0.657791981957296
Metrics for decision_tree with normalization preprocessing:
	Accuracy: 0.972
	Precision: 0.9659443832312778
	Recall: 0.9593296714758556
	f1: 0.9625697038081192
Metrics for decision_tree with standardization preprocessing:
	Accuracy: 0.972
	Precision: 0.9659443832312778
	Recall: 0.9593296714758556
	f1: 0.9625697038081192
