In [None]:
# imports
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from graph_utils import show_frequency, show_numerical_attribute_distribution, show_categorical_attribute_histogram, show_numerical_corellation, show_categorical_corellation
from logistic import MyLogisticRegression
from decision_tree_utils import MyDecisionTreeClassifier

# Location of the raw dataset; kept in one named constant so it is easy to
# find and change without hunting for a string literal.
DATASET_PATH = 'dataset.csv'

# Load the full dataset once into `df`.
df: pd.DataFrame = pd.read_csv(DATASET_PATH)

In [None]:
def print_metrics(T_test, Y) -> tuple:
    """Compute, print and return classification metrics for one set of predictions.

    Args:
        T_test: Ground-truth labels. May be a pandas Series (as the callers in
            this notebook pass) or any array-like convertible to float32.
        Y: Predicted labels; forwarded unchanged to the scikit-learn metric
            functions.

    Returns:
        The tuple ``(accuracy, precision, recall, f1)``, in that order.
    """
    # Convert the ground truth a single time instead of once per metric.
    # np.asarray also accepts plain lists/ndarrays, not only objects that
    # expose a `.values` attribute.
    y_true = np.asarray(T_test, dtype=np.float32)

    acc = accuracy_score(y_true, Y)
    prec = precision_score(y_true, Y)
    recall = recall_score(y_true, Y)
    f1 = f1_score(y_true, Y)
    print(f'Accuracy: {acc}. Precision: {prec}. Recall: {recall}. F1: {f1}')
    return acc, prec, recall, f1

In [None]:
# generate data splits
# Features are every column except the target; the target is 'Revenue'.
X = df.drop('Revenue', axis=1)
T = df['Revenue']

# Column groups used for the exploratory plots of every split.
numerical_columns = ["Administrative", "Administrative_Duration", "Informational", "Informational_Duration", "ProductRelated", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues", "SpecialDay"]
categorical_columns = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]

splits = []
for i in range(10):
    # Seed every split with its own index: the ten partitions still differ
    # from one another, but a fresh "Restart & Run All" reproduces exactly the
    # same partitions (and therefore the same downstream metrics).
    X_train, X_test, T_train, T_test = train_test_split(X, T, train_size=0.8, random_state=i)
    splits.append((X_train, X_test, T_train, T_test))

    # Draw the same five diagnostic plots for the train part, then the test part.
    for part_name, X_part, T_part in (("train", X_train, T_train), ("test", X_test, T_test)):
        plot_label = f"{i} {part_name}"
        numerical_part = X_part[numerical_columns]
        categorical_part = X_part[categorical_columns]
        show_frequency(T_part, 'Revenue', plot_label)
        show_numerical_attribute_distribution(numerical_part, plot_label)
        show_categorical_attribute_histogram(categorical_part, plot_label)
        show_numerical_corellation(numerical_part, T_part, plot_label)
        show_categorical_corellation(categorical_part, T_part, plot_label)
    

# Per-configuration metric histories: each key is a model/scaler(/depth) name,
# each value is a list with one entry per data split.
accuracy_dict = {}
precision_dict = {}
recall_dict = {}
f1_dict = {}

numerical_attributes = ["Administrative", "Administrative_Duration", "Informational", "Informational_Duration", "ProductRelated", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues", "SpecialDay"]
categorical_attributes = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]


def _encode_categoricals(X_train, X_test, columns):
    """Label-encode `columns` with ONE encoder per column shared by train and test.

    Fitting a separate encoder on the test set (as done previously) can assign
    a different integer to the same category in train and test. Here each
    encoder learns its vocabulary from the values of both parts, so the
    mapping is identical for both and no test-only category is "unseen".

    Returns:
        Encoded copies ``(X_train_encoded, X_test_encoded)``; the inputs are
        left unmodified.
    """
    train_encoded = X_train.copy()
    test_encoded = X_test.copy()
    for column in columns:
        encoder = LabelEncoder()
        encoder.fit(pd.concat([X_train[column], X_test[column]]))
        train_encoded[column] = encoder.transform(X_train[column])
        test_encoded[column] = encoder.transform(X_test[column])
    return train_encoded, test_encoded


def _scale_features(X_train, X_test, scaler):
    """Scale all columns using statistics learned from the training set only.

    The scaler is fitted on `X_train` and the SAME fitted transform is applied
    to `X_test`; re-fitting on the test set would put the two parts on
    different scales. With ``scaler=None`` both frames are returned unchanged.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as DataFrames with the original
        columns and index.
    """
    if scaler is None:
        return X_train, X_test
    train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    return train_scaled, test_scaled


def _record_metrics(name, results):
    """Append one (accuracy, precision, recall, f1) tuple to the module-level metric dicts."""
    metric_stores = (accuracy_dict, precision_dict, recall_dict, f1_dict)
    for store, value in zip(metric_stores, results):
        store.setdefault(name, []).append(value)


for i, split in enumerate(splits):
    print(f"\n======== SPLIT {i} ========")
    X_train, X_test, T_train, T_test = split

    # The encoding does not depend on the scaler, so do it once per split.
    # The helper works on copies, so the stored split is never mutated.
    X_train_encoded, X_test_encoded = _encode_categoricals(X_train, X_test, categorical_attributes)

    # ---- logistic regression: custom vs. scikit-learn, for every scaler ----
    for scaler in [MinMaxScaler(), StandardScaler(), RobustScaler()]:
        print(f'\n==== Using {type(scaler).__name__} ====')
        # scale attributes (all columns, including the encoded categorical ones)
        new_X_train, new_X_test = _scale_features(X_train_encoded, X_test_encoded, scaler)
        for model in [MyLogisticRegression(), LogisticRegression()]:
            print(f'\n== Using {type(model).__name__} ==\n')
            # .astype() yields a fresh array for every model, as before.
            model.fit(new_X_train.values.astype(np.float32), T_train.values.astype(np.float32))
            Y = model.predict(new_X_test.values.astype(np.float32))
            results = print_metrics(T_test, Y)
            _record_metrics(f'{type(model).__name__}_{type(scaler).__name__}', results)

    # ---- decision trees: every scaler plus "no scaling", depths 3..6 ----
    for scaler in [MinMaxScaler(), StandardScaler(), RobustScaler(), None]:
        print(f'\n==== Using {type(scaler).__name__} ====')
        # scale attributes (skipped when scaler is None)
        new_X_train, new_X_test = _scale_features(X_train_encoded, X_test_encoded, scaler)
        for depth in range(3, 7):
            for model in [DecisionTreeClassifier(), MyDecisionTreeClassifier(df, "Revenue")]:
                print(f'\n== Using {type(model).__name__} with depth {depth} ==\n')
                # Depth is set via the attribute so both implementations are
                # configured through the same code path.
                model.max_depth = depth
                model.fit(new_X_train.values.astype(np.float32), T_train.values.astype(np.float32))
                Y = model.predict(new_X_test.values.astype(np.float32))
                results = print_metrics(T_test, Y)
                _record_metrics(f'{type(model).__name__}_{type(scaler).__name__}_depth_{depth}', results)

# Export every metric table to its own sheet of results.xlsx.
# Sheet name -> dict of per-configuration metric lists (one column per
# configuration, one row per split).
metric_tables = {
    'accuracy': accuracy_dict,
    'precision': precision_dict,
    'recall': recall_dict,
    'f1': f1_dict,
}

# The context manager saves and closes the workbook even if a write raises.
# The tables are deliberately NOT assigned to `df`: that name holds the loaded
# dataset read by the split-generation and model code, and rebinding it would
# break any re-run of those steps.
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, metric_values in metric_tables.items():
        pd.DataFrame(metric_values).to_excel(writer, sheet_name=sheet_name)