In [None]:
# imports
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from graph_utils import show_frequency, show_numerical_attribute_distribution, show_categorical_attribute_histogram, show_numerical_corellation, show_categorical_corellation
from logistic import MyLogisticRegression
from decision_tree_utils import MyDecisionTreeClassifier

# Load the full dataset from 'dataset.csv'; the path is relative, so it is resolved
# against the kernel's current working directory.
df: pd.DataFrame = pd.read_csv('dataset.csv')

In [None]:
# data exploration
# All three helpers are imported from the project-local graph_utils module; what they
# render is defined there, not in this notebook.
# 'Revenue' is the column that is later split off as the prediction target.
show_frequency(df, 'Revenue')
# NOTE(review): presumably one plot per numerical / categorical column of df — confirm
# against graph_utils.
show_numerical_attribute_distribution(df)
show_categorical_attribute_histogram(df)

In [None]:
# correlation
# Helpers come from the project-local graph_utils module (the "corellation" spelling is
# the imported name and must be kept as-is).
# NOTE(review): presumably these show correlations among numerical and among categorical
# attributes respectively — confirm against graph_utils.
show_numerical_corellation(df)
show_categorical_corellation(df)

In [None]:
def print_metrics(T_test, Y):
    """Print accuracy, precision, recall and F1 for predictions ``Y`` against ``T_test``.

    Args:
        T_test: Ground-truth labels; must expose ``.values`` (e.g. a pandas Series).
            The labels are cast to float32 before being passed to the metric functions.
        Y: Predicted labels, passed to the metric functions unchanged.

    Returns:
        None. The four scores are written to stdout on a single line.
    """
    # Cast the ground truth once and reuse it for every metric.
    y_true = T_test.values.astype(np.float32)
    acc = accuracy_score(y_true, Y)
    prec = precision_score(y_true, Y)
    rec = recall_score(y_true, Y)
    f1 = f1_score(y_true, Y)
    print(f'Accuracy: {acc}. Precision: {prec}. Recall: {rec}. F1: {f1}')

In [None]:
# encode categorical attributes
# Every object-dtype column of df is replaced in place by integer codes. The single
# LabelEncoder instance is refitted for each column, so the codes are per-column.
label_encoder = LabelEncoder()
categorical_attributes = df.select_dtypes(include=object).columns
for attribute in categorical_attributes:
    df[attribute] = label_encoder.fit_transform(df[attribute])

# generate data splits
N_SPLITS = 10    # number of independent train/test splits to evaluate
TEST_SIZE = 0.2  # fraction held out for testing (was 0.8, which trained on only 20% of the rows)
SEED = 42        # base seed so Restart & Run All reproduces the same splits

X = df.drop('Revenue', axis=1)
T = df['Revenue']
# Take the numeric columns from X rather than df so the target can never end up in the
# list of columns to scale (which would raise a KeyError on the feature frames).
numerical_attributes = X.select_dtypes(include=np.number).columns
splits = [
    tuple(train_test_split(X, T, test_size=TEST_SIZE, random_state=SEED + split_index))
    for split_index in range(N_SPLITS)
]


def scale_features(X_train, X_test, scaler, columns):
    """Return scaled copies of the train/test feature frames.

    The scaler is fitted on the training rows only and then applied to the test rows,
    so no test statistics leak into the fit. The inputs are never modified.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame) with the same columns as ``X_train``.
        scaler: An object with ``fit_transform`` / ``transform`` methods, or ``None``
            to skip scaling.
        columns: Labels of the columns to scale.

    Returns:
        ``(scaled_train, scaled_test)``: new DataFrames; unscaled columns are copied as-is.
    """
    scaled_train = X_train.copy()
    scaled_test = X_test.copy()
    columns = list(columns)
    if scaler is None or not columns:
        return scaled_train, scaled_test
    # MinMax/Standard/Robust scalers work feature-by-feature, so fitting all columns in
    # one call is equivalent to refitting the scaler for each column separately.
    scaled_train[columns] = scaler.fit_transform(X_train[columns].values)
    scaled_test[columns] = scaler.transform(X_test[columns].values)
    return scaled_train, scaled_test


def fit_and_report(model, X_train, X_test, T_train, T_test):
    """Fit ``model`` on the training data, predict the test data and print the metrics.

    Features and training labels are passed to the model as float32 arrays.
    """
    model.fit(X_train.values.astype(np.float32), T_train.values.astype(np.float32))
    Y = model.predict(X_test.values.astype(np.float32))
    print_metrics(T_test, Y)


for i, (X_train, X_test, T_train, T_test) in enumerate(splits):
    print(f"\n======== SPLIT {i} ========")

    # Logistic regression: custom vs. sklearn implementation, under each scaler.
    for scaler in [MinMaxScaler(), StandardScaler(), RobustScaler()]:
        print(f'\n==== Using {type(scaler).__name__} ====')
        new_X_train, new_X_test = scale_features(X_train, X_test, scaler, numerical_attributes)
        for model in [MyLogisticRegression(), LogisticRegression()]:
            print(f'\n== Using {type(model).__name__} ==\n')
            fit_and_report(model, new_X_train, new_X_test, T_train, T_test)

    # Decision trees: sklearn vs. custom implementation, under each scaler and also
    # without scaling (None), for maximum depths 3..6.
    for scaler in [MinMaxScaler(), StandardScaler(), RobustScaler(), None]:
        print(f'\n==== Using {type(scaler).__name__} ====')
        new_X_train, new_X_test = scale_features(X_train, X_test, scaler, numerical_attributes)
        for depth in range(3, 7):
            for model in [DecisionTreeClassifier(), MyDecisionTreeClassifier(df, "Revenue")]:
                print(f'\n== Using {type(model).__name__} with depth {depth} ==\n')
                # Set after construction so both implementations are configured the same way.
                model.max_depth = depth
                fit_and_report(model, new_X_train, new_X_test, T_train, T_test)