In [None]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from graph_utils import show_frequency, show_numerical_attribute_distribution, show_categorical_attribute_histogram, show_numerical_corellation, show_categorical_corellation
from logistic import MyLogisticRegression
from decision_tree_utils import MyDecisionTreeClassifier

# load the dataset; the relative path is resolved against the current working directory
DATASET_PATH = 'dataset.csv'
df: pd.DataFrame = pd.read_csv(DATASET_PATH)

In [None]:
# data exploration
# Target frequency first, then the per-attribute views; each helper comes from
# graph_utils and receives the full frame.
show_frequency(df, 'Revenue')
for show_attribute_view in (show_numerical_attribute_distribution, show_categorical_attribute_histogram):
    show_attribute_view(df)

In [None]:
# correlation
# Run the numerical helper first, then the categorical one, both on the full frame.
for show_correlation in (show_numerical_corellation, show_categorical_corellation):
    show_correlation(df)

In [None]:
# testing
# Compares every (scaler, model) pair on the same set of random train/test splits
# and prints the accuracy of each split plus a mean/std summary per model.

# Experiment constants.
SEED = 42        # base seed: split i uses SEED + i, so reruns and all scalers see identical splits
N_SPLITS = 10    # number of random train/test splits per (scaler, model) pair
# NOTE(review): 0.8 holds out 80% of the rows for testing and trains on only 20%.
# Confirm this is intentional and not an inverted 0.2.
TEST_SIZE = 0.8
TARGET = 'Revenue'

scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]

# encode categorical attributes
# This rewrites `df` in place, so after this cell `df` holds integer codes instead of
# the original strings. The encoder is refitted for every column by fit_transform.
label_encoder = LabelEncoder()
categorical_attributes = df.select_dtypes(include=object).columns
for attribute in categorical_attributes:
    df[attribute] = label_encoder.fit_transform(df[attribute])

# Separate features from the target BEFORE scaling so the target is never scaled
# (a scaled target is no longer a set of class labels).
features = df.drop(TARGET, axis=1)
numerical_attributes = features.select_dtypes(include=np.number).columns
# Positions of the numerical columns inside the feature matrix.
numerical_idx = np.flatnonzero(features.columns.isin(numerical_attributes))
X = features.values.astype(np.float64)
T = df[TARGET].values.astype(np.float32)

# generate data splits
# Generated once, on unscaled data, with fixed seeds: every scaler and every model
# is evaluated on exactly the same rows.
splits = [
    train_test_split(X, T, test_size=TEST_SIZE, random_state=SEED + i)
    for i in range(N_SPLITS)
]

for scaler in scalers:
    print(f'\n======== Using {type(scaler).__name__} ========')
    # scale numerical attributes
    # The scaler is fitted on the training part only and then applied to the test
    # part, so no test-set statistics leak into training.
    scaled_splits = []
    for X_train, X_test, T_train, T_test in splits:
        X_train, X_test = X_train.copy(), X_test.copy()  # keep the raw splits intact for the next scaler
        if numerical_idx.size:
            X_train[:, numerical_idx] = scaler.fit_transform(X_train[:, numerical_idx])
            X_test[:, numerical_idx] = scaler.transform(X_test[:, numerical_idx])
        scaled_splits.append((X_train.astype(np.float32), X_test.astype(np.float32), T_train, T_test))

    # train and test models
    # NOTE(review): MyDecisionTreeClassifier is built from the full, unscaled `df` and the
    # full target `T`; its constructor contract is not visible here - verify it does not
    # use test rows during fit.
    # NOTE(review): each model instance is refitted on every split; this assumes `fit`
    # fully resets state for the project-local models - confirm.
    models = [
        MyLogisticRegression(),
        LogisticRegression(),
        MyDecisionTreeClassifier(df, T),
        DecisionTreeClassifier(random_state=SEED),
    ]
    for model in models:
        print(f'\n== Using {type(model).__name__} ==\n')
        accuracies = []
        for X_train, X_test, T_train, T_test in scaled_splits:
            model.fit(X_train, T_train)
            Y = model.predict(X_test)
            acc = accuracy_score(T_test, Y)
            accuracies.append(acc)
            print(f'Accuracy: {acc}')
        print(f'Mean accuracy: {np.mean(accuracies):.4f} (std {np.std(accuracies):.4f})')