In [None]:
# imports
import numpy as np
from csv_utils import df
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from graph_utils import show_frequency, show_numerical_attribute_distribution, show_categorical_attribute_histogram, show_numerical_corellation, show_categorical_corellation
from logistic import split_dataset, logistic_accuracy, MyLogisticRegression
from decision_tree_utils import MyDecisionTreeClassifier

In [None]:
# data exploration
# Exploratory plots over the shared frame `df` (imported from csv_utils). The three helpers
# come from graph_utils; what each one draws is assumed from its name — TODO confirm.
# This cell runs before the preprocessing cells, which overwrite df's columns in place.
show_frequency(df, 'Revenue')  # 'Revenue' is the column later used as the prediction target
show_numerical_attribute_distribution(df)
show_categorical_attribute_histogram(df)

In [None]:
# correlation
# Correlation views over `df`: one helper call for the numerical attributes and one for the
# categorical ones. Both helpers live in graph_utils; "corellation" is the spelling of the
# imported names and must be kept as-is here.
show_numerical_corellation(df)
show_categorical_corellation(df)

# Logistic Regression

In [None]:
# data preprocessing
label_encoder = LabelEncoder()
categorical_attributes = df.select_dtypes(include=object).columns
for attribute in categorical_attributes:
    df[attribute] = label_encoder.fit_transform(df[attribute])
numerical_attributes = df.select_dtypes(include=np.number).columns
scaler = 0
match scaler:
    case 0:
        scaler = MinMaxScaler()
    case 1:
        scaler = StandardScaler()
    case 2:
        scaler = RobustScaler()
for attribute in numerical_attributes:
    df[attribute] = scaler.fit_transform(df[attribute].values.reshape(-1, 1))

In [None]:
# Feature matrix: every column except the target, copied out as float32.
X = df.drop(columns='Revenue').to_numpy(dtype=np.float32, copy=True)
# Target vector: the 'Revenue' column, copied out as float32.
T = df['Revenue'].to_numpy(dtype=np.float32, copy=True)

In [None]:
# Pick the implementation under test: the hand-written model from the `logistic` module
# (MANUAL = True) or scikit-learn's LogisticRegression (MANUAL = False).
MANUAL: bool = True
if MANUAL:
    # Project helper; its split ratio and shuffling are defined in logistic.py.
    X_train, X_test, T_train, T_test = split_dataset(X, T)
    model = MyLogisticRegression()
else:
    # test_size is the held-out fraction: 0.2 keeps 80% of the rows for training
    # (0.8 would train on only 20%). random_state pins the shuffle so a
    # Restart & Run All reproduces the same split and accuracy.
    X_train, X_test, T_train, T_test = train_test_split(X, T, test_size=0.2, random_state=42)
    model = LogisticRegression()

In [None]:
# Train whichever model the MANUAL switch selected, using the training split.
model.fit(X_train, T_train)

In [None]:
# Predict on the held-out split, then score with the metric that matches the chosen
# implementation: the project's logistic_accuracy for the manual model, scikit-learn's
# accuracy_score otherwise.
Y = model.predict(X_test)
acc = logistic_accuracy(T_test, Y) if MANUAL else accuracy_score(T_test, Y)
print(f"Accuracy: {acc}")

# Decision Trees

In [None]:
# data preprocessing
# Min-max scale every numeric column of `df` in place, one column at a time.
numerical_attributes = df.select_dtypes(include=np.number).columns
minmax_scaler = MinMaxScaler()

for attribute in numerical_attributes:
    # Double brackets select a one-column frame, so the array is already shaped (n, 1).
    single_column = df[[attribute]].to_numpy()
    df[attribute] = minmax_scaler.fit_transform(single_column)

In [None]:
# Rebuild the float32 feature matrix (all columns but the target) and the float32
# target vector from the current state of `df`.
X = df.drop(columns='Revenue').to_numpy(dtype=np.float32, copy=True)
T = df['Revenue'].to_numpy(dtype=np.float32, copy=True)

In [None]:
# Pick the implementation under test: the hand-written tree from decision_tree_utils
# (MANUAL = True) or scikit-learn's DecisionTreeClassifier (MANUAL = False).
MANUAL: bool = True
if MANUAL:
    # Split the X/T built for this section, so the fit/predict cells below do not depend on
    # X_train/X_test left in the kernel by the logistic-regression cells.
    X_train, X_test, T_train, T_test = split_dataset(X, T)
    classes = np.unique(T)
    # NOTE(review): df.columns still contains the target column 'Revenue' — confirm that
    # MyDecisionTreeClassifier expects the target to be part of `attributes`.
    attributes = df.columns
    model = MyDecisionTreeClassifier("Revenue", classes, attributes)
else:
    # test_size is the held-out fraction: 0.2 keeps 80% of the rows for training
    # (0.8 would train on only 20%). random_state pins the shuffle for reproducibility.
    X_train, X_test, T_train, T_test = train_test_split(X, T, test_size=0.2, random_state=42)
    model = DecisionTreeClassifier()

In [None]:
# Train the decision-tree model chosen by the MANUAL switch, using the training split.
model.fit(X_train, T_train)

In [None]:
# Predict on the held-out split and report the fraction of labels predicted correctly.
Y = model.predict(X_test)
acc = accuracy_score(y_true=T_test, y_pred=Y)
print(f"Accuracy: {acc}")