In [None]:
# imports
import numpy as np
from csv_utils import df
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from graph_utils import show_frequency, show_numerical_attribute_distribution, show_categorical_attribute_histogram, show_numerical_corellation, show_categorical_corellation
from logistic import split_dataset, train_logistic, predict_logistic, accuracy

In [None]:
# data exploration
# Calls the three exploration helpers imported from graph_utils on `df`.
# When the notebook runs top to bottom, this happens before the preprocessing
# cell encodes and rescales `df` in place, so the helpers see the frame exactly
# as csv_utils provides it.
# NOTE(review): the helpers presumably render plots (target frequency for
# 'Revenue', then per-attribute distributions/histograms) -- confirm in graph_utils.
show_frequency(df, 'Revenue')
show_numerical_attribute_distribution(df)
show_categorical_attribute_histogram(df)

In [None]:
# correlation
# Calls the two correlation helpers imported from graph_utils on `df`, one for
# numerical and one for categorical attributes (names as spelled in graph_utils).
# When the notebook runs top to bottom, `df` has not been encoded or scaled yet
# at this point.
# NOTE(review): what each helper computes and displays is not visible here --
# confirm in graph_utils.
show_numerical_corellation(df)
show_categorical_corellation(df)

In [None]:
# data preprocessing
# NOTE: `df` is modified in place; every later cell reads the encoded and
# scaled frame, not the original values.

# Replace each object-dtype column with integer class codes. The single encoder
# is re-fit for every column, so once the loop ends it only remembers the
# classes of the last column it processed.
label_encoder = LabelEncoder()
categorical_attributes = df.select_dtypes(include=object).columns
for attribute in categorical_attributes:
    df[attribute] = label_encoder.fit_transform(df[attribute])

# This selection runs after the encoding step, so the freshly encoded columns
# are numeric by now and are rescaled together with the original numeric ones.
numerical_attributes = df.select_dtypes(include=np.number).columns
minmax_scaler = MinMaxScaler()
if len(numerical_attributes) > 0:
    # MinMaxScaler rescales every column independently, so one 2-D call yields
    # the same values as fitting column by column -- and leaves the scaler
    # fitted on all columns, which makes it reusable on new data.
    df[numerical_attributes] = minmax_scaler.fit_transform(df[numerical_attributes])
# To try a different scaling, swap MinMaxScaler for StandardScaler or
# RobustScaler (both already imported); they accept the same 2-D input.

In [None]:
# Separate the feature matrix from the target column and cast both to float32.
target_column = 'Revenue'
X = df.drop(columns=target_column).values.astype(np.float32)
T = df[target_column].values.astype(np.float32)

# split_dataset (from logistic) hands back four pieces, unpacked in the order
# train features, train targets, test features, test targets.
X_train, T_train, X_test, T_test = split_dataset(X, T)

In [None]:
# manual implementation
# Runs the project's own logistic-regression helpers (imported from logistic):
# fit on the training split, predict on the test split, then score.
# NOTE(review): W is presumably the learned weight vector -- its shape and
# whether it includes a bias term are not visible here; confirm in logistic.
W = train_logistic(X_train, T_train)
Y = predict_logistic(X_test, W)
# accuracy() receives the predictions first and the true targets second.
acc = accuracy(Y, T_test)
print(f"Accuracy: {acc}")

In [None]:
# scikit-learn implementation
# Same train/test split as the manual run, scored with the same accuracy()
# helper so the two numbers can be compared side by side.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, T_train)
Y = model.predict(X_test)  # overwrites the predictions of the manual run
acc = accuracy(Y, T_test)
print(f"Accuracy: {acc}")