# 3. Modeling

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Read the preprocessed dataset from its parquet file (path is relative to
# the notebook's working directory).
preprocessed_path = '../data/ml_dataset_preprocessed.parquet'
df_preprocessed = pd.read_parquet(preprocessed_path)

In [6]:
# Build the modeling frame: remove the sensor_1 .. sensor_5 columns, discard
# every row that still contains a missing value, and renumber the index so it
# is a clean 0..n-1 range afterwards.
sensor_columns = [f'sensor_{i}' for i in range(1, 6)]
df_modeling = (
    df_preprocessed
    .drop(columns=sensor_columns)
    .dropna()
    .reset_index(drop=True)
)

# Number of rows left for modeling (shown as the cell output).
len(df_modeling)

5151

In [7]:
# Every column except device_id, timestamp and label is used as a feature;
# the label column is the prediction target.
non_feature_columns = ['device_id', 'timestamp', 'label']
y = df_modeling['label']
X = df_modeling.drop(columns=non_feature_columns)

In [8]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is neither stratified on `y` nor grouped by
# device_id -- confirm that class balance and per-device leakage are not a
# concern for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test rows do not influence the
# statistics used for scaling.
# NOTE(review): X_train / X_test are overwritten with the scaled arrays, so
# the unscaled splits are no longer available after this cell.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [10]:
# Candidate classifiers, all seeded identically so results are reproducible.
RANDOM_STATE = 42

gradient_boosting = GradientBoostingClassifier(
    random_state=RANDOM_STATE,
)
logistic_regression = LogisticRegression(
    solver='lbfgs',
    random_state=RANDOM_STATE,
)
random_forest = RandomForestClassifier(
    random_state=RANDOM_STATE,
)

In [11]:
# Fit each classifier on the scaled training split, in the same order as they
# were created.
for estimator in (gradient_boosting, logistic_regression, random_forest):
    estimator.fit(X_train, y_train)

# `fit` returns the estimator itself, so ending on the last fitted model keeps
# the cell's displayed output unchanged.
random_forest

RandomForestClassifier(random_state=42)

In [12]:
# Predict labels for the held-out split with each fitted classifier.
y_pred_gb, y_pred_lr, y_pred_rf = (
    fitted_model.predict(X_test)
    for fitted_model in (gradient_boosting, logistic_regression, random_forest)
)

In [13]:
# Report accuracy and weighted F1 on the held-out split for every model.
# The report used to be the same three print statements copy-pasted per model;
# a single mapping + loop keeps the formatting in one place and makes adding a
# model a one-line change. The printed text is identical to the previous output.
predictions_by_model = {
    'Gradient Boosting': y_pred_gb,
    'Logistic Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
}

for position, (model_name, y_pred) in enumerate(predictions_by_model.items()):
    if position > 0:
        # Blank separator line between model sections (none after the last).
        print('')
    print(f'--- {model_name} ---')
    print(f"Accuracy: {round(accuracy_score(y_test, y_pred), 4)}")
    # average='weighted' weights each class's F1 by its support in y_test.
    print(f"F1 (weighted): {round(f1_score(y_test, y_pred, average='weighted'), 4)}")

--- Gradient Boosting ---
Accuracy: 0.9752
F1 (weighted): 0.9751

--- Logistic Regression ---
Accuracy: 0.9736
F1 (weighted): 0.9736

--- Random Forest ---
Accuracy: 0.9744
F1 (weighted): 0.9743
