In [27]:
# imports

import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from torch.utils.data import DataLoader, TensorDataset
# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# Fix the global PyTorch and NumPy seeds so repeated runs give the same numbers.
torch.manual_seed(seed=1234)
np.random.seed(seed=1234)


In [28]:
# Read the feature and target CSVs for the training and test splits.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/X_train.csv", "../Data/y_train.csv")
)

X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/X_test.csv", "../Data/y_test.csv")
)


In [29]:
# Preprocessing: drop the id column, one-hot encode the categorical features,
# standardize the features, and flatten the targets to 1-D arrays.

categorical_cols = ["Gender", "Social_Media_Platform"]

X_train = X_train.drop(columns=["User_ID"])
X_test = X_test.drop(columns=["User_ID"])

# The training split alone decides which dummy columns exist and which level
# is the dropped baseline.
X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encode every level present in the test split (no drop_first here) and then
# align to the training columns. Applying drop_first independently to the test
# split can drop a different level than on the training split whenever the two
# splits do not contain the same categories; the reindex would then zero-fill
# that column and silently relabel those rows as the baseline level.
X_test = pd.get_dummies(X_test, columns=categorical_cols)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Fit the scaler on the training split only; reuse its statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Flatten the target frames into 1-D NumPy arrays.
y_train = y_train.values.flatten()
y_test = y_test.values.flatten()

# Neural Network Best Model Evaluation

In [30]:
# define model

class NeuralNet(nn.Module):
    """Fully connected regressor: two ReLU hidden layers and a single output unit.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of each of the two hidden layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Layer widths: input -> hidden -> hidden -> one output value.
        widths = [input_dim, hidden_dim, hidden_dim, 1]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # no activation after the output layer
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass `x` through the layer stack and return its output."""
        return self.layers(x)


In [31]:
# Load the best neural-network hyperparameters from CSV.

best_params = pd.read_csv("./best_params/best_nn_params.csv")

# Take the first row's value explicitly with .iloc[0]: calling int()/float()
# directly on a one-element Series is deprecated in recent pandas releases.
best_hidden = int(best_params["hidden_dim"].iloc[0])
best_lr = float(best_params["lr"].iloc[0])
best_batch = int(best_params["batch_size"].iloc[0])


In [32]:
# Refit the network with the selected hyperparameters on the training split.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# Targets become a column vector so they line up with the model's single-unit output.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
input_dim = X_train_tensor.shape[1]

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_batch, shuffle=True)

model = NeuralNet(input_dim=input_dim, hidden_dim=best_hidden)
optimizer = torch.optim.SGD(model.parameters(), lr=best_lr)
criterion = nn.MSELoss()

n_epochs = 100  # same number of epochs used in training
for _ in range(n_epochs):
    model.train()
    for features, targets in train_loader:
        # One SGD step per mini-batch: clear grads, forward, MSE loss, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()


In [33]:
# Score the retrained network on the test split.
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

model.eval()
with torch.no_grad():
    # Gradients are not needed for inference.
    test_output = model(X_test_tensor)
y_pred = test_output.numpy().flatten()

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Final Test Set Performance:")
print(f"   MSE: {mse:.4f}")
print(f"   R²:  {r2:.4f}")


Final Test Set Performance:
   MSE: 0.9281
   R²:  0.5413


# Support Vector Machine Evaluation

In [34]:
# Optional: load the best SVM parameters from CSV instead of entering them by
# hand. Kept as `#` comments rather than a bare string literal so the cell
# does not echo the text back as its output.
#
# best_params_svm = pd.read_csv("./best_params/best_svm_params.csv")
#
# best_C = float(best_params_svm["C"].iloc[0])
# best_gamma = float(best_params_svm["gamma"].iloc[0])
# best_kernel = best_params_svm["kernel"].iloc[0]

In [35]:
# Best SVM parameters entered by hand: {'C': 0.1, 'kernel': 'linear'}
best_C = 0.1
# gamma is not among the reported best parameters, so use scikit-learn's
# default 'scale'. The previous value 'kernel' is not a valid gamma setting
# (valid: 'scale', 'auto', or a float).
best_gamma = 'scale'
best_kernel = 'linear'

In [36]:
# Retrain the best SVM on the full training set.
# SVR's constructor does not accept `random_state`; passing it raises
# TypeError, so it is not passed here.
svm_model = SVR(
    C=best_C,
    gamma=best_gamma,
    kernel=best_kernel,
)
svm_model.fit(X_train_scaled, y_train)

In [None]:
# Score the fitted SVM on the test split and print a short report.
y_pred_svm = svm_model.predict(X_test_scaled)

mse_svm, r2_svm = mean_squared_error(y_test, y_pred_svm), r2_score(y_test, y_pred_svm)

rule = "=" * 50
print(rule)
print("Support Vector Machine (SVM) - Test Set Performance:")
print(rule)
print(f"Best Parameters: C={best_C}, gamma={best_gamma}, kernel='{best_kernel}'")
print(f"   MSE: {mse_svm:.4f}")
print(f"   R²:  {r2_svm:.4f}")
print(rule)

# ElasticNet Regression Evaluation

In [None]:
# Optional: load the best ElasticNet parameters from CSV instead of entering
# them by hand. Kept as `#` comments rather than a bare string literal so the
# cell does not echo the text back as its output.
#
# best_params_enet = pd.read_csv("./best_params/best_elasticnet_params.csv")
#
# best_alpha = float(best_params_enet["alpha"].iloc[0])
# best_l1_ratio = float(best_params_enet["l1_ratio"].iloc[0])

In [None]:
# Best ElasticNet parameters, entered by hand.
best_alpha, best_l1_ratio = 0.01, 0.9

In [None]:
# Retrain the best ElasticNet model on the full training set.
# ElasticNet lives in sklearn.linear_model and must be imported in the
# notebook's imports cell; without that import this cell raises NameError.
enet_model = ElasticNet(
    alpha=best_alpha,
    l1_ratio=best_l1_ratio,
    max_iter=10000,  # increased for convergence
    random_state=1234,
)
enet_model.fit(X_train_scaled, y_train)

In [None]:
# Score the fitted ElasticNet on the test split and print a short report.
y_pred_enet = enet_model.predict(X_test_scaled)

mse_enet, r2_enet = mean_squared_error(y_test, y_pred_enet), r2_score(y_test, y_pred_enet)

rule = "=" * 50
print(rule)
print("ElasticNet Regression - Test Set Performance:")
print(rule)
print(f"Best Parameters: alpha={best_alpha}, l1_ratio={best_l1_ratio}")
print(f"   MSE: {mse_enet:.4f}")
print(f"   R²:  {r2_enet:.4f}")
print(rule)

# Ensemble Methods Evaluation

In [None]:
# Note: Random Forest typically doesn't require feature scaling, so it is fit
# on the one-hot encoded but unscaled features. These names were previously
# only defined in comments (from a non-existent `X_train_processed`), which
# made the Random Forest cells fail with NameError.
# Cast to float so dummy columns and numeric columns form one homogeneous array.
X_train_final = X_train.values.astype(float)
X_test_final = X_test.values.astype(float)

In [None]:
# Optional: load the best Random Forest parameters from CSV instead of
# entering them by hand. Kept as `#` comments rather than a bare string
# literal so the cell does not echo the text back as its output.
#
# best_params_rf = pd.read_csv("./best_params/best_rf_params.csv")
#
# best_n_estimators = int(best_params_rf["n_estimators"].iloc[0])
# best_max_depth = int(best_params_rf["max_depth"].iloc[0]) if pd.notna(best_params_rf["max_depth"].iloc[0]) else None
# best_min_samples_split = int(best_params_rf["min_samples_split"].iloc[0])
# best_min_samples_leaf = int(best_params_rf["min_samples_leaf"].iloc[0])

In [None]:
# Random forest best parameters, entered by hand:
# {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
best_n_estimators, best_max_depth = 200, 10
best_min_samples_split, best_min_samples_leaf = 5, 2

In [None]:
# Refit the random forest with the chosen hyperparameters on the full training set.
rf_settings = {
    "n_estimators": best_n_estimators,
    "max_depth": best_max_depth,
    "min_samples_split": best_min_samples_split,
    "min_samples_leaf": best_min_samples_leaf,
    "random_state": 1234,
    "n_jobs": -1,  # use all available cores
}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train_final, y_train)


In [None]:
# Score the fitted random forest on the test split and print a short report.
y_pred_rf = rf_model.predict(X_test_final)

mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)

rule = "=" * 50
print(rule)
print("Random Forest (Ensemble) - Test Set Performance:")
print(rule)
print(f"Best Parameters: n_estimators={best_n_estimators}, max_depth={best_max_depth}")
print(f"                min_samples_split={best_min_samples_split}, min_samples_leaf={best_min_samples_leaf}")
print(f"   MSE: {mse_rf:.4f}")
print(f"   R²:  {r2_rf:.4f}")
print(rule)