In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor

In [4]:
# Read the preprocessed feature matrix and its matching label vector from disk.
features_path = 'preprocessed_data/200_data_pts_features.npy'
labels_path = 'preprocessed_data/200_data_pts_labels.npy'
X = np.load(features_path)
y = np.load(labels_path)

In [5]:
# Hold out the last 30% of rows as the test set. shuffle=False keeps the
# original row order when choosing the split point, so no row from the tail
# of the file ends up in the training set.
# NOTE(review): presumably the rows are ordered (e.g. by time) — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Use a seeded generator so the shuffles below are identical on every
# Restart & Run All (the global np.random state was previously unseeded).
rng = np.random.default_rng(42)

# Shuffle the training data (features and labels with the same permutation)
train_permutation = rng.permutation(len(X_train))
X_train = X_train[train_permutation]
y_train = y_train[train_permutation]

# Shuffle the testing data (features and labels with the same permutation)
test_permutation = rng.permutation(len(X_test))
X_test = X_test[test_permutation]
y_test = y_test[test_permutation]

print("X_train:\n", X_train)
print("y_train:\n", y_train)
print("X_test:\n", X_test)
print("y_test:\n", y_test)

X_train:
 [[ 1.50000000e+01  1.90000000e-01  2.00000000e+01 ...  3.38000000e+02
  -2.49944000e+03  3.62855000e+03]
 [ 1.90000000e+02  9.30000000e-01  1.80000000e+02 ...  2.00000000e+00
  -1.60100000e+01  2.88990000e+02]
 [ 3.10000000e+02 -2.19500000e+01  2.64722222e+02 ...  4.60000000e+01
  -3.69310000e+02  7.57605000e+03]
 ...
 [ 2.50000000e+02  2.39780000e+02  2.50000000e+02 ...  4.90000000e+02
  -2.06368000e+03  2.22557000e+03]
 [ 2.50000000e+02 -8.23000000e+00  2.50000000e+02 ...  4.24000000e+02
  -5.74431000e+03  2.74332000e+03]
 [ 3.20000000e+02  2.97133000e+03  1.93529412e+02 ...  4.80000000e+01
  -3.36420000e+02  2.97133000e+03]]
y_train:
 [ 1220.7    125.14    62.83 ...  1292.35 -1632.6   1135.66]
X_test:
 [[ 3.50000000e+02  1.00920000e+02  2.64473684e+02 ...  1.98200000e+03
  -2.52180000e+03  2.79110000e+03]
 [ 1.70000000e+02  1.28067000e+03  1.00833333e+02 ...  2.10000000e+01
  -2.39600000e+01  2.47625000e+03]
 [ 2.50000000e+02  1.00000000e+01  2.16875000e+02 ...  1.34100000

## Normalize Data

In [6]:
# Shapes of the four splits, in order: X_train, y_train, X_test, y_test.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((1343, 23), (1343,), (576, 23), (576,))

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column to zero mean / unit variance.
# This mainly helps the neural network; tree ensembles such as XGBoost are
# largely insensitive to per-feature scaling.
scaler = StandardScaler()

# The statistics are learned from the training split only and then re-used
# for the test split, so no test information leaks into the scaler.
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Sanity check: scaling must not change the array shapes.
print("Normalized X_train shape:", X_train_normalized.shape)
print("Normalized X_test shape:", X_test_normalized.shape)

Normalized X_train shape: (1343, 23)
Normalized X_test shape: (576, 23)


In [8]:
# Standardize the regression target with its own scaler (fitted on the
# training labels only). StandardScaler expects 2-D input, hence the
# reshape of each label vector into a single column.
y_scaler = StandardScaler()
y_train_column = y_train.reshape(-1, 1)
y_test_column = y_test.reshape(-1, 1)
y_train_normalized = y_scaler.fit_transform(y_train_column)
y_test_normalized = y_scaler.transform(y_test_column)

print("Normalized y_train shape:", y_train_normalized.shape)
print("Normalized y_test shape:", y_test_normalized.shape)

Normalized y_train shape: (1343, 1)
Normalized y_test shape: (576, 1)


# XGBoost

In [9]:
from sklearn.metrics import mean_squared_error

# Baseline: gradient-boosted trees trained on the standardized features.
model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.3,
    random_state=42,
)
model.fit(X_train_normalized, y_train_normalized)

# Score on the held-out split; the MSE is in standardized target units.
y_pred = model.predict(X_test_normalized)
mse = mean_squared_error(y_test_normalized, y_pred)
print(f"MSE: {mse:.2f}")

MSE: 0.88


# XGBoost + PCA

In [12]:
# Passing a float in (0, 1) asks PCA to keep as many components as are needed
# to explain that fraction of the training variance (here 95%).
explained_variance_target = 0.95
pca = PCA(n_components=explained_variance_target)

# Fit the projection on the training split only, then apply it to both splits.
X_train_pca = pca.fit_transform(X_train_normalized)
X_test_pca = pca.transform(X_test_normalized)

In [None]:
from sklearn.metrics import mean_squared_error

# Same booster configuration as the plain XGBoost run, but fed the
# PCA-reduced features instead of the full standardized feature set.
booster_settings = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.3,
    "random_state": 42,
}
model = XGBRegressor(**booster_settings)

# Fit on the projected training split
model.fit(X_train_pca, y_train_normalized)

# Predict on the projected test split and report the error
# (MSE is in standardized target units).
y_pred = model.predict(X_test_pca)
mse = mean_squared_error(y_test_normalized, y_pred)
print(f"MSE: {mse:.2f}")

MSE: 0.65


# DNN

In [14]:
import torch
def _as_float32_tensor(array):
    """Copy a NumPy array into a new float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# PyTorch layers default to float32 weights, so cast all four splits.
X_train_normalized_tensor = _as_float32_tensor(X_train_normalized)
X_test_normalized_tensor = _as_float32_tensor(X_test_normalized)
y_train_normalized_tensor = _as_float32_tensor(y_train_normalized)
y_test_normalized_tensor = _as_float32_tensor(y_test_normalized)

In [15]:

import torch.nn as nn
import torch.optim as optim
class SimpleNN(nn.Module):
    """Small fully connected regressor: input -> hidden1 -> hidden2 -> 1.

    Both hidden layers use ReLU; the output layer is linear so the network
    can predict any real value.

    Args:
        input_dim: Number of features per sample. Defaults to 23, the
            feature count the notebook originally hard-coded.
        hidden1: Width of the first hidden layer (default 64).
        hidden2: Width of the second hidden layer (default 32).
    """

    def __init__(self, input_dim=23, hidden1=64, hidden2=32):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden1)
        self.layer2 = nn.Linear(hidden1, hidden2)
        self.layer3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        """Map a (batch, input_dim) float tensor to (batch, 1) predictions."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        # No activation in the output layer for regression
        return self.layer3(x)

# Seed PyTorch before building the model so the random weight initialisation
# (and therefore the whole loss curve) is reproducible across kernel restarts.
torch.manual_seed(42)
model = SimpleNN()

criterion = nn.MSELoss()  # MSE for regression
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model with full-batch gradient descent: one optimizer step per
# epoch over the entire training tensor.
num_epochs = 100
for epoch in range(num_epochs):
    # The evaluation step below puts the model in eval mode; switch back to
    # training mode at the start of every epoch so layers such as dropout or
    # batch norm (if added later) behave correctly while training.
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_normalized_tensor)
    loss = criterion(outputs, y_train_normalized_tensor)
    loss.backward()
    optimizer.step()

    # NOTE(review): the "validation" loss is computed on the test split;
    # there is no separate validation set in this notebook.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_normalized_tensor)
        val_loss = criterion(val_outputs, y_test_normalized_tensor)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}")


Epoch [10/100], Train Loss: 0.9876, Validation Loss: 0.5320
Epoch [20/100], Train Loss: 0.9799, Validation Loss: 0.5242
Epoch [30/100], Train Loss: 0.9736, Validation Loss: 0.5182
Epoch [40/100], Train Loss: 0.9679, Validation Loss: 0.5131
Epoch [50/100], Train Loss: 0.9623, Validation Loss: 0.5086
Epoch [60/100], Train Loss: 0.9568, Validation Loss: 0.5044
Epoch [70/100], Train Loss: 0.9514, Validation Loss: 0.5003
Epoch [80/100], Train Loss: 0.9458, Validation Loss: 0.4963
Epoch [90/100], Train Loss: 0.9403, Validation Loss: 0.4924
Epoch [100/100], Train Loss: 0.9346, Validation Loss: 0.4886
