# Polynomial model generation


Load all the libraries first.

In [27]:
import numpy as np
import pandas as pd
import torch
import json
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

Define all the classes and functions.

In [28]:
class CustomDataset(Dataset):
    """Dataset view over a table whose last column is the regression target.

    ``data`` must support ``len()`` and two-axis indexing
    (``data[row, column_slice]``), e.g. a 2-D numpy array. Every column except
    the last is served as a feature; the last column is served as the label.
    """

    def __init__(self, data):
        # Only a reference is stored; the table is not copied.
        self.data = data

    def __len__(self):
        # One sample per row of the table.
        return len(self.data)

    def __getitem__(self, idx):
        # Split the selected row into (features, target).
        features = self.data[idx, :-1]
        target = self.data[idx, -1]
        return features, target

class LinearRegression(torch.nn.Module):
    """A single fully connected layer: ``y = x @ W.T + b``.

    Args:
        inputSize: number of input features per sample.
        outputSize: number of values predicted per sample.
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        # The model's only trainable component.
        self.linear = torch.nn.Linear(in_features=inputSize, out_features=outputSize)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

def train_model_lasso(model, train_loader, learning_rate, epochs, l1_lambda, device):
    """Train ``model`` with SGD on an MSE loss plus an L1 (lasso) penalty.

    The penalty is ``l1_lambda`` times the sum of absolute values of *all*
    parameters returned by ``model.parameters()`` (for the ``LinearRegression``
    model in this file that means the weights and the bias).

    Args:
        model: torch module to optimise in place; its output is flattened with
            ``view(-1)`` before being compared with the labels.
        train_loader: iterable yielding ``(inputs, labels)`` batches. It is
            iterated twice per epoch: once to train, once to measure the MSE.
        learning_rate: SGD step size.
        epochs: number of passes over ``train_loader``.
        l1_lambda: weight of the L1 penalty term.
        device: torch device the batches are moved to.

    Returns:
        ``(model, mse_history)`` where ``mse_history`` has one entry per epoch:
        the plain (unpenalised) training MSE measured after that epoch.
    """
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    mse_history = []

    # Make sure the model is in training mode even if a caller previously
    # switched it to eval mode.
    model.train()

    for _ in range(epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.float().to(device), labels.float().to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            l1_norm = sum(p.abs().sum() for p in model.parameters())

            loss = criterion(outputs.view(-1), labels) + l1_lambda * l1_norm
            loss.backward()
            optimizer.step()

        # Track the un-penalised MSE over the whole training set. Each batch
        # mean is weighted by its size so a short final batch does not count
        # as much as a full one (a plain mean of batch means would be biased).
        weighted_loss_sum = 0.0
        sample_count = 0
        with torch.no_grad():
            for inputs, labels in train_loader:
                inputs, labels = inputs.float().to(device), labels.float().to(device)
                batch_mse = criterion(model(inputs).view(-1), labels).item()
                batch_size = inputs.size(0)
                weighted_loss_sum += batch_mse * batch_size
                sample_count += batch_size

        # An empty loader yields NaN rather than a ZeroDivisionError.
        if sample_count:
            mse_history.append(weighted_loss_sum / sample_count)
        else:
            mse_history.append(float("nan"))

    return model, mse_history

def create_polynomial_features(data, degree=3):
    """Return the polynomial feature expansion of ``data``.

    A fresh scikit-learn ``PolynomialFeatures`` transformer of the requested
    ``degree`` (all other options left at the library defaults) is fitted to
    ``data`` and applied to it in one step.

    Args:
        data: feature matrix to expand.
        degree: maximum polynomial degree of the generated terms.

    Returns:
        The transformed feature matrix produced by ``fit_transform``.
    """
    expander = PolynomialFeatures(degree)
    expanded = expander.fit_transform(data)
    return expanded

def evaluate_model(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader`` using clipped predictions.

    Predictions are passed through ``clip_predictions`` before scoring, so the
    reported metrics describe the clipped outputs rather than the raw ones.

    Args:
        model: torch module to evaluate; it is switched to eval mode.
        test_loader: iterable yielding ``(inputs, labels)`` batches.
        device: torch device the batches are moved to.

    Returns:
        ``(average_loss, r2)``: the sample-weighted mean squared error over
        all batches and the R^2 score computed by ``r2_score``.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    model.eval()
    criterion = torch.nn.MSELoss()
    total_loss = 0.0
    total_count = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.float().to(device)
            labels = labels.float().to(device)
            # Flatten the predictions so they line up one-to-one with the 1-D
            # labels, exactly as the training loop does. Comparing a
            # (batch, 1) tensor with a (batch,) tensor would make MSELoss
            # broadcast to (batch, batch) and average over every
            # prediction/label pair, which is not the MSE.
            outputs = model(inputs).view(-1)
            clipped_outputs = clip_predictions(outputs.cpu().numpy())
            loss = criterion(torch.tensor(clipped_outputs).float().to(device), labels)
            # criterion returns a batch mean; re-weight by batch size so the
            # final average is per sample, not per batch.
            total_loss += loss.item() * inputs.size(0)
            total_count += inputs.size(0)
            all_predictions.extend(clipped_outputs)
            all_labels.extend(labels.cpu().numpy())

    if total_count == 0:
        raise ValueError("evaluate_model received an empty test_loader")

    average_loss = total_loss / total_count
    r2 = r2_score(all_labels, all_predictions)
    return average_loss, r2

def clip_predictions(predictions, min_val=1, max_val=5):
    """Limit ``predictions`` to the closed interval ``[min_val, max_val]``.

    Args:
        predictions: array-like of predicted values.
        min_val: lower bound; smaller values are raised to it.
        max_val: upper bound; larger values are lowered to it.

    Returns:
        The clipped values as returned by ``np.clip``.
    """
    bounded = np.clip(predictions, min_val, max_val)
    return bounded

In [None]:
# Coarse grid search: 5-fold cross-validation over learning rate, L1 strength
# and epoch count, followed by a final fit on the full dataset.
from itertools import product

# Load JSON data
with open('../processed_data/restaurant-topic-sentiment.json') as file:
    json_data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame.from_dict(json_data, orient='index')

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert DataFrame to numpy array
data = df.to_numpy()

# The polynomial expansion depends only on the data, not on the fold or the
# hyperparameters, so it is computed once instead of once per fold.
# The last column of `data` is the target; everything before it is a feature.
poly_data = create_polynomial_features(data[:, :-1])
poly_target = data[:, -1]
input_dim = poly_data.shape[1]  # Update based on polynomial features
output_dim = 1

# K-Fold Cross-Validation with Hyperparameter Tuning
best_performance = float('inf')
best_hyperparams = {'learning_rate': None, 'l1_lambda': None, 'epochs': None}

k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Example hyperparameters to test
learning_rates = [0.0001, 0.00005, 0.00001, 0.000005]
l1_lambdas = [5, 1, .1, .01]
epoch_options = [1000, 500, 100]
batch_size = 64

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# product() iterates the last list fastest, i.e. in the same order as three
# nested for-loops over learning_rates, l1_lambdas and epoch_options.
for lr, l1_lambda, epochs in product(learning_rates, l1_lambdas, epoch_options):
    fold_performances = []
    fold_r2_scores = []
    skip_hyperparameters = False
    print(f"Testing learning rate: {lr}, L1 lambda: {l1_lambda}, Epochs: {epochs}")

    for fold, (train_ids, test_ids) in enumerate(kf.split(data)):
        print(f"Starting Fold {fold + 1}/{k_folds}")

        train_dataset = CustomDataset(np.column_stack((poly_data[train_ids], poly_target[train_ids])))
        test_dataset = CustomDataset(np.column_stack((poly_data[test_ids], poly_target[test_ids])))
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # A fresh model per fold so no weights leak between folds.
        model = LinearRegression(input_dim, output_dim).to(device)
        model, mse_history = train_model_lasso(model, train_loader, lr, epochs, l1_lambda, device)

        fold_performance, fold_r2 = evaluate_model(model, test_loader, device)

        # A negative R^2 on any fold rejects the whole configuration without
        # spending time on the remaining folds.
        if fold_r2 < 0:
            print(f"Fold {fold + 1} has a negative R^2: {fold_r2}, skipping these hyperparameters.")
            skip_hyperparameters = True
            break

        weights = model.linear.weight.data.cpu().numpy().flatten()
        print(f"Weights for Fold {fold + 1}: {weights}")

        print(f"Fold {fold + 1}, MSE: {fold_performance}, R^2: {fold_r2}")
        fold_performances.append(fold_performance)
        fold_r2_scores.append(fold_r2)

    if skip_hyperparameters:
        continue

    avg_performance = np.mean(fold_performances)
    avg_r2 = np.mean(fold_r2_scores)
    print(f"Average MSE: {avg_performance}, Average R^2: {avg_r2} for learning rate: {lr}, L1 lambda: {l1_lambda}, Epochs: {epochs}\n")

    # Model selection is by lowest average cross-validated MSE.
    if avg_performance < best_performance:
        best_performance = avg_performance
        best_hyperparams['learning_rate'] = lr
        best_hyperparams['l1_lambda'] = l1_lambda
        best_hyperparams['epochs'] = epochs

# Every configuration may have been skipped; training with None
# hyperparameters would crash, so bail out with a message instead.
if any(value is None for value in best_hyperparams.values()):
    print("No valid hyperparameter configuration found. Please check the cross-validation process.")
else:
    print(f"Best Hyperparameters: Learning Rate: {best_hyperparams['learning_rate']}, L1 lambda: {best_hyperparams['l1_lambda']}, Epochs: {best_hyperparams['epochs']}, with MSE: {best_performance}")

    # Train the final model on the entire dataset with the best hyperparameters
    print("Training final model on the entire dataset with the best hyperparameters...")
    full_data_combined = np.column_stack((poly_data, poly_target))
    full_dataset = CustomDataset(full_data_combined)
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

    final_model = LinearRegression(input_dim, output_dim).to(device)
    final_model, _ = train_model_lasso(final_model, full_loader, best_hyperparams['learning_rate'], best_hyperparams['epochs'], best_hyperparams['l1_lambda'], device)

    # NOTE: these metrics are measured on the same data the final model was
    # trained on, so they are in-sample figures, not a generalisation estimate.
    final_mse, final_r2 = evaluate_model(final_model, full_loader, device)
    print(f"Final Model MSE: {final_mse}, Final Model R^2: {final_r2}")

    # Save the final model
    torch.save(final_model.state_dict(), 'final_model.pt')
    print("Final model saved as 'final_model.pt'")

Now that this analysis has been done, let's do a bit more tuning.

In [29]:
# Fine grid search around the values found by the coarse search: 5-fold
# cross-validation, followed by a final fit on the full dataset.
from itertools import product

# Load JSON data
with open('../processed_data/restaurant-topic-sentiment.json') as file:
    json_data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame.from_dict(json_data, orient='index')

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert DataFrame to numpy array
data = df.to_numpy()

# The polynomial expansion depends only on the data, not on the fold or the
# hyperparameters, so it is computed once instead of once per fold.
# The last column of `data` is the target; everything before it is a feature.
poly_data = create_polynomial_features(data[:, :-1])
poly_target = data[:, -1]
input_dim = poly_data.shape[1]  # Update based on polynomial features
output_dim = 1

# K-Fold Cross-Validation with Hyperparameter Tuning
best_performance = float('inf')
best_hyperparams = {'learning_rate': None, 'l1_lambda': None, 'epochs': None}

k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Example hyperparameters to test
learning_rates = [0.000125, 0.0001, 0.000075]
l1_lambdas = [.0125, .01, 0.005]
epoch_options = [1500, 1250, 1000, 750]
batch_size = 32

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# product() iterates the last list fastest, i.e. in the same order as three
# nested for-loops over learning_rates, l1_lambdas and epoch_options.
for lr, l1_lambda, epochs in product(learning_rates, l1_lambdas, epoch_options):
    fold_performances = []
    fold_r2_scores = []
    skip_hyperparameters = False
    print(f"Testing learning rate: {lr}, L1 lambda: {l1_lambda}, Epochs: {epochs}")

    for fold, (train_ids, test_ids) in enumerate(kf.split(data)):
        print(f"Starting Fold {fold + 1}/{k_folds}")

        train_dataset = CustomDataset(np.column_stack((poly_data[train_ids], poly_target[train_ids])))
        test_dataset = CustomDataset(np.column_stack((poly_data[test_ids], poly_target[test_ids])))
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # A fresh model per fold so no weights leak between folds.
        model = LinearRegression(input_dim, output_dim).to(device)
        model, mse_history = train_model_lasso(model, train_loader, lr, epochs, l1_lambda, device)

        fold_performance, fold_r2 = evaluate_model(model, test_loader, device)

        # A negative R^2 on any fold rejects the whole configuration without
        # spending time on the remaining folds.
        if fold_r2 < 0:
            print(f"Fold {fold + 1} has a negative R^2: {fold_r2}, skipping these hyperparameters.")
            skip_hyperparameters = True
            break

        weights = model.linear.weight.data.cpu().numpy().flatten()
        print(f"Weights for Fold {fold + 1}: {weights}")

        print(f"Fold {fold + 1}, MSE: {fold_performance}, R^2: {fold_r2}")
        fold_performances.append(fold_performance)
        fold_r2_scores.append(fold_r2)

    if skip_hyperparameters:
        continue

    avg_performance = np.mean(fold_performances)
    avg_r2 = np.mean(fold_r2_scores)
    print(f"Average MSE: {avg_performance}, Average R^2: {avg_r2} for learning rate: {lr}, L1 lambda: {l1_lambda}, Epochs: {epochs}\n")

    # Model selection is by lowest average cross-validated MSE.
    if avg_performance < best_performance:
        best_performance = avg_performance
        best_hyperparams['learning_rate'] = lr
        best_hyperparams['l1_lambda'] = l1_lambda
        best_hyperparams['epochs'] = epochs

# Every configuration may have been skipped, in which case there is nothing
# to train the final model with.
if any(value is None for value in best_hyperparams.values()):
    print("No valid hyperparameter configuration found. Please check the cross-validation process.")
else:
    print(f"Best Hyperparameters: Learning Rate: {best_hyperparams['learning_rate']}, L1 lambda: {best_hyperparams['l1_lambda']}, Epochs: {best_hyperparams['epochs']}, with MSE: {best_performance}")

    # Train the final model on the entire dataset with the best hyperparameters
    print("Training final model on the entire dataset with the best hyperparameters...")
    full_data_combined = np.column_stack((poly_data, poly_target))
    full_dataset = CustomDataset(full_data_combined)
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

    final_model = LinearRegression(input_dim, output_dim).to(device)
    final_model, _ = train_model_lasso(final_model, full_loader, best_hyperparams['learning_rate'], best_hyperparams['epochs'], best_hyperparams['l1_lambda'], device)

    # NOTE: these metrics are measured on the same data the final model was
    # trained on, so they are in-sample figures, not a generalisation estimate.
    final_mse, final_r2 = evaluate_model(final_model, full_loader, device)
    print(f"Final Model MSE: {final_mse}, Final Model R^2: {final_r2}")

    # Save the final model
    torch.save(final_model.state_dict(), 'final_model.pt')
    print("Final model saved as 'final_model.pt'")

Testing learning rate: 0.000125, L1 lambda: 0.0125, Epochs: 1500
Starting Fold 1/5
Weights for Fold 1: [ 1.86488736e+00  6.08570516e-01  2.70953983e-01  3.09987187e-01
 -1.43625539e-06  2.28795943e-06 -2.67704159e-01 -9.54102352e-02
  5.53331461e-07  3.54491533e-07 -1.08563086e-06 -2.80149654e-02
  2.04298249e-05  2.53101098e-06 -1.75438868e-06  3.08885842e-06
 -2.03103870e-01 -1.80474177e-04 -4.02957681e-07 -1.07573317e-06
  1.53346821e-06  1.70141021e-07 -7.76746049e-07  7.31715417e-08
  1.98961675e-06  6.52676135e-07  7.35621484e-07 -2.79347631e-07
  1.01868693e-06 -6.22223979e-07 -4.48889432e-06 -1.03460286e-06
 -6.39380175e-07 -1.09180110e-06  4.95500444e-07  1.72328413e-07
 -1.56809055e-08 -2.15513921e-07 -8.83184384e-07 -9.06130253e-07
  1.37613233e-06  9.49040611e-07 -1.28564943e-06  7.37879191e-07
 -7.96937002e-07 -1.80884768e-07  8.03664832e-08 -1.48903302e-06
  1.30992339e-06  9.32697219e-07 -1.34517862e-07 -2.61121809e-01
 -8.43880884e-03  7.29521275e-07  1.17852210e-06  1.

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.0125, Epochs: 1250
Starting Fold 1/5
Weights for Fold 1: [ 1.84914565e+00  6.33663654e-01  3.60089839e-01  3.02583605e-01
  2.23391544e-06  1.92322295e-06 -2.09738493e-01 -7.66405314e-02
  2.06253148e-07  2.52073363e-07  2.61628713e-07 -3.98061201e-02
  4.78693255e-06  1.30523028e-06  2.04140906e-06  4.69153150e-07
 -1.69761255e-01 -2.87550502e-04 -6.73719796e-07  1.41898204e-06
 -4.17890224e-05 -1.12798898e-06  1.34034269e-06  1.68707265e-06
  5.33444165e-07 -5.39282155e-07  1.38769974e-06 -1.13291883e-07
 -1.27219221e-06  1.54071245e-06  1.33222863e-06 -1.05788075e-07
 -7.54634527e-07 -4.77813273e-06  3.21934692e-07 -2.73536841e-07
  3.05433510e-08 -1.18670846e-06 -1.23527627e-06 -1.03983245e-06
 -2.92412409e-07  1.59489377e-06 -1.41916905e-06 -8.83152893e-07
 -1.98626140e-07  1.56156204e-06  1.25023951e-06 -9.07858805e-07
  8.94617358e-07 -1.33842730e-06 -1.22774009e-06 -2.

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.0125, Epochs: 1000
Starting Fold 1/5
Weights for Fold 1: [ 1.80732059e+00  6.46604121e-01  3.75758618e-01  2.62928605e-01
  1.04672665e-06  2.71189742e-06 -1.72098041e-01 -6.25693277e-02
  2.19792491e-06 -2.23173060e-06  2.06648451e-06 -4.45702188e-02
  1.47717036e-02  1.34968059e-05  2.12163698e-07  8.78403853e-07
 -1.39377818e-01 -3.63331637e-03  5.56449152e-07 -1.90506842e-06
  5.21122274e-06  3.15516991e-06  3.21030302e-06  2.58225896e-07
  1.17346595e-07  5.18208117e-06  5.28001522e-07 -1.56220366e-07
  1.38144287e-07 -1.66048380e-07  3.98428589e-02  1.37480356e-06
  1.43135378e-06  2.86093041e-06 -1.38207236e-06  9.81968014e-07
  6.15613658e-07  1.25907161e-06  3.91471872e-07 -9.83505743e-07
  1.29697901e-06  1.35061657e-06  7.66000710e-07 -8.72811938e-07
  3.04496808e-07 -1.21753988e-06 -1.10905989e-06  1.44552280e-06
  1.32619823e-06  4.01664701e-09 -2.83441153e-07 -2.

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.01, Epochs: 1250
Starting Fold 1/5
Weights for Fold 1: [ 1.83210349e+00  6.85163260e-01  3.63602757e-01  3.08724850e-01
  1.52514463e-06 -4.11043942e-07 -2.16452181e-01 -8.19618776e-02
 -1.50662260e-07  2.41660615e-07 -1.29659395e-06 -6.62286952e-02
  6.47654701e-07 -8.10646725e-06  1.18562434e-06 -2.00517457e-06
 -1.99284256e-01 -1.43610975e-02 -1.61704781e-06  7.49423918e-07
 -1.92890738e-07  2.51450520e-02 -3.93513858e-07  1.23842619e-06
 -8.86350904e-08 -6.26366386e-07 -5.07984453e-07 -4.88051285e-07
 -6.57796875e-07  3.69339631e-07 -1.71357760e-06 -8.85280883e-07
 -1.69902410e-06 -1.93017058e-06 -7.77665775e-07 -1.33531807e-06
  7.94167534e-07 -1.22708627e-07 -5.71107933e-07  1.19364631e-06
  5.80694348e-07  8.68399866e-07 -7.04623858e-07 -1.15321859e-06
 -1.77988042e-08 -9.99864824e-07  4.97585063e-07  8.85298164e-08
  3.57748974e-07 -1.18395735e-06 -1.11530778e-06 -2.60

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.01, Epochs: 1000
Starting Fold 1/5
Weights for Fold 1: [ 1.84437811e+00  6.76043630e-01  3.27275187e-01  2.91846782e-01
 -8.80526500e-07  3.89423940e-06 -1.83104634e-01 -5.10015562e-02
 -5.49317519e-06 -1.51266579e-06 -8.40253904e-07 -6.31671622e-02
  2.69643060e-05  9.04519949e-03 -5.18021409e-07  7.80697883e-06
 -1.62414551e-01 -3.26104872e-02 -2.69174961e-06 -5.57163503e-07
 -2.77706334e-07  4.99159582e-02 -7.32369585e-07 -1.34683988e-08
 -6.32817375e-07 -4.76186460e-06 -4.35866377e-06 -3.24185430e-06
 -1.17188915e-06 -1.13374597e-06  1.20771683e-05 -2.78124361e-07
  7.89304750e-07 -2.19638855e-06 -1.89075342e-06 -2.13222916e-06
  9.04622368e-07  5.61750880e-07 -8.49976971e-07  9.51970947e-07
  5.36442997e-07 -1.12596729e-06 -1.20537834e-06  1.82768574e-07
  2.89396894e-07  2.94817060e-08 -4.84575722e-08 -7.24863185e-07
  1.20835750e-07  5.17044498e-07  7.34644701e-08 -2.28

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.005, Epochs: 1250
Starting Fold 1/5
Weights for Fold 1: [ 1.81992924e+00  6.79008842e-01  3.53501916e-01  3.28725070e-01
 -1.96062881e-08  3.26014422e-02 -2.30734795e-01 -7.99282715e-02
 -5.65452820e-06 -1.00889054e-06 -3.59760179e-06 -7.70530999e-02
  5.35329171e-02  1.68029591e-02 -4.25009006e-07  5.43862552e-06
 -1.89057544e-01 -6.40588030e-02 -1.13010697e-06 -1.93775895e-06
 -9.93466750e-03  5.62260263e-02  2.84702331e-02 -2.02098747e-07
 -8.74903883e-09 -3.61814052e-02  4.71682853e-07 -5.33722073e-07
 -4.00668796e-07 -6.79444270e-07  4.35033664e-02 -3.32762170e-07
 -3.52627410e-07 -1.77916138e-06  5.20496371e-07 -4.55481398e-07
 -1.03231923e-06 -1.81238121e-07  2.11633221e-07 -8.22218169e-07
 -5.53065149e-07  4.20407460e-08 -7.64167680e-07 -1.40169561e-07
  2.09645975e-07 -2.14894570e-07 -3.22593735e-07 -6.06504358e-08
 -7.94440382e-07 -8.67793347e-07 -2.21179377e-07 -2.8

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


Fold 2 has a negative R^2: -inf, skipping these hyperparameters.
Testing learning rate: 0.000125, L1 lambda: 0.005, Epochs: 1000
Starting Fold 1/5
Weights for Fold 1: [ 1.82232487e+00  6.79462373e-01  3.94835085e-01  3.21683496e-01
  5.97579069e-02  3.39696668e-02 -1.51779443e-01 -5.06429188e-02
 -2.14704946e-02 -1.51837401e-06 -1.33362028e-05 -7.83379301e-02
  9.40277353e-02  1.96852237e-02  1.68186602e-07  6.55831946e-06
 -1.54275924e-01 -5.18754497e-02 -1.33354456e-06  3.81074194e-09
 -1.70311687e-04  1.79823712e-02  4.68256511e-02 -4.91579499e-07
  3.36580428e-07 -1.49740567e-02 -8.17852936e-07 -4.89725664e-07
 -7.03910246e-07 -1.79017562e-07  3.60356867e-02 -2.40322407e-07
 -1.63269576e-08 -3.98202706e-03 -8.71377324e-07 -4.18038809e-07
 -4.88492381e-03  2.41107045e-07 -2.82563178e-07 -2.72790629e-07
 -3.96362552e-03 -3.76833299e-07  5.58582769e-07 -4.67922035e-07
 -3.08855846e-07  5.72633553e-07 -6.27322549e-08 -2.57707969e-03
  3.69101258e-07 -1.51816323e-07  3.39244707e-07 -2.4

We got a good baseline model, but the polynomial model takes a long time to train and does not seem to perform any better, so we are going to abandon this model for now.