# Regression in PyTorch

In [None]:
import matplotlib.pyplot as plt # for plots
import numpy as np # for generating data and working with tensors
import pandas as pd # pandas for reading in the csv data and visualizing it
import torch # PyTorch for building and training the network
import torch.nn as nn # for building the model architecture
import torch.optim as optim # for getting an optimizer to update the network weights
from sklearn.model_selection import train_test_split # for splitting the dataset into train and test sets

## Simple Linear Regression

### Generating the data

In [None]:
# Seed NumPy's global random generator so every run produces the same dataset
np.random.seed(42)

# Draw the exercise durations: 100 values, uniform between 10 and 120 minutes
num_samples = 100
duration = np.random.uniform(low=10, high=120, size=num_samples)

# Calories grow linearly with duration (factor 8) plus Gaussian noise (mean 0, std 10)
calories_burned = 8 * duration + np.random.normal(loc=0, scale=10, size=num_samples)

# Keep both columns together in a DataFrame
data = pd.DataFrame({'Duration (minutes)': duration, 'Calories Burned': calories_burned})

# Scatter plot of the synthetic data, followed by a preview of the first rows
plt.scatter(data['Duration (minutes)'], data['Calories Burned'], color='b', label='Data points')
plt.title('Calories Burned vs. Exercise Duration')
plt.xlabel('Duration (minutes)')
plt.ylabel('Calories Burned')
plt.legend()
plt.show()
data.head()

### Building the model

In [None]:
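# TODO: create the structure of the PyTorch model: an nn.Module subclass named
# LinearRegressionModel (the name used below) that maps one input feature to one output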

# Define and initialize the model
# model = LinearRegressionModel()
# model

### Training the model

In [None]:
# Convert data to PyTorch tensors; .view(-1, 1) reshapes each column to (num_samples, 1)
X = torch.tensor(data['Duration (minutes)'], dtype=torch.float32).view(-1, 1)
y = torch.tensor(data['Calories Burned'], dtype=torch.float32).view(-1, 1)

# Normalize the tensors (subtract the mean, divide by the standard deviation).
# The statistics are kept so predictions can be mapped back to original units below.
X_mean, X_std = X.mean(), X.std()
y_mean, y_std = y.mean(), y.std()
X = (X - X_mean) / X_std
y = (y - y_mean) / y_std


# Define the loss function and the optimizer
# TODO: criterion = 
# TODO: optimizer = 

# TODO: Training loop
# (the code below calls model(X) on the normalized X and un-normalizes the output,
# so the model is expected to be fitted on the normalized X and y from above)

# Convert the mean and std tensors to plain Python floats (.item()) for scaling
y_std = y_std.item()
y_mean = y_mean.item()

# Unnormalize the predictions: detach from the autograd graph, convert to NumPy,
# then undo the target normalization
predicted = model(X).detach().numpy() * y_std + y_mean

# Plot the results using original data
plt.scatter(data['Duration (minutes)'], data['Calories Burned'], color='blue', label='Original Data')
plt.plot(data['Duration (minutes)'], predicted, color='red', label='Fitted Line')
plt.xlabel('Duration (minutes)')
plt.ylabel('Calories Burned')
plt.legend()
plt.show()

In [None]:
# Convert data to PyTorch tensors; .view(-1, 1) reshapes each column to (num_samples, 1)
X = torch.tensor(data['Duration (minutes)'], dtype=torch.float32).view(-1, 1)
y = torch.tensor(data['Calories Burned'], dtype=torch.float32).view(-1, 1)

# Normalize the tensors (subtract the mean, divide by the standard deviation).
# The statistics stay as tensors here; they are reused below to un-normalize the splits.
X_mean, X_std = X.mean(), X.std()
y_mean, y_std = y.mean(), y.std()
X = (X - X_mean) / X_std
y = (y - y_mean) / y_std

# TODO: Split data into 80% train and 20% test
# (replace the None placeholders; the code below expects tensors, since it calls .numpy())
X_train, X_test, y_train, y_test = None, None, None, None

# Define the loss function and the optimizer
# TODO: criterion =
# TODO: optimizer =

# TODO: Training loop

# TODO: Evaluate on the test set
# predictions_train is plotted against the un-normalized training inputs below.
# NOTE(review): it should presumably hold the model's predictions for X_train,
# un-normalized with y_std / y_mean so the line shares the plot's units -- confirm.
predictions_train = None

# Unnormalize test and training points for plotting
X_train_unnorm = X_train * X_std + X_mean
y_train_unnorm = y_train * y_std + y_mean
X_test_unnorm = X_test * X_std + X_mean
y_test_unnorm = y_test * y_std + y_mean

# Plot the results: both splits as scatter points, the fitted line over the training inputs
plt.scatter(X_train_unnorm.numpy(), y_train_unnorm.numpy(), color='blue', label='Training Data')
plt.scatter(X_test_unnorm.numpy(), y_test_unnorm.numpy(), color='green', label='Test Data')
plt.plot(X_train_unnorm.numpy(), predictions_train, color='red', label='Fitted Line (Train)')
plt.xlabel('Duration (minutes)')
plt.ylabel('Calories Burned')
plt.legend()
plt.show()

## Multi Regression - Nutrients Data

### Reading in the data

In [None]:
# Load the nutrients table from the CSV file in the Data folder one level up
nutrients_dataframe = pd.read_csv(filepath_or_buffer='../Data/nutrients.csv')
# Preview the first five rows as a quick sanity check
nutrients_dataframe.head(n=5)

### Preparing the data as input

We are training a neural network to predict the energy of a food (basically calories) based on the other contents of the food. We don't want to use the food name since that would not be helpful for predicting the energy of food that is not in our dataset.

In [None]:
# Model inputs: the four nutrient columns; target: the Energy column.
# Selecting with a list of column names keeps both arrays two-dimensional.
X = nutrients_dataframe[['Protein', 'Fat', 'Calcium', 'Iron']].values
y = nutrients_dataframe[['Energy']].values

# TODO: split data into train and test sets (replace the None placeholders,
# e.g. using train_test_split, which is imported at the top of the notebook)
X_train, X_test, y_train, y_test = None, None, None, None

# Keep references to the un-converted test split; the test-set evaluation
# further down uses them to show the input rows next to the predictions
X_test_split = X_test
y_test_split = y_test

# Convert to PyTorch tensors (float32); targets are reshaped to a single column
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

### Defining the model architecture

In [None]:
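# TODO: define model architecture: an nn.Module subclass named MultiRegressionModel
# that takes the 4 input features (Protein, Fat, Calcium, Iron) and outputs 1 value (Energy)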
# model = MultiRegressionModel()

### Training the model

In [None]:
def train_model(model):
    """Train ``model`` (exercise scaffold, not implemented yet).

    The steps to fill in are listed as TODOs in the body. Until they are
    completed the function raises instead of returning, so later cells cannot
    go on to evaluate an untrained model by accident.

    Args:
        model: The network to train. NOTE(review): presumably an ``nn.Module``
            fitted on the ``X_train`` / ``y_train`` tensors prepared earlier
            in the notebook -- confirm when implementing.

    Raises:
        NotImplementedError: Always, until the TODOs below are implemented.
    """
    # TODO: Choose mean squared error as the loss function
    # criterion =

    # TODO: Choose Adam to optimize the neural net weights
    # optimizer =

    # TODO: Training loop

    # Delete this line once the TODOs above are implemented. It also gives the
    # function a real statement body: a body made only of comments does not parse.
    raise NotImplementedError("train_model is an exercise: complete the TODOs in its body")
# train_model(model)

### Check predictions compared to actual for the whole dataset

In [None]:
# TODO: Predicting using the trained model on the entire dataset (X)
# (replace the None placeholder; one prediction per dataframe row is needed,
# because the result is stored as a new column below)
all_predictions = None

# Append predictions to the dataframe as a new column
nutrients_dataframe['Model 1 Predicted Energy'] = all_predictions

# Print the updated dataframe with predictions
nutrients_dataframe.head()

In [None]:
# Value ranges of the actual and the Model 1 predicted energies
actual_energy = nutrients_dataframe['Energy']
predicted_energy = nutrients_dataframe['Model 1 Predicted Energy']
actual_min, actual_max = min(actual_energy), max(actual_energy)
predicted_min, predicted_max = min(predicted_energy), max(predicted_energy)

# Dashed diagonal (y = x) spanning both ranges: points on it are perfect predictions
diagonal = [min(actual_min, predicted_min), max(actual_max, predicted_max)]
plt.plot(diagonal, diagonal, color='red', linestyle='--')

# Actual (x axis) against predicted (y axis) energy for every row
plt.scatter(actual_energy, predicted_energy, color='blue')
plt.title('Actual vs Predicted Energy - Model 1 on Entire Dataset')
plt.xlabel('Actual Energy')
plt.ylabel('Predicted Energy')
plt.show()

### Evaluate the model on the test set

In [None]:
# TODO: Make predictions on the test set
test_predictions = None

# Convert predictions to numpy for easier manipulation
test_predictions = test_predictions.numpy()

# Create a DataFrame with actual and predicted values side by side.
# The actual values are taken from y_test_split, the test targets kept aside
# before y_test was converted to a tensor, so no torch tensor ends up inside
# the DataFrame; this mirrors how X_test_split is used for the inputs below.
results_df = pd.DataFrame({
    'Actual Energy': np.asarray(y_test_split).flatten(),  # Flatten to match the shape of predictions
    'Model 1 Predicted Energy': test_predictions.flatten(),
})

# Include the input features from the test set next to the results
X_test_df = pd.DataFrame(X_test_split, columns=['Protein', 'Fat', 'Calcium', 'Iron'])
test_results_df = pd.concat([X_test_df, results_df], axis=1)
test_results_df


In [None]:
# Value ranges of the actual and the Model 1 predicted energies on the test set
actual_energy = test_results_df['Actual Energy']
predicted_energy = test_results_df['Model 1 Predicted Energy']
actual_min, actual_max = min(actual_energy), max(actual_energy)
predicted_min, predicted_max = min(predicted_energy), max(predicted_energy)

# Dashed diagonal (y = x) spanning both ranges: points on it are perfect predictions
diagonal = [min(actual_min, predicted_min), max(actual_max, predicted_max)]
plt.plot(diagonal, diagonal, color='red', linestyle='--')

# Actual (x axis) against predicted (y axis) energy for the test rows
plt.scatter(actual_energy, predicted_energy, color='blue')
plt.title('Actual vs Predicted Energy - Model 1 on Test Set')
plt.xlabel('Actual Energy')
plt.ylabel('Predicted Energy')
plt.show()

## Bigger Model

In [None]:
# TODO: make a bigger model: define a MultiRegressionModel2 class above this line
# NOTE(review): presumably an nn.Module with more capacity than the first model -- confirm.
# The assignment rebinds `model`, so the cells below train and evaluate this new model.
model = MultiRegressionModel2()

### Train model

In [None]:
# Train the bigger model with the train_model function defined in the
# "Training the model" section above
train_model(model)

### Check Model 2 predictions compared to actual for the whole dataset

In [None]:
# Run the trained model over the entire dataset (X): evaluation mode, no gradient tracking
model.eval()
with torch.no_grad():
    all_features = torch.tensor(X, dtype=torch.float32)
    all_predictions = model(all_features).numpy()

# Store the predictions next to the actual values as a new column
nutrients_dataframe['Model 2 Predicted Energy'] = all_predictions

# Preview the dataframe including the new prediction column
nutrients_dataframe.head()

In [None]:
# Determine the min and max of both actual and predicted energies for the whole dataset.
# actual_min / actual_max are deliberately neither reused nor overwritten here: at
# this point they hold the test-set bounds computed for the Model 1 test-set plot,
# which would make the diagonal too short for the full dataset, and the Model 2
# test-set plot further down still relies on those test-set values.
actual_energy = nutrients_dataframe['Energy']
predicted_energy = nutrients_dataframe['Model 2 Predicted Energy']
predicted_min = min(predicted_energy)
predicted_max = max(predicted_energy)
line_low = min(min(actual_energy), predicted_min)
line_high = max(max(actual_energy), predicted_max)

# Use the bounds of both actual and predicted energies for the line of perfect prediction
plt.plot([line_low, line_high], [line_low, line_high], color='red', linestyle='--')

# Plot actual vs predicted values
plt.scatter(actual_energy, predicted_energy, color='blue')
plt.xlabel('Actual Energy')
plt.ylabel('Predicted Energy')
plt.title('Actual vs Predicted Energy - Model 2 on Entire Dataset')
plt.show()

### Evaluate Model 2 on Test Set

In [None]:
# Predict on the test set in evaluation mode without gradient tracking,
# converting the output tensor straight to a NumPy array
model.eval()
with torch.no_grad():
    test_predictions = model(X_test).numpy()

# Add the Model 2 predictions as a new column of the test results table and display it
test_results_df['Model 2 Predicted Energy'] = test_predictions.flatten()
test_results_df

In [None]:
# Determine the min and max of both actual and predicted energies on the test set.
# The actual bounds are recomputed here instead of being taken from whichever
# plotting cell ran last, so the diagonal stays correct even when cells are
# re-run out of order (other plotting cells also assign actual_min / actual_max).
actual_min = min(test_results_df['Actual Energy'])
actual_max = max(test_results_df['Actual Energy'])
predicted_min = min(test_results_df['Model 2 Predicted Energy'])
predicted_max = max(test_results_df['Model 2 Predicted Energy'])

# Use the bounds of both actual and predicted energies for the line of perfect prediction
plt.plot([min(actual_min, predicted_min), max(actual_max, predicted_max)],
         [min(actual_min, predicted_min), max(actual_max, predicted_max)],
         color='red', linestyle='--')

# Plot actual vs predicted values
plt.scatter(test_results_df['Actual Energy'], test_results_df['Model 2 Predicted Energy'], color='blue')
plt.xlabel('Actual Energy')
plt.ylabel('Predicted Energy')
plt.title('Actual vs Predicted Energy - Model 2 on Test Set')
plt.show()