In [25]:
# Imports for all models
import torch
import torch.optim as optim
import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt

from tqdm import tqdm
from dataloader import CustomDataloader, CustomImageDataloader
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from networks import FullyConnectedNetwork, CNN

# Load the serialized file-name lists and the processed dataset splits.
# Both are .pt files read with torch.load, which unpickles its input, so these
# paths should only ever point at trusted files.
file_names_dict = torch.load('data/ProcessedDataFileNames.pt')
data_dict = torch.load('data/ProcessedData.pt')



Below is the code for the first model, a linear regression. It is trained on the processed data generated by the dataset_processed notebook (loaded above from `data/ProcessedData.pt`). Once training is complete, a learning-curve plot is generated and saved in the working directory so that it can be displayed later in the analysis notebook.


In [26]:

# Model 1: linear regression and its learning curve.
#
# Fits LinearRegression on growing prefixes of the training split, records the
# training and validation MSE for each prefix, then writes the final model's
# training-set predictions to 'predictions.csv' and the learning curve to
# 'learning_curve_model_1.png'.

# Data loaders for training, validation and testing
# NOTE(review): the linear regression below works on the numpy arrays and does
# not read from these loaders; they are kept so the names stay defined.
train_dataloader = CustomDataloader(data_dict['x_train'],data_dict['y_train'], batch_size = 32, randomize=True)
val_dataloader = CustomDataloader(data_dict['x_val'],data_dict['y_val'], batch_size = 64)
test_dataloader = CustomDataloader(data_dict['x_test'],data_dict['y_test'], batch_size = 64)

# Convert the data to numpy arrays
x_train = data_dict['x_train'].numpy()
y_train = data_dict['y_train'].numpy()
x_val = data_dict['x_val'].numpy()
y_val = data_dict['y_val'].numpy()
x_test = data_dict['x_test'].numpy()
y_test = data_dict['y_test'].numpy()

# Seed for the random baseline so that a re-run reproduces the printed number
RANDOM_SEED = 42

# Initialize the model
model = LinearRegression()

# Lists to store training and validation losses (one entry per training-set size)
train_losses = []
val_losses = []

# Training-set sizes for the learning curve: 10%, 20%, ..., 100% of the split.
# Clamped to at least one sample so fit() never receives an empty slice.
training_set_sizes = [
    max(1, int(len(x_train) * fraction)) for fraction in np.linspace(0.1, 1.0, 10)
]

for size in training_set_sizes:
    # Fit the model on a subset of the training data
    x_subset, y_subset = x_train[:size], y_train[:size]
    model.fit(x_subset, y_subset)

    # Training loss is measured on the samples the model was fitted on, so it
    # is not mixed with error on training rows the model has not seen yet.
    train_loss = mean_squared_error(y_subset, model.predict(x_subset))
    train_losses.append(train_loss)

    # Validation loss is always measured on the full validation split
    y_val_pred = model.predict(x_val)
    val_loss = mean_squared_error(y_val, y_val_pred)
    val_losses.append(val_loss)

# The last size is the whole training split, so `model` is now the final model.
# Store its predictions once (one row per training sample) instead of
# accumulating the predictions of every intermediate model.
y_train_pred = model.predict(x_train)
predictions = list(y_train_pred)
x_values = list(x_train)
indices = list(range(len(y_train_pred)))

# Final validation error of the fully trained model. This value is a mean
# absolute error; the MSE is printed as well because the learning curve and the
# random baseline below are both expressed as MSE.
final_val_loss = mean_absolute_error(y_val, y_val_pred)
print(f"Final validation loss (MAE): {final_val_loss}")
print(f"Final validation MSE: {val_losses[-1]}")

# Random baseline: uniform draws between the smallest and largest training target
rng = np.random.default_rng(RANDOM_SEED)
random_baseline = rng.uniform(y_train.min(), y_train.max(), size=y_train.shape)
# Calculate mean squared error (MSE) for the random baseline
random_baseline_mse = mean_squared_error(y_train, random_baseline)

print(f"Random Baseline MSE: {random_baseline_mse}")

# Create a DataFrame to store the data
data = {'Index': indices, 'X_Value': x_values, 'Prediction': predictions}
df = pd.DataFrame(data)

# Save DataFrame to a CSV file
df.to_csv('predictions.csv', index=False)

# Create a plot of the learning curve and save it for later use
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(training_set_sizes, train_losses, label='Training loss')
ax.plot(training_set_sizes, val_losses, label='Validation loss')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Learning Curve')
ax.legend()
ax.grid(True)
fig.savefig('learning_curve_model_1.png')
plt.close(fig)





Final validation loss: 6.3121137619018555
Random Baseline MSE: 2134.2127702406337


Below is the code for model 2. Most of it is currently commented out because the training loop does not yet train the model successfully. The commented-out code is kept to show my thought process and the point at which I had to stop while working on this assignment.


In [27]:
# Model 2 setup: image data loaders, CNN, loss function and optimizer.

# Image folder and the CSV files that list each split's image file names
images_directory = 'data/images'
x_test_file_names = 'x_test_file_names.csv'
x_train_file_names = 'x_train_file_names.csv'
x_val_file_names = 'x_val_file_names.csv'

# One image loader per split, built in the order test, train, validation
test_dataloader = CustomImageDataloader(
    x_test_file_names, images_directory, batch_size=64
)
train_dataloader = CustomImageDataloader(
    x_train_file_names, images_directory, batch_size=32
)
val_dataloader = CustomImageDataloader(
    x_val_file_names, images_directory, batch_size=64
)

# Convolutional network; 10 is handed straight to the CNN constructor
model = CNN(10)

# Mean-squared-error loss, minimised with Adam
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Per-epoch loss histories and epoch count for the (currently disabled) loop below
epochs = 20
train_losses, val_losses = [], []

# Training loop
# for epoch in tqdm(range(epochs)):
#     train_epoch_losses = []
#     for _ in range(train_dataloader.num_batches_per_epoch):
#         train_batch = train_dataloader.fetch_batch()
        
#         # Transpose the input data to match the model's expected shape [batch_size, 3, 128, 128]
#         train_batch['images'] = train_batch['images'].permute(0, 3, 1, 2).float()
        
#         optimizer.zero_grad()  # Zero the gradients
#         output = model(train_batch['images'])  # Forward pass
#         loss = criterion(output, train_batch['batch_idx'])  # Compute the loss
#         loss.backward()  # Backpropagation
#         optimizer.step()  # Optimize
#         train_epoch_losses.append(loss.item())
#     train_losses.append(np.mean(train_epoch_losses))

#     val_epoch_losses = []
#     for _ in range(val_dataloader.num_batches_per_epoch):
#         val_batch = val_dataloader.fetch_batch()
        
#         # Transpose the validation input data to match the model's expected shape [batch_size, 3, 128, 128]
#         val_batch['images'] = val_batch['images'].permute(0, 3, 1, 2).float()
        
#         with torch.no_grad():
#             output = model(val_batch['images'])  # Forward pass
#             val_loss = criterion(output, val_batch['batch_idx'])  # Compute the loss
#             val_epoch_losses.append(val_loss.item())
#     val_losses.append(np.mean(val_epoch_losses))


# Create a plot of the learning curve and save it for later use
# plt.plot(range(1, epochs + 1), train_losses, label='Training Loss')
# plt.plot(range(1, epochs + 1), val_losses, label='Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.title('Learning Curve')
# plt.legend()
# plt.savefig('learning_curve_model_2.png')



