In [1]:
import pandas as pd

In [2]:
# Loading data
data = pd.read_csv('Prodigy University Dataset.csv')
# Split the data into features (X) and target (y)
data.head()

Unnamed: 0,sat_sum,hs_gpa,fy_gpa
0,508,3.4,3.18
1,488,4.0,3.33
2,464,3.75,3.25
3,380,3.75,2.42
4,428,4.0,2.63


In [44]:
# Converting data to numpy
X = data[['sat_sum', 'hs_gpa']].values
# reshape the fy_gpa into a 2D array with [data_size] rows and 1 column
y = data['fy_gpa'].values.reshape(-1, 1)
print(X.shape)
print(y.shape)

(1000, 2)
(1000, 1)


In [45]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
from sklearn.preprocessing import StandardScaler

# Normalize the features so that it is easier to train the data
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.fit_transform(X_test)

In [47]:
X_train.shape

(800, 2)

In [48]:
import torch
# Convert numpy to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [49]:
import torch.nn as nn

In [50]:
# Building model with 2 neurons
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)

In [51]:
# Forward Propagation
preds = model(X_train_tensor)

In [52]:
preds[:5]

tensor([[-0.4646],
        [-0.4758],
        [-0.5418],
        [-0.5063],
        [-0.5190]], grad_fn=<SliceBackward0>)

In [53]:
from torch.nn import MSELoss

In [54]:
# Calculating Loss
criterion = MSELoss()
loss = criterion(preds, y_train_tensor)
print(loss)
# to learners: You may get different values

tensor(9.4675, grad_fn=<MseLossBackward0>)


# Comparing predictions on X_train with Target

In [55]:
preds[:5]

tensor([[-0.4646],
        [-0.4758],
        [-0.5418],
        [-0.5063],
        [-0.5190]], grad_fn=<SliceBackward0>)

In [56]:
y_train_tensor[:5]

tensor([[2.0000],
        [3.1100],
        [1.6300],
        [3.0200],
        [1.5500]])

In [57]:
model[0].weight

Parameter containing:
tensor([[ 0.5959, -0.4843],
        [-0.0684,  0.0804]], requires_grad=True)

In [58]:
model[2].weight

Parameter containing:
tensor([[-0.1629,  0.4337]], requires_grad=True)

---

In [59]:
# One step of updating Weights

In [60]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [61]:
loss.backward()
optimizer.step()

In [62]:
model[0].weight

Parameter containing:
tensor([[ 0.5959, -0.4844],
        [-0.0684,  0.0805]], requires_grad=True)

In [63]:
model[2].weight

Parameter containing:
tensor([[-0.1591,  0.4370]], requires_grad=True)

In [64]:
from torch.utils.data import TensorDataset, DataLoader

In [65]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)

In [66]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [67]:
# performance on train  and test sets  before training
train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
print(f'Without Training:\nTrain Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Without Training:
Train Loss: 7.4413, Test Loss: 7.7175


In [68]:
# Looking at predictions
model(X_train_tensor)[:5]

tensor([[-0.0348],
        [-0.0532],
        [-0.1549],
        [-0.1209],
        [-0.1016]], grad_fn=<SliceBackward0>)

# Stochastic Gradient Descent

In [69]:
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
# Execute the training loop
for epoch in range(10):
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
    # print(epoch,': ', train_loss)
    test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 1: Train Loss: 0.6070, Test Loss: 0.6881
Epoch 2: Train Loss: 0.5161, Test Loss: 0.5760
Epoch 3: Train Loss: 0.5026, Test Loss: 0.5586
Epoch 4: Train Loss: 0.4893, Test Loss: 0.5449
Epoch 5: Train Loss: 0.4749, Test Loss: 0.5314
Epoch 6: Train Loss: 0.4598, Test Loss: 0.5170
Epoch 7: Train Loss: 0.4430, Test Loss: 0.4976
Epoch 8: Train Loss: 0.4280, Test Loss: 0.4848
Epoch 9: Train Loss: 0.4126, Test Loss: 0.4671
Epoch 10: Train Loss: 0.3995, Test Loss: 0.4549


In [70]:
# Looking at predictions
model(X_train_tensor)[:5]

tensor([[2.2735],
        [2.2837],
        [2.2860],
        [2.4414],
        [2.1852]], grad_fn=<SliceBackward0>)

# Batch Gradient Descent

In [71]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [72]:
train_loader = DataLoader(train_data, batch_size=800, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(1000): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 100 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 100: Train Loss: 2.8337, Test Loss: 3.0179
Epoch 200: Train Loss: 1.6479, Test Loss: 1.7949
Epoch 300: Train Loss: 1.0243, Test Loss: 1.1439
Epoch 400: Train Loss: 0.7023, Test Loss: 0.8022
Epoch 500: Train Loss: 0.5381, Test Loss: 0.6240
Epoch 600: Train Loss: 0.4549, Test Loss: 0.5309
Epoch 700: Train Loss: 0.4125, Test Loss: 0.4816
Epoch 800: Train Loss: 0.3905, Test Loss: 0.4548
Epoch 900: Train Loss: 0.3787, Test Loss: 0.4396
Epoch 1000: Train Loss: 0.3720, Test Loss: 0.4306


# Mini-Batch Gradient Descent

In [73]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [74]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.7423, Test Loss: 0.8265
Epoch 100: Train Loss: 0.6125, Test Loss: 0.6781
Epoch 150: Train Loss: 0.5482, Test Loss: 0.6098
Epoch 200: Train Loss: 0.4999, Test Loss: 0.5593
Epoch 250: Train Loss: 0.4634, Test Loss: 0.5215
Epoch 300: Train Loss: 0.4355, Test Loss: 0.4926
Epoch 350: Train Loss: 0.4142, Test Loss: 0.4707
Epoch 400: Train Loss: 0.3981, Test Loss: 0.4543
Epoch 450: Train Loss: 0.3858, Test Loss: 0.4421
Epoch 500: Train Loss: 0.3766, Test Loss: 0.4329


Run all cells till here

Let's quickly run the model using the new techniques we just looked at that is GD with Momentum and Nesterov Momentum. Let's begin with GD with momentum. 

# Gradient Descent with Momentum

In [75]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9)

Here, we've introduced a momentum of 0.9 to the SGD optimizer. Ensure that you add the momentum parameter to the optimizer else the model would simply use the basic SGD.

In [76]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3498, Test Loss: 0.4025
Epoch 100: Train Loss: 0.3434, Test Loss: 0.4005
Epoch 150: Train Loss: 0.3424, Test Loss: 0.4011
Epoch 200: Train Loss: 0.3420, Test Loss: 0.4008
Epoch 250: Train Loss: 0.3417, Test Loss: 0.4011
Epoch 300: Train Loss: 0.3415, Test Loss: 0.4011
Epoch 350: Train Loss: 0.3413, Test Loss: 0.4009
Epoch 400: Train Loss: 0.3411, Test Loss: 0.4006
Epoch 450: Train Loss: 0.3409, Test Loss: 0.4008
Epoch 500: Train Loss: 0.3407, Test Loss: 0.4006


Here, we just observed how with a momentum 0.9 we reached to lower value of loss in a much faster manner! 

Let's quickly run the Nesterov Momentum on our dataset and evaluate the losses.

# Nesterov Momentum

The code is the almost the same as SGD with Momentum, but all you have to do is set the nesterov parameter to true.

In [77]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9, nesterov=True)

In [78]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3506, Test Loss: 0.4093
Epoch 100: Train Loss: 0.3446, Test Loss: 0.4038
Epoch 150: Train Loss: 0.3433, Test Loss: 0.4023
Epoch 200: Train Loss: 0.3427, Test Loss: 0.4013
Epoch 250: Train Loss: 0.3423, Test Loss: 0.4010
Epoch 300: Train Loss: 0.3421, Test Loss: 0.4004
Epoch 350: Train Loss: 0.3418, Test Loss: 0.4007
Epoch 400: Train Loss: 0.3416, Test Loss: 0.4004
Epoch 450: Train Loss: 0.3414, Test Loss: 0.4001
Epoch 500: Train Loss: 0.3412, Test Loss: 0.3999


The loss calculations are almost at par with the GD with momentum. Our final train and test loss stand at VALUE and VALUE.

Things are getting interesting aren’t they. Feel free to revisit the concepts we have covered so far before we move to the next optimizer which is AdaGrad..


# AdaGrad

Next, let's quickly run each of the different optimizers we just looked at starting with AdaGrad. Note that here we've to specify optim.Adagead(model.parameters()) to intialize the model with Adagrad. If you recall, Adagrad adjusts the learning rates of each parameter based on the historical gradients. 

In [79]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.Adagrad(model.parameters())

In [80]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.7225, Test Loss: 0.8269
Epoch 100: Train Loss: 0.4534, Test Loss: 0.5315
Epoch 150: Train Loss: 0.3863, Test Loss: 0.4517
Epoch 200: Train Loss: 0.3679, Test Loss: 0.4270
Epoch 250: Train Loss: 0.3623, Test Loss: 0.4181
Epoch 300: Train Loss: 0.3603, Test Loss: 0.4146
Epoch 350: Train Loss: 0.3594, Test Loss: 0.4129
Epoch 400: Train Loss: 0.3589, Test Loss: 0.4121
Epoch 450: Train Loss: 0.3586, Test Loss: 0.4115
Epoch 500: Train Loss: 0.3583, Test Loss: 0.4112


Adagrad has given us a high intial loss but the final loss values of VALUE on the train data and VALUE on the test data. Next let's try RMSProp!

# RMS Prop

Just like we did with Adagrad earlier, here we need to use optim.RMSprop to intitialize the model with RMSProp. Even though we are going to use the default parameters, RMSprop also has parameters such as, learning rate, momentum etc, which you can feel free to try out! Let's run the code.

In [81]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.RMSprop(model.parameters())

In [82]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3447, Test Loss: 0.4096
Epoch 100: Train Loss: 0.3380, Test Loss: 0.3993
Epoch 150: Train Loss: 0.3374, Test Loss: 0.3993
Epoch 200: Train Loss: 0.3374, Test Loss: 0.3975
Epoch 250: Train Loss: 0.3372, Test Loss: 0.4000
Epoch 300: Train Loss: 0.3376, Test Loss: 0.3975
Epoch 350: Train Loss: 0.3379, Test Loss: 0.4024
Epoch 400: Train Loss: 0.3380, Test Loss: 0.4030
Epoch 450: Train Loss: 0.3415, Test Loss: 0.4082
Epoch 500: Train Loss: 0.3371, Test Loss: 0.4011


With RMSProp we have clearly achieved our lowest loss values so far. We have got final loss values of VALUE on the train data and VALUE on the test data

Next, let's try the Adam Optimizer. For this, we need to use optim.Adam. Take a look at the code.

# Adam

In [83]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.Adam(model.parameters())

In [84]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 1.2104, Test Loss: 1.3366
Epoch 100: Train Loss: 0.4005, Test Loss: 0.4654
Epoch 150: Train Loss: 0.3653, Test Loss: 0.4176
Epoch 200: Train Loss: 0.3629, Test Loss: 0.4143
Epoch 250: Train Loss: 0.3604, Test Loss: 0.4123
Epoch 300: Train Loss: 0.3575, Test Loss: 0.4099
Epoch 350: Train Loss: 0.3545, Test Loss: 0.4078
Epoch 400: Train Loss: 0.3516, Test Loss: 0.4058
Epoch 450: Train Loss: 0.3491, Test Loss: 0.4044
Epoch 500: Train Loss: 0.3469, Test Loss: 0.4031


The Adam optimizer too has given us good overall performance. As per our observations, RMSProp is the best of the lot for optimizing the loss in our case. You've now explored a variety of optimization algorithms, each with unique approaches to navigating the complex landscape of neural network training. With this solid foundation of concepts, I’m sure you're well-equipped to apply them in practice. By thoughtfully selecting and implementing the right optimizer, our goal is to fine-tune our model's learning process for better performance and results. I’ll see you in the next video in which we shall build advanced neural models for real world projects.