In [1]:
import pandas as pd

In [2]:
# Loading data
data = pd.read_csv('Prodigy University Dataset.csv')
# Split the data into features (X) and target (y)
data.head()

Unnamed: 0,sat_sum,hs_gpa,fy_gpa
0,508,3.4,3.18
1,488,4.0,3.33
2,464,3.75,3.25
3,380,3.75,2.42
4,428,4.0,2.63


In [3]:
# Converting data to numpy
X = data[['sat_sum', 'hs_gpa']].values
# reshape the fy_gpa into a 2D array with [data_size] rows and 1 column
y = data['fy_gpa'].values.reshape(-1, 1)
print(X.shape)
print(y.shape)

(1000, 2)
(1000, 1)


In [4]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler

# Normalize the features so that it is easier to train the data
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.fit_transform(X_test)

In [6]:
X_train.shape

(800, 2)

In [7]:
import torch
# Convert numpy to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [8]:
import torch.nn as nn

In [9]:
# Building model with 2 neurons
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)

In [10]:
# Forward Propagation
preds = model(X_train_tensor)

In [11]:
preds[:5]

tensor([[-0.2094],
        [-0.1946],
        [-0.0843],
        [-0.1771],
        [-0.1015]], grad_fn=<SliceBackward0>)

In [12]:
from torch.nn import MSELoss

In [13]:
# Calculating Loss
criterion = MSELoss()
loss = criterion(preds, y_train_tensor)
print(loss)
# to learners: You may get different values

tensor(7.3339, grad_fn=<MseLossBackward0>)


# Comparing predictions on X_train with Target

In [14]:
preds[:5]

tensor([[-0.2094],
        [-0.1946],
        [-0.0843],
        [-0.1771],
        [-0.1015]], grad_fn=<SliceBackward0>)

In [15]:
y_train_tensor[:5]

tensor([[2.0000],
        [3.1100],
        [1.6300],
        [3.0200],
        [1.5500]])

In [16]:
model[0].weight

Parameter containing:
tensor([[ 0.2805, -0.5674],
        [-0.1199, -0.6597]], requires_grad=True)

In [17]:
model[2].weight

Parameter containing:
tensor([[0.3679, 0.1311]], requires_grad=True)

---

In [18]:
# One step of updating Weights

In [19]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [20]:
loss.backward()
optimizer.step()

In [21]:
model[0].weight

Parameter containing:
tensor([[ 0.2806, -0.5674],
        [-0.1199, -0.6597]], requires_grad=True)

In [22]:
model[2].weight

Parameter containing:
tensor([[0.3700, 0.1337]], requires_grad=True)

In [23]:
from torch.utils.data import TensorDataset, DataLoader

In [24]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)

In [25]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [26]:
# performance on train  and test sets  before training
train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
print(f'Without Training:\nTrain Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Without Training:
Train Loss: 8.7319, Test Loss: 9.0349


In [27]:
# Looking at predictions
model(X_train_tensor)[:5]

tensor([[-0.3023],
        [-0.3038],
        [-0.2824],
        [-0.3749],
        [-0.2386]], grad_fn=<SliceBackward0>)

# Stochastic Gradient Descent

In [28]:
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
# Execute the training loop
for epoch in range(10):
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
    # print(epoch,': ', train_loss)
    test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
    print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 1: Train Loss: 0.4841, Test Loss: 0.5649
Epoch 2: Train Loss: 0.3706, Test Loss: 0.4299
Epoch 3: Train Loss: 0.3623, Test Loss: 0.4170
Epoch 4: Train Loss: 0.3572, Test Loss: 0.4150
Epoch 5: Train Loss: 0.3536, Test Loss: 0.4099
Epoch 6: Train Loss: 0.3509, Test Loss: 0.4078
Epoch 7: Train Loss: 0.3492, Test Loss: 0.4094
Epoch 8: Train Loss: 0.3472, Test Loss: 0.4068
Epoch 9: Train Loss: 0.3461, Test Loss: 0.4048
Epoch 10: Train Loss: 0.3453, Test Loss: 0.4047


In [29]:
# Looking at predictions
model(X_train_tensor)[:5]

tensor([[2.2027],
        [2.2019],
        [2.0986],
        [2.4425],
        [1.9764]], grad_fn=<SliceBackward0>)

# Batch Gradient Descent

In [30]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [31]:
train_loader = DataLoader(train_data, batch_size=800, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(1000): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 100 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 100: Train Loss: 3.4033, Test Loss: 3.6030
Epoch 200: Train Loss: 1.9215, Test Loss: 2.0787
Epoch 300: Train Loss: 1.1776, Test Loss: 1.3044
Epoch 400: Train Loss: 0.8047, Test Loss: 0.9098
Epoch 500: Train Loss: 0.6182, Test Loss: 0.7079
Epoch 600: Train Loss: 0.5249, Test Loss: 0.6037
Epoch 700: Train Loss: 0.4779, Test Loss: 0.5489
Epoch 800: Train Loss: 0.4537, Test Loss: 0.5194
Epoch 900: Train Loss: 0.4408, Test Loss: 0.5026
Epoch 1000: Train Loss: 0.4335, Test Loss: 0.4926


# Mini-Batch Gradient Descent

In [32]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [33]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.5692, Test Loss: 0.6446
Epoch 100: Train Loss: 0.4402, Test Loss: 0.4949
Epoch 150: Train Loss: 0.3982, Test Loss: 0.4508
Epoch 200: Train Loss: 0.3750, Test Loss: 0.4278
Epoch 250: Train Loss: 0.3619, Test Loss: 0.4152
Epoch 300: Train Loss: 0.3544, Test Loss: 0.4086
Epoch 350: Train Loss: 0.3503, Test Loss: 0.4051
Epoch 400: Train Loss: 0.3479, Test Loss: 0.4031
Epoch 450: Train Loss: 0.3465, Test Loss: 0.4022
Epoch 500: Train Loss: 0.3457, Test Loss: 0.4018


Run all cells till here

Let's quickly run the model using the new techniques we just looked at that is GD with Momentum and Nesterov Momentum. Let's begin with GD with momentum. 

# Gradient Descent with Momentum

In [34]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9)

Here, we've introduced a momentum of 0.9 to the SGD optimizer. Ensure that you add the momentum parameter to the optimizer else the model would simply use the basic SGD.

In [35]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3634, Test Loss: 0.4161
Epoch 100: Train Loss: 0.3498, Test Loss: 0.4048
Epoch 150: Train Loss: 0.3479, Test Loss: 0.4036
Epoch 200: Train Loss: 0.3466, Test Loss: 0.4026
Epoch 250: Train Loss: 0.3455, Test Loss: 0.4019
Epoch 300: Train Loss: 0.3446, Test Loss: 0.4014
Epoch 350: Train Loss: 0.3439, Test Loss: 0.4012
Epoch 400: Train Loss: 0.3433, Test Loss: 0.4006
Epoch 450: Train Loss: 0.3427, Test Loss: 0.4008
Epoch 500: Train Loss: 0.3423, Test Loss: 0.4008


Here, we just observed how with a momentum 0.9 we reached to lower value of loss in a much faster manner! 

Let's quickly run the Nesterov Momentum on our dataset and evaluate the losses.

# Nesterov Momentum

The code is the almost the same as SGD with Momentum, but all you have to do is set the nesterov parameter to true.

In [36]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.9, nesterov=True)

In [37]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3617, Test Loss: 0.4159
Epoch 100: Train Loss: 0.3480, Test Loss: 0.4049
Epoch 150: Train Loss: 0.3462, Test Loss: 0.4051
Epoch 200: Train Loss: 0.3453, Test Loss: 0.4046
Epoch 250: Train Loss: 0.3446, Test Loss: 0.4042
Epoch 300: Train Loss: 0.3440, Test Loss: 0.4041
Epoch 350: Train Loss: 0.3435, Test Loss: 0.4040
Epoch 400: Train Loss: 0.3431, Test Loss: 0.4035
Epoch 450: Train Loss: 0.3427, Test Loss: 0.4034
Epoch 500: Train Loss: 0.3424, Test Loss: 0.4028


The loss calculations are almost at par with the GD with momentum. Our final train and test loss stand at VALUE and VALUE.

Things are getting interesting aren’t they. Feel free to revisit the concepts we have covered so far before we move to the next optimizer which is AdaGrad..


# AdaGrad

Next, let's quickly run each of the different optimizers we just looked at starting with AdaGrad. Note that here we've to specify optim.Adagead(model.parameters()) to intialize the model with Adagrad. If you recall, Adagrad adjusts the learning rates of each parameter based on the historical gradients. 

In [38]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.Adagrad(model.parameters())

In [39]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 4.5387, Test Loss: 4.7627
Epoch 100: Train Loss: 2.8909, Test Loss: 3.0783
Epoch 150: Train Loss: 1.9554, Test Loss: 2.1167
Epoch 200: Train Loss: 1.3775, Test Loss: 1.5177
Epoch 250: Train Loss: 1.0100, Test Loss: 1.1330
Epoch 300: Train Loss: 0.7737, Test Loss: 0.8825
Epoch 350: Train Loss: 0.6219, Test Loss: 0.7193
Epoch 400: Train Loss: 0.5249, Test Loss: 0.6129
Epoch 450: Train Loss: 0.4632, Test Loss: 0.5438
Epoch 500: Train Loss: 0.4242, Test Loss: 0.4989


Adagrad has given us a high intial loss but the final loss values of VALUE on the train data and VALUE on the test data. Next let's try RMSProp!

# RMS Prop

Just like we did with Adagrad earlier, here we need to use optim.RMSprop to intitialize the model with RMSProp. Even though we are going to use the default parameters, RMSprop also has parameters such as, learning rate, momentum etc, which you can feel free to try out! Let's run the code.

In [40]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.RMSprop(model.parameters())

In [41]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 0.3422, Test Loss: 0.4019
Epoch 100: Train Loss: 0.3423, Test Loss: 0.3995
Epoch 150: Train Loss: 0.3426, Test Loss: 0.4072
Epoch 200: Train Loss: 0.3394, Test Loss: 0.4010
Epoch 250: Train Loss: 0.3410, Test Loss: 0.3983
Epoch 300: Train Loss: 0.3389, Test Loss: 0.4006
Epoch 350: Train Loss: 0.3392, Test Loss: 0.4004
Epoch 400: Train Loss: 0.3389, Test Loss: 0.4011
Epoch 450: Train Loss: 0.3396, Test Loss: 0.4029
Epoch 500: Train Loss: 0.3399, Test Loss: 0.4042


With RMSProp we have clearly achieved our lowest loss values so far. We have got final loss values of VALUE on the train data and VALUE on the test data

Next, let's try the Adam Optimizer. For this, we need to use optim.Adam. Take a look at the code.

# Adam

In [42]:
# Reinitialising model weights
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1)
)
optimizer = optim.Adam(model.parameters())

In [43]:
train_loader = DataLoader(train_data, batch_size= 64, shuffle=True) #800 is the number of samples in train set
# Execute the training loop
for epoch in range(500): # increasing the epochs for effective training
    for X_batch, y_batch in train_loader:
        # Forward pass
        pred = model(X_batch)
        loss = criterion(pred, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 50 == 0: # printing after every 100 epochs
        train_loss = criterion(model(X_train_tensor), y_train_tensor).item()
        # print(epoch,': ', train_loss)
        test_loss = criterion(model(X_test_tensor), y_test_tensor).item()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch 50: Train Loss: 1.0507, Test Loss: 1.1651
Epoch 100: Train Loss: 0.3815, Test Loss: 0.4432
Epoch 150: Train Loss: 0.3593, Test Loss: 0.4122
Epoch 200: Train Loss: 0.3582, Test Loss: 0.4109
Epoch 250: Train Loss: 0.3569, Test Loss: 0.4101
Epoch 300: Train Loss: 0.3554, Test Loss: 0.4089
Epoch 350: Train Loss: 0.3536, Test Loss: 0.4077
Epoch 400: Train Loss: 0.3519, Test Loss: 0.4068
Epoch 450: Train Loss: 0.3502, Test Loss: 0.4053
Epoch 500: Train Loss: 0.3486, Test Loss: 0.4048


The Adam optimizer too has given us good overall performance. As per our observations, RMSProp is the best of the lot for optimizing the loss in our case. You've now explored a variety of optimization algorithms, each with unique approaches to navigating the complex landscape of neural network training. With this solid foundation of concepts, I’m sure you're well-equipped to apply them in practice. By thoughtfully selecting and implementing the right optimizer, our goal is to fine-tune our model's learning process for better performance and results. I’ll see you in the next video in which we shall build advanced neural models for real world projects.