<a href="https://colab.research.google.com/github/abhigoel25/StudentPerformancePredictionModel/blob/main/StudentPerformancePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from google.colab import files
import io

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms, models

In [None]:
# Open Colab's upload dialog; `uploaded` maps each chosen filename to its raw bytes.
uploaded = files.upload()
# Report only name and size -- printing the dict itself would dump the whole file's bytes.
for uploaded_name, uploaded_bytes in uploaded.items():
    print(f"{uploaded_name}: {len(uploaded_bytes)} bytes")

In [None]:
# Parse the uploaded CSV bytes into a DataFrame. The dictionary key is the uploaded
# file's name (note the space before the second underscore).
csv_bytes = uploaded["Student_performance_data _.csv"]
df = pd.read_csv(io.BytesIO(csv_bytes))
df

In [None]:
# Remove the StudentID column from the frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns = ['StudentID'], errors = 'ignore')
df

In [None]:
# Expand the listed columns into one indicator column per distinct value,
# then cast every column of the resulting frame to float.
columns_to_encode = ['Ethnicity', 'ParentalEducation']
df = (
    pd.get_dummies(df, columns = columns_to_encode)
    .astype(float)
)
df

In [None]:
# Shows that better GPA corresponds to better grade class
# (grade class of 0 is best grade class).
gpa = df['GPA']
grades = df['GradeClass']

fig, ax = plt.subplots()
ax.scatter(gpa, grades)
ax.set_xlabel('GPA')
ax.set_ylabel('Grade')
ax.set_title('Scatter Plot of GPA vs Grade Class for Students')
plt.show()

In [None]:
# Standardize the continuous columns FIRST. The feature matrix `x` below is a copy
# taken via `.values`, so scaling `df` after extracting `x` would never reach the
# data the model is trained on.
# NOTE(review): the scaler is fit on every row, i.e. before the train/dev/test split;
# fit it on the training rows only if a strictly leak-free evaluation is required.
scaler = StandardScaler()
columns_to_standardize = ['Age', 'StudyTimeWeekly', 'Absences']
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Target is the GradeClass column; every remaining column becomes a feature.
# NOTE(review): GPA stays among the features although the scatter plot of GPA vs
# GradeClass suggests it largely determines the target -- confirm this is intended.
y = df['GradeClass']
x = df.drop(columns = ['GradeClass']).values

print("x.shape", x.shape)
print("y.shape", y.shape)

# Sanity check: both lines should print False (no missing values).
print(np.isnan(x).any())
print(np.isnan(y).any())

df

In [None]:
class CustomTransform():
    """Callable that converts a raw ``(features, label)`` pair into float32 tensors."""

    def __call__(self, sample):
        """Return ``(features_tensor, label_tensor)``, both with dtype ``torch.float32``.

        ``sample`` must unpack into exactly two items: the features and the label.
        """
        features, label = sample
        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32),
        )

class CustomDataset(Dataset):
    """Dataset pairing ``x[idx]`` with ``y[idx]`` and optionally transforming the pair."""

    def __init__(self, x, y, transform):
        """Store the indexable features ``x``, labels ``y`` and the pair ``transform``.

        A falsy ``transform`` (e.g. ``None``) means pairs are returned untouched.
        """
        self.x, self.y, self.transform = x, y, transform

    def __len__(self):
        """Number of examples, taken from the length of ``x``."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the example at ``idx``, passed through ``transform`` when one is set."""
        pair = (self.x[idx], self.y[idx])
        return self.transform(pair) if self.transform else pair

In [None]:
# Wrap the feature matrix and the label column in a Dataset whose items are
# converted to float32 tensors by CustomTransform.
transform = CustomTransform()
dataset = CustomDataset(x=x, y=y, transform=transform)


In [None]:
SPLIT_SEED = 42   # fixes which rows land in train / dev / test across re-runs
BATCH_SIZE = 30

# 70 % train, 15 % dev; the test split takes whatever remains so the sizes sum to len(dataset).
train_size = int(0.7 * len(dataset))
dev_size = int(0.15 * len(dataset))
test_size = len(dataset) - dev_size - train_size

print(train_size)
print(dev_size)
print(test_size)

# A dedicated, seeded generator makes the split reproducible; without it every
# execution of this cell shuffles rows between the three splits.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, dev_dataset, test_dataset = random_split(
    dataset, [train_size, dev_size, test_size], generator=split_generator
)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) # training set
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False) # validation set
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Peek at the first six training batches to confirm shapes and contents.
for i, (batch_x, batch_y) in enumerate(train_dataloader):
    print("x.shape = {}".format(batch_x.shape), "y.shape = {}".format(batch_y.shape))
    print(batch_x)
    print(batch_y)
    if i >= 5:
      break

In [None]:
class StudentPerformanceNN(nn.Module):
  """Fully connected classifier: three Linear -> BatchNorm1d -> ReLU stages, then a 5-way output layer.

  ``forward`` returns raw logits of shape ``[batch, 5]``. ``nn.CrossEntropyLoss``
  applies log-softmax internally, so feeding it probabilities would apply softmax
  twice and flatten the gradients. Use ``self.softmax(logits)`` when probabilities
  are needed; the arg-max class is the same for logits and probabilities.
  """

  def __init__(self, input_size):
    """Build the layers; ``input_size`` is the number of features per example."""
    super(StudentPerformanceNN, self).__init__()
    self.fc1 = nn.Linear(input_size, 128)
    self.bn1 = nn.BatchNorm1d(128)
    self.fc2 = nn.Linear(128, 64)
    self.bn2 = nn.BatchNorm1d(64)
    self.fc3 = nn.Linear(64, 32)
    self.bn3 = nn.BatchNorm1d(32)
    self.fc4 = nn.Linear(32, 5)
    self.relu = nn.ReLU()
    # Not used in forward(); kept so callers can turn logits into probabilities.
    # dim=1 normalizes across the class axis (the implicit-dim form is deprecated).
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    """Map a ``[batch, input_size]`` float tensor to ``[batch, 5]`` class logits."""
    x = self.relu(self.bn1(self.fc1(x)))
    x = self.relu(self.bn2(self.fc2(x)))
    x = self.relu(self.bn3(self.fc3(x)))
    return self.fc4(x)

In [None]:
# Per-epoch history, filled by the training loop and plotted afterwards
train_accuracies, val_accuracies = [], []
train_loss, val_loss = [], []

# Model sized to the feature matrix, plus its loss function and optimizer
input_size = x.shape[1]
model = StudentPerformanceNN(input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0009)


# Number of passes over the training set
num_epochs = 50

for epoch in range(num_epochs):
  # ---------------- Training pass ----------------
  # Re-enable training behaviour every epoch: the validation pass below switches the
  # model to eval mode, and BatchNorm must use batch statistics while training.
  model.train()
  correct_train = 0
  total_train = 0
  running_train_loss = 0.0   # sum of per-example losses over the epoch

  for step, (batch_x, batch_y) in enumerate(train_dataloader, start=1):
    # CrossEntropyLoss expects integer class indices of shape [batch_size]
    batch_y = batch_y.view(-1).long()

    # Forward pass
    outputs = model(batch_x)   # shape [batch_size, num_classes]
    loss = criterion(outputs, batch_y)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Training accuracy: the predicted class is the index of the largest output
    predicted = outputs.argmax(dim=1)
    batch_count = batch_y.size(0)
    total_train += batch_count
    correct_train += (predicted == batch_y).sum().item()
    running_train_loss += loss.item() * batch_count

    if step % 25 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{len(train_dataloader)}], Loss: {loss.item():.4f}')

  train_accuracy = 100 * correct_train / total_train
  train_accuracies.append(train_accuracy)
  # Record the epoch-mean loss rather than whatever the last mini-batch produced
  train_loss.append(running_train_loss / total_train)
  print(f"Train Accuracy: {train_accuracy:.2f}%")

  # ---------------- Validation pass ----------------
  model.eval()
  # Counters are initialised once per epoch so every dev batch contributes
  correct_val = 0
  total_val = 0
  running_val_loss = 0.0

  with torch.no_grad():
    for step, (batch_x, batch_y) in enumerate(dev_dataloader, start=1):
      batch_y = batch_y.view(-1).long()
      outputs = model(batch_x)
      loss = criterion(outputs, batch_y)

      predicted = outputs.argmax(dim=1)
      batch_count = batch_y.size(0)
      total_val += batch_count
      correct_val += (predicted == batch_y).sum().item()
      running_val_loss += loss.item() * batch_count

      if step % 10 == 0:
          print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{len(dev_dataloader)}], Loss: {loss.item():.4f}')

  val_accuracy = 100 * correct_val / total_val
  val_accuracies.append(val_accuracy)
  val_loss.append(running_val_loss / total_val)

  print(f"Val Accuracy: {val_accuracy:.2f}%")


# Shared x-axis: epochs numbered from 1
epoch_axis = range(1, num_epochs+1)

# Accuracy history
plt.figure(figsize=(10, 5))
for history, curve_label in ((train_accuracies, 'Train Accuracy'),
                             (val_accuracies, 'Validation Accuracy')):
    plt.plot(epoch_axis, history, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Train and Validation Accuracy over Epochs')
plt.legend()
plt.show()

# Loss history
plt.figure(figsize=(10, 5))
for history, curve_label in ((train_loss, 'Train Loss'),
                             (val_loss, 'Validation Loss')):
    plt.plot(epoch_axis, history, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Cross Entropy Loss')
plt.title('Train and Validation Loss over Epochs')
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_curve
from sklearn.preprocessing import label_binarize

In [None]:
# Test Loop
# Evaluates the trained model on the whole test split and records:
#   test_loss          mean cross-entropy per test example
#   test_accuracy      overall accuracy in percent (also appended to test_accuracies)
#   class_accuracies   per-class accuracy as a fraction, one entry per class id
test_accuracies = []
model.eval()
correct_test = 0
total_test = 0
test_loss = 0.0
num_classes = 5
target_batches = []
prediction_batches = []

with torch.no_grad():
  for batch_x, batch_y in test_dataloader:
    # CrossEntropyLoss expects integer class indices of shape [batch_size]
    batch_y = batch_y.view(-1).long()
    outputs = model(batch_x)
    batch_count = batch_y.size(0)

    # Accumulate instead of overwriting, so every batch counts towards the totals
    test_loss += criterion(outputs, batch_y).item() * batch_count

    # The predicted class is the index of the largest output
    predicted = outputs.argmax(dim=1)
    total_test += batch_count
    correct_test += (predicted == batch_y).sum().item()

    target_batches.append(batch_y.cpu().numpy())
    prediction_batches.append(predicted.cpu().numpy())

y_target = np.concatenate(target_batches)
y_prediction = np.concatenate(prediction_batches)

test_loss /= total_test
test_accuracy = 100 * correct_test / total_test
test_accuracies.append(test_accuracy)

# Per-class accuracy over the full test split (not just the final batch).
class_accuracies = []
for class_id in range(num_classes):
  class_mask = y_target == class_id
  if class_mask.any():
    class_accuracies.append(float((y_prediction[class_mask] == class_id).mean()))
  else:
    # No test example of this class: nothing to score, so the bar stays empty
    class_accuracies.append(0.0)

plt.figure(figsize=(10, 6))
plt.bar(range(num_classes), class_accuracies)
plt.xlabel('Class')
plt.ylabel('Accuracy')
plt.title('Class-wise Accuracy')
plt.xticks(range(num_classes))
plt.show()

# Use the recorded history for the train figure: a later cell rebinds `train_accuracy`
# to a 0-1 fraction, which would be misreported as a percentage here on a re-run.
print(f'Epoch [{epoch+1}/{num_epochs}], Test Loss: {test_loss:.4f}, Train Accuracy: {train_accuracies[-1]:.2f}%, Test Accuracy: {test_accuracy:.2f}%')

# Confusion matrix over the full test split; `labels` pins it to num_classes x num_classes
# even when some class never appears.
cm = confusion_matrix(y_target, y_prediction, labels=list(range(num_classes)))
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Latest recorded train / validation / test accuracy, rescaled from percent to the 0-1 range
train_accuracy, validation_accuracy, test_accuracy = (
    history[-1]/100 for history in (train_accuracies, val_accuracies, test_accuracies)
)

# Bar labels and their heights, in matching order
labels = ['Train Accuracy', 'Validation Accuracy', 'Test Accuracy']
accuracies = [train_accuracy, validation_accuracy, test_accuracy]

# One bar per split on a fixed 0-1 axis
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(labels, accuracies, color=['blue', 'green', 'red'])
ax.set_ylim(0, 1)
ax.set_xlabel('Accuracy Type')
ax.set_ylabel('Accuracy')
ax.set_title('Train, Validation, and Test Accuracy')
plt.show()
