<a href="https://colab.research.google.com/github/ash-iiiiish/New-Project/blob/main/Exam_score_Prediction_Competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Student Scores: Competition Task

## Importing Libraries

In [None]:
import pandas as pd

## Data Collection

In [None]:
# Read the competition's train and test splits from the Colab working directory.
train_csv_path = '/content/train.csv'
test_csv_path = '/content/test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Quick sanity check: (rows, columns) of each split.
print(train.shape, test.shape)


(630000, 13) (270000, 12)


## Prepare Data and Preprocessor




In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

print("Columns in train DataFrame:", train.columns.tolist())

# Preferred target name; when it is absent the target is inferred below.
target_column_name = 'score'

if target_column_name not in train.columns:
    print(f"Warning: '{target_column_name}' column not found directly in train DataFrame. Attempting to infer target column...")
    # Fallback: the target is taken to be the single column that exists in
    # train but not in test, ignoring the 'id' identifier column.
    train_cols_set = set(train.columns)
    test_cols_set = set(test.columns)
    potential_target_cols = list(train_cols_set - test_cols_set - {'id'})

    if len(potential_target_cols) == 1:
        target_column_name = potential_target_cols[0]
        print(f"Inferred target column name: '{target_column_name}'")
    elif len(potential_target_cols) > 1:
        raise KeyError(f"Could not uniquely identify the target column. Expected 'score' or a single unique column in train not in test. Found multiple potential target columns: {potential_target_cols}")
    else:
        raise KeyError(f"Could not identify the target column. Neither '{target_column_name}' nor a unique column from train vs. test was found.")

y = train[target_column_name]

# Every train column except the identifier and the target is a candidate feature.
all_features = [col for col in train.columns if col not in ['id', target_column_name]]

# Split the features by dtype: int64/float64 columns are scaled,
# object/bool columns are one-hot encoded.
numerical_cols = train[all_features].select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = train[all_features].select_dtypes(include=['object', 'bool']).columns.tolist()

# Fixed column order shared by the train and test feature frames.
feature_cols = numerical_cols + categorical_cols

# Work on copies so the raw `train` / `test` frames stay untouched.
X_full = train[feature_cols].copy()
X_test = test[feature_cols].copy()

# Keep the test identifiers separately; they are not used as a feature.
test_ids = test['id']

numerical_transformer = StandardScaler()
# handle_unknown='ignore': categories that appear only in the test split are
# encoded as all-zero rows instead of raising during transform.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# sparse_threshold=0 makes the transformer always return a dense array.
# With the default (0.3) the output silently becomes a SciPy sparse matrix
# whenever the one-hot block is sparse enough, and the later
# `torch.tensor(...)` conversion in `objective` cannot consume that.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0,
)

# Fit on the training features only, then apply the same fitted transform to
# the test features so both share scaling statistics and category layout.
X_full_processed = preprocessor.fit_transform(X_full)
X_test_processed = preprocessor.transform(X_test)

print("X_full_processed shape:", X_full_processed.shape)
print("X_test_processed shape:", X_test_processed.shape)
print("test_ids head:", test_ids.head())

Columns in train DataFrame: ['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty', 'exam_score']
Inferred target column name: 'exam_score'
X_full_processed shape: (630000, 30)
X_test_processed shape: (270000, 30)
test_ids head: 0    630000
1    630001
2    630002
3    630003
4    630004
Name: id, dtype: int64


## Define PyTorch MLP Regressor Model



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLPRegressor(nn.Module):
    """Fully connected regressor that outputs one value per input row.

    The network is a stack of ``Linear -> ReLU -> Dropout`` stages followed by
    a final ``Linear`` layer with a single output unit.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        """Build the layer stack.

        Args:
            input_size: Number of input features per row.
            hidden_size: Width of every hidden layer.
            num_layers: Number of hidden layers. The first hidden layer is
                always created; ``num_layers - 1`` further ones follow it.
            dropout_rate: Probability passed to ``nn.Dropout``; the same
                dropout module is applied after every hidden layer.
        """
        super().__init__()

        # Input width of each hidden layer: the first one consumes the raw
        # features, every later one consumes the previous hidden activations.
        in_widths = [input_size] + [hidden_size] * (num_layers - 1)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(in_width, hidden_size) for in_width in in_widths]
        )

        self.dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run ``x`` through every hidden stage and the output layer."""
        activations = x
        for linear in self.hidden_layers:
            activations = self.dropout(torch.relu(linear(activations)))
        return self.output_layer(activations)

print("MLPRegressor class defined successfully.")

MLPRegressor class defined successfully.


## Implement Optuna Objective Function



In [None]:
!pip install optuna
import optuna
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    """Train one MLPRegressor configuration and return its validation RMSE.

    The function samples hyperparameters from ``trial``, trains on an 80/20
    split of ``X_full_processed`` / ``y`` for a fixed number of epochs, and
    reports the validation RMSE after every epoch so the trial can be pruned.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate validation scores.

    Returns:
        The validation RMSE measured after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: If ``trial.should_prune()`` is true
            after an epoch's score has been reported.
    """
    # Hyperparameter search space.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    hidden_size = trial.suggest_int('hidden_size', 32, 256, step=32)
    num_layers = trial.suggest_int('num_layers', 1, 4)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5, step=0.05)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    l2_lambda = trial.suggest_float('l2_lambda', 1e-7, 1e-5, log=True)

    # Fixed random_state: every trial is scored on the same validation rows.
    X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
        X_full_processed, y.values, test_size=0.2, random_state=42
    )

    # Targets are reshaped to (n, 1) to match the model's (batch, 1) output.
    X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32).reshape(-1, 1)
    X_val_tensor = torch.tensor(X_val_np, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val_np, dtype=torch.float32).reshape(-1, 1)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    input_size = X_full_processed.shape[1]

    model = MLPRegressor(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout_rate=dropout_rate
    )

    criterion = nn.MSELoss()

    # The sampled l2_lambda is applied as AdamW's weight decay.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=l2_lambda)

    n_epochs = 20  # Fixed number of epochs for Optuna trials
    for epoch in range(n_epochs):
        model.train()  # Enables dropout
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Score the whole validation split once per epoch.
        model.eval()  # Disables dropout
        val_predictions = []
        val_targets = []
        with torch.no_grad():
            for X_batch_val, y_batch_val in val_loader:
                outputs_val = model(X_batch_val)
                # reshape(-1) instead of squeeze(): squeeze() collapses a
                # final batch of one row to a 0-d tensor, whose .tolist() is
                # a bare float that list.extend() rejects with a TypeError.
                val_predictions.extend(outputs_val.reshape(-1).tolist())
                val_targets.extend(y_batch_val.reshape(-1).tolist())

        val_rmse = np.sqrt(mean_squared_error(val_targets, val_predictions))

        # Report the per-epoch score so the study can prune this trial.
        trial.report(val_rmse, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Score of the last completed epoch.
    return val_rmse

print("Optuna objective function defined successfully.")

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0
Optuna objective function defined successfully.


## Run Optuna Study for Hyperparameter Tuning

In [None]:
import optuna

# Search for the hyperparameters that minimise the value returned by
# `objective` (validation RMSE) over 50 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Summarise the search: how many trials ran and the best score found.
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial's value (RMSE): {study.best_value}")

# Keep the winning configuration in `best_params` and list it.
best_params = study.best_params
print("Best trial's hyperparameters:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

[I 2026-01-05 15:00:49,328] A new study created in memory with name: no-name-6ee140b7-ca97-4a3d-b65f-a473455e0355
[I 2026-01-05 15:21:31,812] Trial 0 finished with value: 9.079089542522404 and parameters: {'learning_rate': 1.2500421558343713e-05, 'hidden_size': 64, 'num_layers': 3, 'dropout_rate': 0.4, 'batch_size': 16, 'l2_lambda': 2.585251181477964e-07}. Best is trial 0 with value: 9.079089542522404.
[I 2026-01-05 15:25:58,384] Trial 1 finished with value: 8.978055131072301 and parameters: {'learning_rate': 0.00024268397741331475, 'hidden_size': 64, 'num_layers': 3, 'dropout_rate': 0.35000000000000003, 'batch_size': 128, 'l2_lambda': 7.753951690215961e-06}. Best is trial 1 with value: 8.978055131072301.
[I 2026-01-05 15:52:58,626] Trial 2 finished with value: 8.903771501957143 and parameters: {'learning_rate': 4.641570712658341e-05, 'hidden_size': 256, 'num_layers': 2, 'dropout_rate': 0.45, 'batch_size': 16, 'l2_lambda': 4.384373045329192e-06}. Best is trial 2 with value: 8.903771501