In [9]:
# rnn_recommendation_pytorch.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random



In [10]:
# ----------------------------
# Reproducibility
# ----------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

# ----------------------------
# CONFIG
# ----------------------------
CSV_PATH = "/Users/ameen/Documents/M.tech/CBDP/ML/SAMPLE/pg_admissions_dataset.csv"  # path to your dataset
BATCH_SIZE = 64
EPOCHS = 40
LR = 1e-3
EMBED_DIM = 8
HIDDEN_SIZE = 64
TOP_K = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------------------
# LOAD DATA
# ----------------------------
df = pd.read_csv("/Users/ameen/Documents/M.tech/CBDP/ML/SAMPLE/pg_admissions_dataset.csv")

cat_cols = [
    'UG_institute_tier', 'UG_branch', 'GATE_paper', 'Category',
    'program_id', 'university_name', 'dept_specialization',
    'university_tier', 'location_state'
]

num_cols = [
    'UG_percentage', 'Year_of_passing', 'GATE_score', 'GATE_rank',
    'Work_exp_years', 'previous_cutoff_gen', 'previous_cutoff_obc',
    'previous_cutoff_sc', 'previous_cutoff_st', 'min_UG_required',
    'difference_from_cutoff'
]

target_col = 'admitted_flag'

# Clean up
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df[target_col] = df[target_col].astype(int)

# Encode categoricals
cat_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])
    cat_encoders[col] = le

# Scale numeric columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Train-test split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df[target_col])



In [11]:
# ----------------------------
# PyTorch Dataset
# ----------------------------
class AdmissionDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        cat_features = row[cat_cols].values.astype(np.int64)
        num_features = row[num_cols].values.astype(np.float32)
        y = np.float32(row[target_col])
        return torch.tensor(cat_features), torch.tensor(num_features), torch.tensor(y)

train_ds = AdmissionDataset(train_df)
val_ds = AdmissionDataset(val_df)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)



In [12]:
# ----------------------------
# RNN Model Definition
# ----------------------------
class RNNAdmissionModel(nn.Module):
    def __init__(self, cat_cardinalities, num_numeric_features, embed_dim=8, hidden_size=64):
        super().__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(cardinality, embed_dim) for cardinality in cat_cardinalities
        ])
        self.numeric_proj = nn.Linear(num_numeric_features, embed_dim)

        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, cat_x, num_x):
        # cat_x: (batch, num_categorical)
        embedded = [emb(cat_x[:, i]) for i, emb in enumerate(self.embeddings)]
        embedded = torch.stack(embedded, dim=1)  # (batch, num_cat, embed_dim)

        # numeric input -> project to same embed_dim, then add as extra timestep
        num_emb = self.numeric_proj(num_x).unsqueeze(1)  # (batch, 1, embed_dim)

        # concat categorical + numeric timesteps
        seq = torch.cat([embedded, num_emb], dim=1)  # (batch, num_cat+1, embed_dim)
        _, (h_n, _) = self.lstm(seq)
        out = self.fc(h_n.squeeze(0))
        return out



In [13]:
# ----------------------------
# Model Init
# ----------------------------
cat_cardinalities = [df[col].nunique() + 1 for col in cat_cols]
num_numeric_features = len(num_cols)
model = RNNAdmissionModel(cat_cardinalities, num_numeric_features, EMBED_DIM, HIDDEN_SIZE).to(device)

# ----------------------------
# Loss, Optimizer
# ----------------------------
y_train = train_df[target_col].values
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
criterion = nn.BCELoss(weight=weights[1])  # weight positive class more
optimizer = optim.Adam(model.parameters(), lr=LR)



In [14]:
# ----------------------------
# Training Loop
# ----------------------------
def train_one_epoch():
    model.train()
    total_loss = 0
    for cat_x, num_x, y in train_loader:
        cat_x, num_x, y = cat_x.to(device), num_x.to(device), y.to(device).unsqueeze(1)
        optimizer.zero_grad()
        preds = model(cat_x, num_x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
    return total_loss / len(train_loader.dataset)

@torch.no_grad()
def validate():
    model.eval()
    total_loss = 0
    all_preds, all_true = [], []
    for cat_x, num_x, y in val_loader:
        cat_x, num_x, y = cat_x.to(device), num_x.to(device), y.to(device).unsqueeze(1)
        preds = model(cat_x, num_x)
        loss = criterion(preds, y)
        total_loss += loss.item() * len(y)
        all_preds.extend(preds.cpu().numpy().flatten())
        all_true.extend(y.cpu().numpy().flatten())
    auc = 0
    try:
        from sklearn.metrics import roc_auc_score
        auc = roc_auc_score(all_true, all_preds)
    except:
        pass
    return total_loss / len(val_loader.dataset), auc

best_auc = 0
for epoch in range(1, EPOCHS + 1):
    tr_loss = train_one_epoch()
    val_loss, val_auc = validate()
    print(f"Epoch {epoch:02d} | Train Loss: {tr_loss:.4f} | Val Loss: {val_loss:.4f} | Val AUC: {val_auc:.4f}")
    if val_auc > best_auc:
        best_auc = val_auc
        torch.save(model.state_dict(), "best_rnn_model.pt")



Epoch 01 | Train Loss: 0.5564 | Val Loss: 0.5500 | Val AUC: 0.8388
Epoch 02 | Train Loss: 0.5441 | Val Loss: 0.5358 | Val AUC: 0.7994
Epoch 03 | Train Loss: 0.5291 | Val Loss: 0.5188 | Val AUC: 0.7763
Epoch 04 | Train Loss: 0.5081 | Val Loss: 0.4908 | Val AUC: 0.7912
Epoch 05 | Train Loss: 0.4802 | Val Loss: 0.4645 | Val AUC: 0.8089
Epoch 06 | Train Loss: 0.4524 | Val Loss: 0.4375 | Val AUC: 0.8145
Epoch 07 | Train Loss: 0.4223 | Val Loss: 0.3995 | Val AUC: 0.8452
Epoch 08 | Train Loss: 0.3842 | Val Loss: 0.3645 | Val AUC: 0.8769
Epoch 09 | Train Loss: 0.3468 | Val Loss: 0.3365 | Val AUC: 0.9004
Epoch 10 | Train Loss: 0.3032 | Val Loss: 0.3240 | Val AUC: 0.9123
Epoch 11 | Train Loss: 0.2642 | Val Loss: 0.2639 | Val AUC: 0.9438
Epoch 12 | Train Loss: 0.2369 | Val Loss: 0.2443 | Val AUC: 0.9519
Epoch 13 | Train Loss: 0.2247 | Val Loss: 0.2488 | Val AUC: 0.9593
Epoch 14 | Train Loss: 0.2110 | Val Loss: 0.2242 | Val AUC: 0.9598
Epoch 15 | Train Loss: 0.1992 | Val Loss: 0.1966 | Val AUC: 0.

In [15]:
# ----------------------------
# Recommendation Function
# ----------------------------
program_df = df[['program_id', 'university_name', 'dept_specialization',
                 'previous_cutoff_gen', 'previous_cutoff_obc', 'previous_cutoff_sc',
                 'previous_cutoff_st', 'min_UG_required', 'university_tier',
                 'location_state']].drop_duplicates('program_id')

@torch.no_grad()
def recommend_programs(student_input, top_k=TOP_K):
    """
    student_input: dict with student's UG info, GATE, Category, etc.
    """
    model.eval()
    program_ids = program_df['program_id'].tolist()
    preds = []

    for pid in program_ids:
        # build feature vector row
        row = {}
        # Copy student inputs
        for c in ['UG_institute_tier', 'UG_branch', 'GATE_paper', 'Category']:
            row[c] = student_input.get(c, "NA")

        # Add program details
        prog = program_df[program_df['program_id'] == pid].iloc[0]
        row.update({
            'program_id': pid,
            'university_name': prog['university_name'],
            'dept_specialization': prog['dept_specialization'],
            'university_tier': prog['university_tier'],
            'location_state': prog['location_state'],
        })

        # Fill numeric fields
        for nc in num_cols:
            if nc in student_input:
                row[nc] = student_input[nc]
            elif nc in prog:
                row[nc] = prog[nc]
            else:
                row[nc] = 0.0

        # Convert to tensors
        cat_vals = []
        for col in cat_cols:
            val = row[col]
            if col in cat_encoders:
                le = cat_encoders[col]
                if val in le.classes_:
                    val_idx = le.transform([val])[0]
                else:
                    val_idx = 0
            else:
                val_idx = 0
            cat_vals.append(val_idx)

        num_vals = [row[c] for c in num_cols]
        num_vals = scaler.transform([num_vals])[0]

        cat_t = torch.tensor([cat_vals], dtype=torch.long).to(device)
        num_t = torch.tensor([num_vals], dtype=torch.float32).to(device)

        prob = model(cat_t, num_t).item()
        preds.append((pid, prob))

    results = pd.DataFrame(preds, columns=['program_id', 'prob']).merge(program_df, on='program_id', how='left')
    return results.sort_values('prob', ascending=False).head(top_k)

# ----------------------------
# Example Usage
# ----------------------------
example_student = {
    'UG_institute_tier': 'Tier-II',
    'UG_branch': 'CSE',
    'UG_percentage': 78.5,
    'Year_of_passing': 2023,
    'GATE_paper': 'CSE',
    'GATE_score': 32.0,
    'GATE_rank': 1200,
    'Category': 'General',
    'Work_exp_years': 0
}

recs = recommend_programs(example_student, TOP_K)
print("\nTop Recommendations:")
print(recs[['program_id', 'university_name', 'dept_specialization', 'prob']])



Top Recommendations:
    program_id  university_name  dept_specialization      prob
8           11               10                    3  0.999342
18          15                9                    3  0.999333
17          16               12                    4  0.999309
11          12               13                    3  0.999283
6            8               11                    4  0.999267




*** streamlit ***

13
