In [11]:
# Cell 1: Imports & Data Preprocessing

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(path: str,
                    test_size: float = 0.2,
                    random_state: int = 42):
    """
    1) Load CSV
    2) Drop identifier/leakage columns
    3) Map target to binary
    4) Drop rows with missing core columns
    5) Split into train/test
    6) Scale numeric features
    """
    df = pd.read_csv(path)

    # drop leakage columns
    drop_cols = [
        "student_id","destination_city","university_name",
        "course_name","placement_country","placement_company",
        "starting_salary_usd"
    ]
    df = df.drop(drop_cols, axis=1)

    # binary target
    df["placement_status"] = df["placement_status"]\
        .map({"Placed":1,"Not Placed":0})

    # drop missing in core
    core = [
        "placement_status","gpa_or_score","test_score",
        "field_of_study","origin_country","destination_country",
        "scholarship_received","enrollment_reason",
        "language_proficiency_test","visa_status",
        "post_graduation_visa","graduation_year",
        "year_of_enrollment"
    ]
    df = df.dropna(subset=core)

    # split
    X = df.drop("placement_status", axis=1)
    y = df["placement_status"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size,
        random_state=random_state, stratify=y
    )

    # scale numeric
    num_cols = ["gpa_or_score","test_score","year_of_enrollment","graduation_year"]
    scaler = StandardScaler().fit(X_train[num_cols])
    X_train[num_cols] = scaler.transform(X_train[num_cols])
    X_test[num_cols]  = scaler.transform(X_test[num_cols])

    return X_train, X_test, y_train, y_test


In [12]:
# Cell 2: Feature Engineering

import pandas as pd
from sklearn.cluster import KMeans

def engineer_features(X_train: pd.DataFrame,
                      X_test:  pd.DataFrame):
    """
    1) interactions, ratios, polynomials
    2) bucketize & count-encode & combine cats
    3) clustering
    4) one-hot & align
    """
    # apply to both
    for df in (X_train, X_test):
        df["study_duration"]       = df["graduation_year"] - df["year_of_enrollment"]
        df["gpa_test_interaction"] = df["gpa_or_score"] * df["test_score"]
        df["study_duration_sq"]    = df["study_duration"] ** 2
        df["gpa_test_ratio"]       = df["gpa_or_score"] / (df["test_score"]+1e-3)

        # buckets
        df["gpa_bucket"]  = pd.qcut(df["gpa_or_score"],3,labels=False)
        df["test_bucket"] = pd.qcut(df["test_score"],3,labels=False)

        # count-encode
        freq = df["enrollment_reason"].value_counts()
        df["enroll_reason_count"] = df["enrollment_reason"]\
            .map(freq).fillna(0).astype(int)

        # combined cat
        df["scholarship_visa_combo"] = (
            df["scholarship_received"].astype(str)
            + "_" + df["visa_status"].astype(str)
        )

    # clustering
    cols = ["gpa_or_score","test_score","study_duration"]
    kmeans = KMeans(n_clusters=5,random_state=42)
    X_train["num_cluster"] = kmeans.fit_predict(X_train[cols])
    X_test["num_cluster"]  = kmeans.predict(X_test[cols])

    # one-hot & align
    X_train = pd.get_dummies(X_train,drop_first=True)
    X_test  = pd.get_dummies(X_test, drop_first=True)
    X_train, X_test = X_train.align(X_test,join="left",axis=1,fill_value=0)

    return X_train, X_test


In [14]:
# Cell 3 (updated): Train & Save Multiple Models

import os
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import warnings

# suppress the convergence warning so it doesn’t spam your output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

models = {
    "randomforest":       RandomForestClassifier(random_state=42),
    "logisticregression": LogisticRegression(
                              solver="saga",
                              max_iter=5000,
                              random_state=42,
                              n_jobs=-1
                          ),
    "gradientboosting":   GradientBoostingClassifier(random_state=42),
    "svm":                SVC(probability=True, random_state=42),
    "knn":                KNeighborsClassifier()
}

os.makedirs("../models", exist_ok=True)

saved_models = {}
for name, clf in models.items():
    clf.fit(X_train_fe, y_train)
    path = f"../models/model_{name}.pkl"
    joblib.dump(clf, path)
    saved_models[name] = path

saved_models


{'randomforest': '../models/model_randomforest.pkl',
 'logisticregression': '../models/model_logisticregression.pkl',
 'gradientboosting': '../models/model_gradientboosting.pkl',
 'svm': '../models/model_svm.pkl',
 'knn': '../models/model_knn.pkl'}

In [None]:
# Cell 4: Evaluate All Saved Models

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Iterate over each saved model, compute metrics on the engineered test set
results = []
for name, path in saved_models.items():
    model = joblib.load(path)
    y_pred = model.predict(X_test_fe)
    results.append({
        "model":    name,
        "accuracy":  accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall":    recall_score(y_test, y_pred, zero_division=0),
        "f1_score":  f1_score(y_test, y_pred, zero_division=0)
    })

# Create a DataFrame for easy comparison
metrics_df = pd.DataFrame(results).set_index("model")
metrics_df


In [6]:
# Cell 5: Full Pipeline Execution

# 1) Load & preprocess
X_train, X_test, y_train, y_test = preprocess_data(
    "../data/global_student_migration.csv"
)

# 2) Feature engineering
X_train_fe, X_test_fe = engineer_features(X_train, X_test)

# 3) Train & save
model = train_model(
    X_train_fe, y_train,
    model_path="../models/model.pkl"
)

# 4) Evaluate & save metrics
metrics = evaluate_model(
    model, X_test_fe, y_test,
    metrics_path="../reports/metrics.txt"
)

# 5) Display results
print("Evaluation metrics:", metrics)
metrics


Evaluation metrics: {'accuracy': 0.49129353233830847, 'precision': 0.49166666666666664, 'recall': 0.4392059553349876, 'f1_score': 0.4639580602883355}


{'accuracy': 0.49129353233830847,
 'precision': 0.49166666666666664,
 'recall': 0.4392059553349876,
 'f1_score': 0.4639580602883355}