In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
label_encoder = LabelEncoder()
standard_scaler = StandardScaler()

In [3]:
# Mount Google drive and copy kaggle.json to local disk

from google.colab import drive
drive.mount('mount')

!cp mount/MyDrive/kaggle.json .

Mounted at mount


In [4]:
!KAGGLE_CONFIG_DIR=$(pwd) kaggle datasets download rabieelkharoua/students-performance-dataset

Dataset URL: https://www.kaggle.com/datasets/rabieelkharoua/students-performance-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading students-performance-dataset.zip to /content
  0% 0.00/66.2k [00:00<?, ?B/s]
100% 66.2k/66.2k [00:00<00:00, 45.0MB/s]


In [5]:
!unzip -o students-performance-dataset.zip -d students-performance-dataset

Archive:  students-performance-dataset.zip
  inflating: students-performance-dataset/Student_performance_data _.csv  


In [6]:
student_performance_df_orig = pd.read_csv('students-performance-dataset/Student_performance_data _.csv')

In [7]:
student_performance_df = student_performance_df_orig.drop(['StudentID', 'GradeClass'], axis=1)
student_performance_df

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915
2,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602
3,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218
4,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509
2388,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150
2389,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333
2390,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297


In [8]:
!python3 -m pip install uv
!uv pip install catboost lightgbm xgboost --system

Collecting uv
  Downloading uv-0.5.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.5.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.5.4
[2mUsing Python 3.10.12 environment at /usr[0m
[2K[2mResolved [1m22 packages[0m [2min 460ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)
[2K[1A[37m⠙

In [19]:
df = student_performance_df_orig.copy()

# Create a simple 'group_id' column, for example we could group by a certain feature like 'Age' or manually
# We'll assume here that students with similar Age are in the same group (this is just for demonstration)
df['group_id'] = df['Age']

# Create ranks based on the GPA within each group (higher GPA should get a higher rank)
df['rank'] = df.groupby('group_id')['GPA'].rank(ascending=False, method='min')

min_rank = df['rank'].min()
max_rank = df['rank'].max()

scaled_rank = 30 * (df['rank'] - min_rank) / (max_rank - min_rank)
df['rank'] = scaled_rank
df['rank'] = df['rank'].astype(int)

df = df.sort_values(by='group_id')

# Define features (X) and target (y)
X = df.drop(columns=['GPA', 'rank', 'group_id'])  # Drop the target column 'GPA', 'rank' and 'group_id'
y = df['rank']

# Create a group array that defines the number of samples in each group
group = df.groupby('group_id').size().values  # This will give the group sizes

# Ensure the sum of group sizes matches the number of rows
assert np.sum(group) == len(df), f"Sum of group sizes {np.sum(group)} does not match data size {len(df)}"

In [20]:
import lightgbm as lgb

# Create a LightGBM Dataset
train_data = lgb.Dataset(X, label=y, group=group)

# Set parameters for ranking task
params = {
    'objective': 'lambdarank',  # LambdaRank for ranking
    'metric': 'ndcg',           # Use NDCG for evaluation
    'boosting_type': 'gbdt',    # Use Gradient Boosting Decision Trees
    'num_leaves': 31,           # Hyperparameter for leaf nodes in trees
    'learning_rate': 0.05,      # Learning rate
    'verbose': -1               # Suppress LightGBM output
}

# Train the model
num_round = 100
lightgbm_ranker = lgb.train(params, train_data, num_round)

y_pred = lightgbm_ranker.predict(X)

ranked_indices = np.argsort(y_pred)[::-1]  # Sort in descending order to get ranks

predicted_ranks = np.zeros_like(y_pred, dtype=int)
predicted_ranks[ranked_indices] = np.arange(1, len(y_pred) + 1)

print("Predicted Ranks:", predicted_ranks)
print("Predicted Ranks Length:", len(predicted_ranks))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Predicted Ranks: [1735 1484 1526 ...  282 2105 2100]
Predicted Ranks Length: 2392


In [21]:
import xgboost as xgb

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_group(group)  # Set the group size

params = {
    'objective': 'rank:pairwise',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': 0,
}

num_round = 100
xgboost_ranker = xgb.train(params, dtrain, num_round)

# Predict the ranks for the input data
y_pred = xgboost_ranker.predict(dtrain)

ranked_indices = np.argsort(y_pred)[::-1]  # Sort in descending order to get ranks

predicted_ranks = np.zeros_like(y_pred, dtype=int)
predicted_ranks[ranked_indices] = np.arange(1, len(y_pred) + 1)

print("Predicted Ranks:", predicted_ranks)
print("Predicted Ranks Length:", len(predicted_ranks))

Parameters: { "boosting_type", "metric", "num_leaves", "verbose" } are not used.



Predicted Ranks: [1927 1338 1788 ...  311 1941 2392]
Predicted Ranks Length: 2392


In [22]:
import catboost as cb

# Create CatBoost Pool
train_data = cb.Pool(X, label=y, group_id=df['group_id'])

# Set parameters for ranking task
params = {
    'objective': 'YetiRank',    # Rank objective (for ranking tasks)
    'eval_metric': 'NDCG',       # Use NDCG metric for evaluation
    'learning_rate': 0.05,  # Learning rate
    'iterations': 100,      # Number of boosting iterations
    'depth': 6,             # Depth of trees
    'verbose': 0            # Suppress CatBoost output
}

# Train the model
model = cb.train(train_data, params)

# Predict the ranks for the input data
y_pred = model.predict(X)

# Optionally convert the predicted scores to ranks (if needed)
ranked_indices = np.argsort(y_pred)[::-1]  # Sort in descending order to get ranks

# Now, we create the predicted ranks
predicted_ranks = np.zeros_like(y_pred, dtype=int)
predicted_ranks[ranked_indices] = np.arange(1, len(y_pred) + 1)

print("Predicted Ranks:", predicted_ranks)
print("Predicted Ranks Length:", len(predicted_ranks))

Predicted Ranks: [1292  965 1928 ...  346 2125 2241]
Predicted Ranks Length: 2392
