In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

from xgboost import XGBClassifier

import joblib, json, os


In [2]:
df = pd.read_csv("../data/processed_attendance.csv")
df


Unnamed: 0,student_id,date,subject,present,dept,semester,total_classes,total_present,overall_att,last7,last30,streak,week,trend,label
0,102,2024-01-10,Math,0,CSE,4,10,3,0.3,0.285714,0.3,1,2,0.047619,1
1,101,2024-01-10,Math,1,CSE,4,10,8,0.8,0.857143,0.8,0,2,0.285714,0
2,103,2024-01-10,Math,1,CSE,4,10,9,0.9,0.857143,0.9,0,2,-0.333333,0


In [3]:
feature_cols = ["overall_att", "last7", "last30", "streak", "trend"]

X = df[feature_cols].fillna(0)
y = df["label"]

X.head(), y.value_counts()


(   overall_att     last7  last30  streak     trend
 0          0.3  0.285714     0.3       1  0.047619
 1          0.8  0.857143     0.8       0  0.285714
 2          0.9  0.857143     0.9       0 -0.333333,
 label
 0    2
 1    1
 Name: count, dtype: int64)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,      # 30% test (change if you want)
    stratify=y,         # class ratio same rahe
    random_state=42
)

X_train.shape, X_test.shape


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,      # ~1 student test, 2 train
    random_state=42      # reproducible
    # stratify=y   # <-- isko hata diya
)

X_train.shape, X_test.shape, y_train, y_test


((2, 5),
 (1, 5),
 1    0
 2    0
 Name: label, dtype: int64,
 0    1
 Name: label, dtype: int64)

In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()

print("Safe (0):", num_neg)
print("Risk (1):", num_pos)

scale_pos_weight = num_neg / num_pos if num_pos > 0 else 1.0
scale_pos_weight


Safe (0): 2
Risk (1): 0


1.0

In [8]:
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)


XGBoostError: [02:11:10] C:\actions-runner\_work\xgboost\xgboost\src\objective\regression_obj.cu:119: Check failed: is_valid: base_score must be in (0,1) for the logistic loss.

In [9]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

log_model.fit(X_train_scaled, y_train)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(0)

In [10]:
# For tiny dummy data, use full dataset for training
X_train = X.copy()
y_train = y.copy()

# We'll still keep X_test, y_test same as train (just for demo)
X_test = X.copy()
y_test = y.copy()

X_train.shape, X_test.shape, y_train.value_counts()


((3, 5),
 (3, 5),
 label
 0    2
 1    1
 Name: count, dtype: int64)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = log_model.predict(X_test_scaled)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=3))


Confusion Matrix:
[[2 0]
 [0 1]]

Classification Report:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         2
           1      1.000     1.000     1.000         1

    accuracy                          1.000         3
   macro avg      1.000     1.000     1.000         3
weighted avg      1.000     1.000     1.000         3



In [14]:
import os, joblib, json

MODEL_DIR = "../model_artifacts"
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(log_model, os.path.join(MODEL_DIR, "attendance_log_model.pkl"))
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))

with open(os.path.join(MODEL_DIR, "feature_cols.json"), "w") as f:
    json.dump(feature_cols, f)

print("Saved:", os.listdir(MODEL_DIR))


Saved: ['attendance_log_model.pkl', 'feature_cols.json', 'scaler.pkl']


In [15]:
model_artifacts ready

SyntaxError: invalid syntax (1634751334.py, line 1)