# Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Data Card

# Load Por Dataset

In [2]:
# Load the Portuguese-course dataset; this file is semicolon-separated.
# NOTE(review): absolute, user-specific path — the cell only runs on a machine
# that has this exact folder; consider a configurable data directory.
por_df = pd.read_csv(r"C:\Users\tsarget\OneDrive\Desktop\student+performance\archive\student-por.csv", sep=";")
print("Shape: ", por_df.shape, "\n")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
por_df.info()         # non-null counts reveal missing values per column
por_df.head()

Shape:  (649, 33) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    ob

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [3]:
# Summary statistics of the numeric columns of the Portuguese dataset,
# used as a quick sanity check of the value ranges.
por_df.describe()    # numeric columns look consistent (see min/max rows of the output)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


# Load Math Dataset

In [4]:
# Load the Math-course dataset.
# NOTE(review): absolute, user-specific path — the cell only runs on a machine
# that has this exact folder; consider a configurable data directory.
# NOTE(review): unlike the Portuguese file, this one is read with the default
# ',' separator — confirm that is intended for student_math.csv.
math_df = pd.read_csv(r"C:\Users\tsarget\OneDrive\Desktop\student+performance\archive\student_math.csv")
print("Shape: ", math_df.shape, "\n")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
math_df.info()         # non-null counts reveal missing values per column
math_df.head()

Shape:  (395, 33) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    ob

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
# Summary statistics of the numeric columns of the Math dataset,
# used as a quick sanity check of the value ranges.
math_df.describe()    # numeric columns look consistent (see min/max rows of the output)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


# Preprocessing

### Check Duplicates

In [6]:
# Count rows that are exact duplicates of an earlier row, per dataset.
por_duplicate_count = por_df.duplicated().sum()
print("Duplicates in por data: ", por_duplicate_count)

math_duplicate_count = math_df.duplicated().sum()
print("Duplicates in math data: ", math_duplicate_count)

Duplicates in por data:  0
Duplicates in math data:  0


### Drop unnecessary columns

In [7]:
# Remove the 'school' column from both datasets.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because 'school' has already been dropped.
por_df = por_df.drop(columns=['school'], errors="ignore")
math_df = math_df.drop(columns=['school'], errors="ignore")

### Add G3_risk column

In [8]:
def risk_category(g3):
    """Translate a final grade into a risk label.

    Parameters
    ----------
    g3 : number
        Final grade (G3).

    Returns
    -------
    str
        'High Risk' for grades below 10, 'Medium Risk' for grades from 10
        up to and including 13, 'Low Risk' for everything else.
    """
    # Guard clauses, checked from the lowest band upwards.
    if g3 < 10:
        return 'High Risk'
    if g3 <= 13:
        return 'Medium Risk'
    return 'Low Risk'

# Derive the risk label from the final grade G3 in both datasets.
for grades_df in (por_df, math_df):
    grades_df['G3_risk'] = grades_df['G3'].apply(risk_category)

### Label Encoding

In [9]:
# Mapping dictionaries: string category -> integer code, per column
mappings = {
    "sex": {"M": 1, "F": 0},
    "address": {"U": 1, "R": 0},
    "famsize": {"LE3": 0, "GT3": 1},
    "Pstatus": {"T": 1, "A": 0},
    "schoolsup": {"yes": 1, "no": 0},
    "famsup": {"yes": 1, "no": 0},
    "paid": {"yes": 1, "no": 0},
    "activities": {"yes": 1, "no": 0},
    "nursery": {"yes": 1, "no": 0},
    "higher": {"yes": 1, "no": 0},
    "internet": {"yes": 1, "no": 0},
    "romantic": {"yes": 1, "no": 0},
    "G3_risk": {"Low Risk": 0, "Medium Risk": 1, "High Risk": 2}
}


def _label_encode(df, column_maps, dataset_name):
    """Replace the categories of every mapped column of ``df`` with their codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are encoded; it is modified in place.
    column_maps : dict
        ``{column name: {category: code}}``.
    dataset_name : str
        Label used in the error message only.

    Raises
    ------
    ValueError
        If a column holds a value that has no entry in its mapping. A plain
        ``Series.map`` would silently turn such values into NaN.
    """
    for col, mapping in column_maps.items():
        column = df[col]
        # Already encoded (cell executed before): mapping again would turn
        # every value into NaN, so leave the column as it is.
        if column.isin(list(mapping.values())).all():
            continue
        encoded = column.map(mapping)
        unmapped = column[encoded.isna()]
        if not unmapped.empty:
            raise ValueError(
                f"{dataset_name}: column {col!r} contains values without a mapping: "
                f"{sorted(unmapped.astype(str).unique())}"
            )
        df[col] = encoded


# Apply encoding for por data set
_label_encode(por_df, mappings, "por_df")

# Apply encoding for math data set
_label_encode(math_df, mappings, "math_df")

### One-hot Encoding

In [10]:
# Columns to one-hot encode
onehot_cols = ['Mjob', 'Fjob', 'reason', 'guardian']


def _append_dummies(df, cols):
    """Return ``df`` with one-hot indicator columns for ``cols`` appended.

    The original categorical columns are kept. Indicator columns that already
    exist in ``df`` are not appended again, so executing the cell a second
    time does not create duplicated column names.
    """
    dummies = pd.get_dummies(df[cols], prefix=cols)
    new_columns = [name for name in dummies.columns if name not in df.columns]
    return pd.concat([df, dummies[new_columns]], axis=1)


# One-hot encode and concat to the same DataFrame
por_df = _append_dummies(por_df, onehot_cols)

# Same for math dataset
math_df = _append_dummies(math_df, onehot_cols)

print("Portuguese df shape:", por_df.shape)
print("Math df shape:", math_df.shape)

Portuguese df shape: (649, 50)
Math df shape: (395, 50)


# Feature Engineering

## Create potentially useful features (we may need them later)

In [11]:
# Mean of the first- and second-period grades, added to both datasets.
for grades_df in (por_df, math_df):
    grades_df['G1_G2_avg'] = (grades_df['G1'] + grades_df['G2']) / 2

## Correlation

In [12]:
# Numeric columns to correlate: background / habit features first, grades last.
grade_cols = ["G1", "G2", "G1_G2_avg", "G3"]
cols = [
    "age",
    "Medu", "Fedu",
    "traveltime", "studytime", "failures",
    "famrel", "freetime", "goout",
    "Dalc", "Walc",
    "health", "absences",
] + grade_cols

# Pairwise correlation of the selected columns (Portuguese dataset)
corr_matrix = por_df.loc[:, cols].corr()

# Display
corr_matrix

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G1_G2_avg,G3
age,1.0,-0.107832,-0.12105,0.03449,-0.008415,0.319968,-0.020559,-0.00491,0.112805,0.134768,0.086357,-0.00875,0.149998,-0.174322,-0.107119,-0.144686,-0.106505
Medu,-0.107832,1.0,0.647477,-0.265079,0.097006,-0.17221,0.024421,-0.019686,0.009536,-0.007018,-0.019766,0.004614,-0.008577,0.260472,0.264035,0.271627,0.240151
Fedu,-0.12105,0.647477,1.0,-0.208288,0.0504,-0.165915,0.020256,0.006841,0.02769,6.1e-05,0.038445,0.04491,0.029859,0.217501,0.225139,0.229302,0.2118
traveltime,0.03449,-0.265079,-0.208288,1.0,-0.063154,0.09773,-0.009521,0.000937,0.057454,0.092824,0.057007,-0.048261,-0.008149,-0.15412,-0.154489,-0.159793,-0.127173
studytime,-0.008415,0.097006,0.0504,-0.063154,1.0,-0.147441,-0.004127,-0.068829,-0.075442,-0.137585,-0.214925,-0.056433,-0.118389,0.260875,0.240498,0.25928,0.249789
failures,0.319968,-0.17221,-0.165915,0.09773,-0.147441,1.0,-0.062645,0.108995,0.045078,0.105949,0.082266,0.035588,0.122779,-0.38421,-0.385782,-0.398701,-0.393316
famrel,-0.020559,0.024421,0.020256,-0.009521,-0.004127,-0.062645,1.0,0.129216,0.089707,-0.075767,-0.093511,0.109559,-0.089534,0.048795,0.089588,0.072278,0.063361
freetime,-0.00491,-0.019686,0.006841,0.000937,-0.068829,0.108995,0.129216,1.0,0.346352,0.109904,0.120244,0.084526,-0.018716,-0.094497,-0.106678,-0.104349,-0.122705
goout,0.112805,0.009536,0.02769,0.057454,-0.075442,0.045078,0.089707,0.346352,1.0,0.245126,0.38868,-0.015741,0.085374,-0.074053,-0.079469,-0.079572,-0.087641
Dalc,0.134768,-0.007018,6.1e-05,0.092824,-0.137585,0.105949,-0.075767,0.109904,0.245126,1.0,0.616561,0.059067,0.172952,-0.195171,-0.18948,-0.199072,-0.204719


## Chi-square (χ²) test

###### Math Dataset

In [13]:
# Columns left out of the test: continuous features, grades and the target itself
exclude_cols = ['age', 'failures', 'absences', 'G1', 'G2', 'G3', 'G1_G2_avg', 'G3_risk']
cols = [name for name in math_df.columns if name not in exclude_cols]

# Test every remaining column against G3_risk and keep the significant ones.
results = []
for feature in cols:
    contingency = pd.crosstab(math_df[feature], math_df['G3_risk'])
    statistic, p_value, _dof, _expected = chi2_contingency(contingency)
    if p_value < 0.05:  # keep only significant
        results.append((feature, statistic, p_value))

# Collect into DataFrame
chi2_df = pd.DataFrame(results, columns=['Feature', 'Chi2', 'p-value'])

print("Features significantly related to G3_risk:")
chi2_df

Features significantly related to G3_risk:


Unnamed: 0,Feature,Chi2,p-value
0,address,6.657565,0.035837
1,Medu,33.831965,4.4e-05
2,Mjob,15.765301,0.045865
3,studytime,13.131051,0.041001
4,schoolsup,12.115411,0.00234
5,higher,12.155626,0.002293
6,goout,17.474618,0.025529
7,Dalc,17.430046,0.02593
8,Mjob_health,7.284744,0.02619
9,Fjob_teacher,11.683189,0.002904


###### Por Dataset

In [14]:
# Columns left out of the test: continuous features, grades and the target itself
exclude_cols = ['age', 'failures', 'absences', 'G1', 'G2', 'G3', 'G1_G2_avg', 'G3_risk']
cols = [name for name in por_df.columns if name not in exclude_cols]

# Test every remaining column against G3_risk and keep the significant ones.
results = []
for feature in cols:
    contingency = pd.crosstab(por_df[feature], por_df['G3_risk'])
    statistic, p_value, _dof, _expected = chi2_contingency(contingency)
    if p_value < 0.05:  # keep only significant
        results.append((feature, statistic, p_value))

# Collect into DataFrame
chi2_df = pd.DataFrame(results, columns=['Feature', 'Chi2', 'p-value'])

print("Features significantly related to G3_risk:")
chi2_df

Features significantly related to G3_risk:


Unnamed: 0,Feature,Chi2,p-value
0,sex,8.200132,0.01657158
1,address,14.232568,0.0008117778
2,Medu,47.984697,9.946303e-08
3,Fedu,34.950977,2.729387e-05
4,Mjob,31.783493,0.0001018346
5,reason,29.874201,4.153315e-05
6,studytime,39.405307,5.96047e-07
7,schoolsup,20.414454,3.690265e-05
8,paid,6.299846,0.04285542
9,higher,73.778138,9.534124000000001e-17


## Data Splitting

In [15]:
# Name of the label column predicted by the models.
target = "G3_risk"

# Categorical columns that are frequency-encoded in the modelling cells.
freq_encode_cols = ['Mjob', 'Fjob', 'reason', 'guardian']

# Columns listed for scaling: same order as before, with the frequency-encoded
# columns placed between the parents' education and the travel time.
scaling_features = (
    ['age', 'Medu', 'Fedu']
    + freq_encode_cols
    + ['traveltime', 'studytime', 'failures',
       'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
       'health', 'absences', 'G1', 'G2', 'G1_G2_avg']
)

por_df.columns

Index(['sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob',
       'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures',
       'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
       'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
       'health', 'absences', 'G1', 'G2', 'G3', 'G3_risk', 'Mjob_at_home',
       'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher',
       'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services',
       'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other',
       'reason_reputation', 'guardian_father', 'guardian_mother',
       'guardian_other', 'G1_G2_avg'],
      dtype='object')

###### Math Dataset

In [16]:
# Every column except the label and the raw final grade it is derived from.
math_features = [name for name in math_df.columns if name not in (target, 'G3')]

X_math = math_df.loc[:, math_features]
y_math = math_df.loc[:, target]

# 80/20 split, stratified on the label, fixed seed.
x_math_train, x_math_test, y_math_train, y_math_test = train_test_split(
    X_math,
    y_math,
    stratify=y_math,
    test_size=0.2,
    random_state=42,
)

###### Por Dataset

In [17]:
# Every column except the label and the raw final grade it is derived from.
por_features = [name for name in por_df.columns if name not in (target, 'G3')]

X_por = por_df.loc[:, por_features]
y_por = por_df.loc[:, target]

# 80/20 split, stratified on the label, fixed seed.
x_por_train, x_por_test, y_por_train, y_por_test = train_test_split(
    X_por,
    y_por,
    stratify=y_por,
    test_size=0.2,
    random_state=42,
)

# Model Selection

In [22]:
# 2. Frequency Encoding (on copies)
# -------------------------
# The encoded values are written to copies of the split frames. Overwriting
# x_math_train / x_math_test in place made this cell non-idempotent, handed
# already-encoded data to any later cell that encodes the same columns again,
# and assigned into frames produced by slicing (SettingWithCopyWarning).
encoder = ce.CountEncoder(cols=freq_encode_cols, normalize=True)

x_math_train_enc = x_math_train.copy()
x_math_test_enc = x_math_test.copy()

# Fit on the training split only; the test split is transformed with the
# statistics learned from the training split.
x_math_train_enc[freq_encode_cols] = encoder.fit_transform(x_math_train[freq_encode_cols])
x_math_test_enc[freq_encode_cols] = encoder.transform(x_math_test[freq_encode_cols])

# -------------------------
# 3. Scaling (apply to ALL features, will be used only for linear models)
# -------------------------
scaler = MinMaxScaler()

x_math_train_scaled = pd.DataFrame(
    scaler.fit_transform(x_math_train_enc),
    columns=x_math_train_enc.columns,
    index=x_math_train_enc.index
)

x_math_test_scaled = pd.DataFrame(
    scaler.transform(x_math_test_enc),
    columns=x_math_test_enc.columns,
    index=x_math_test_enc.index
)

# -------------------------
# 4. Choose features (optional filter)
# -------------------------
linear_features = x_math_train_scaled.columns.tolist()  # all features
tree_features = x_math_train_enc.columns.tolist()       # all features

# -------------------------
# 5. Models
# -------------------------
models_linear = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "SVM": SVC(kernel="linear", random_state=42)
}

models_tree = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# -------------------------
# 6. Train & Evaluate
# -------------------------
print("🔹 Linear Models (scaled data)")
for name, model in models_linear.items():
    model.fit(x_math_train_scaled[linear_features], y_math_train)
    preds = model.predict(x_math_test_scaled[linear_features])
    print(f"{name}: {accuracy_score(y_math_test, preds):.4f}")

print("\n🔹 Tree Models (encoded only, not scaled)")
for name, model in models_tree.items():
    model.fit(x_math_train_enc[tree_features], y_math_train)
    preds = model.predict(x_math_test_enc[tree_features])
    print(f"{name}: {accuracy_score(y_math_test, preds):.4f}")


Decision Tree: 0.8354
Random Forest: 0.8354
Logistic Regression: 0.7848
SVM: 0.6835


In [23]:

# -------------------------------
# Candidate classifiers
# -------------------------------
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "SVM": SVC(random_state=42)
}

# -------------------------------
# Fit one encode -> scale -> classify pipeline per model and report accuracy
# -------------------------------
# NOTE(review): the "encode" step frequency-encodes freq_encode_cols itself; if
# x_math_train / x_math_test were already encoded in place by an earlier cell,
# those columns are encoded a second time here — confirm the inputs are raw.
for name, model in models.items():
    steps = [
        ("encode", ce.CountEncoder(cols=freq_encode_cols, normalize=True)),
        ("scale", MinMaxScaler()),
        ("classifier", model),
    ]
    pipe = Pipeline(steps=steps)

    pipe.fit(x_math_train, y_math_train)
    preds = pipe.predict(x_math_test)

    acc = accuracy_score(y_math_test, preds)
    print(f"{name}: {acc:.4f}")

Decision Tree: 0.8608
Random Forest: 0.8481
Logistic Regression: 0.7848
SVM: 0.6835
