In [None]:
# 1. Setup
!pip install ucimlrepo xgboost
import os, pickle, pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score,
                             recall_score, f1_score, matthews_corrcoef)

# 2. Create directory
if not os.path.exists('model'): os.makedirs('model')

# 3. Data Prep
dataset = fetch_ucirepo(id=697)
X, y = dataset.data.features, dataset.data.targets
X = X.fillna(X.median())

le = LabelEncoder()
y_encoded = le.fit_transform(y.values.ravel())
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save transformers for Streamlit app
with open('model/scaler.pkl', 'wb') as f: pickle.dump(scaler, f)
with open('model/encoder.pkl', 'wb') as f: pickle.dump(le, f)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42
)

# 4. Train 6 Models
models = {
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Decision_Tree": DecisionTreeClassifier(max_depth=10),
    "kNN": KNeighborsClassifier(),
    "Naive_Bayes": GaussianNB(),
    "Random_Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    with open(f'model/{name}.pkl', 'wb') as f: pickle.dump(model, f)

    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr') if y_prob.shape[1] > 2 else roc_auc_score(y_test, y_prob[:, 1])
    else:
        auc = 0.0

    # Calculate 6 required metrics
    results.append({
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": auc,
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1": f1_score(y_test, y_pred, average='weighted'),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

# 5. Display Table for README
df_results = pd.DataFrame(results)
print("\n--- FINAL COMPARISON TABLE FOR README ---")
print(df_results.to_string(index=False))

# 6. EXPORT TEST CSV FOR STREAMLIT UPLOAD
# Use inverse transform to get original values for the CSV
test_data_export = pd.DataFrame(scaler.inverse_transform(X_test), columns=X.columns)
test_data_export['Target'] = le.inverse_transform(y_test)
test_data_export.to_csv('test_data.csv', index=False)
print("\nSuccess! 'test_data.csv' created for Streamlit upload.")

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7

--- FINAL COMPARISON TABLE FOR README ---
      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic_Regression  0.752542 0.869383   0.734083 0.752542 0.736308 0.593230
      Decision_Tree  0.716384 0.768441   0.710379 0.716384 0.708616 0.537111
                kNN  0.698305 0.791192   0.689086 0.698305 0.687903 0.504966
        Naive_Bayes  0.692655 0.801522   0.680850 0.692655 0.679823 0.494250
      Random_Forest  0.766102 0.875226   0.750497 0.766102 0.749871 0.616348
            XGBoost  0.763842 0.875032   0.754142 0.763842 0.755302 0.613874

Success! 'test_data.csv' created for Streamlit upload.


In [None]:
# Extract high-level metadata for README section b
print("--- DATASET DESCRIPTION (FOR README) ---")
print(f"Dataset Name: {dataset.metadata.name}")
print(f"Problem Type: {dataset.metadata.additional_info.get('variable_info', 'Classification')}")
print(f"Number of Instances: {dataset.data.features.shape[0]}")
print(f"Number of Features: {dataset.data.features.shape[1]}")
print(f"Target Variable: {dataset.data.targets.columns[0]}")
print(f"Class Distribution:\n{y.value_counts()}")
print("-" * 40)

--- DATASET DESCRIPTION (FOR README) ---
Dataset Name: Predict Students' Dropout and Academic Success
Problem Type: None
Number of Instances: 4424
Number of Features: 36
Target Variable: Target
Class Distribution:
Target  
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64
----------------------------------------
