In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import joblib
import os


# -------------------------------------------
# 1. Load the data from CSV
# -------------------------------------------
# The path is relative to the current working directory.
data_path = 'content/cleaned_data_kelulusanTI.csv'
df = pd.read_csv(data_path)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
# -------------------------------------------
# 2. Build the target column 'lulus_tepat_waktu'
# -------------------------------------------
# Rule: 'Masa Studi' <= 4 counts as graduating on time (1); anything else is 0.
# A missing 'Masa Studi' compares False and is therefore also labelled 0.
study_within_limit = df['Masa Studi'].le(4)
df['lulus_tepat_waktu'] = study_within_limit.astype(int)

# -------------------------------------------
# 3. Label-encode every object-dtype column
# -------------------------------------------
# One encoder is fitted per column and kept in `label_encoders` (keyed by
# column name). Refitting a single shared encoder would discard every mapping
# except the last one, making it impossible to invert the codes or to encode
# new rows the same way later.
label_encoders = {}
le = LabelEncoder()  # stays defined even when no object column exists
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
# -------------------------------------------
# 4. Select the model features and the label
# -------------------------------------------
# Features: the columns IPS1..IPS7 plus 'Status Bekerja' and
# 'Keikutsertaan Organisasi'.
feature_columns = [f'IPS{idx}' for idx in range(1, 8)] + [
    'Status Bekerja',
    'Keikutsertaan Organisasi',
]

X = df[feature_columns]
# NOTE(review): the label comes from the existing 'status_kelulusan' column,
# not from the 'lulus_tepat_waktu' column derived in step 2 -- confirm intended.
y = df['status_kelulusan']

# Per-feature minimum among rows with lulus_tepat_waktu == 1.
# NOTE: this is the plain column minimum, not the first quartile (Q1), and it
# covers every entry of feature_columns, not only the IPS columns.
on_time_mask = df['lulus_tepat_waktu'].eq(1)
df_lulus = df[on_time_mask]
min_ips_lulus = df_lulus[feature_columns].min().to_dict()

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Scaling: the statistics are learned from the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a depth-limited decision tree on the scaled training data.
dtr = DecisionTreeClassifier(max_depth=4, random_state=42).fit(
    X_train_scaled,
    y_train,
)

# Predict on the held-out split and report accuracy, the confusion matrix and
# the classification report, in that order.
y_pred = dtr.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
print("Akurasi:", test_accuracy)

test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)


# Persist the trained model, the fitted scaler and the min_ips_lulus dict
# under model_trained/ (the directory is created if it does not exist).
os.makedirs("model_trained", exist_ok=True)
for artifact, target_path in (
    (dtr, "model_trained/decision_tree_model.pkl"),
    (scaler, "model_trained/scaler.pkl"),
    (min_ips_lulus, "model_trained/min_ips_lulus.pkl"),
):
    joblib.dump(artifact, target_path)
print("Model, scaler, dan batas IPS berhasil disimpan.")

  Asal Sekolah Jurusan  Masa Studi  Usia pada saat Lulus D4  IPS1  IPS2  IPS3  \
0          NaN     NaN         4.0                     23.0  3.57  3.75  3.29   
1          NaN     NaN         4.0                     23.0  3.48  3.75  3.43   
2          NaN     NaN         4.0                     31.0  3.48  3.65  3.62   
3          NaN     NaN         4.0                     24.0  3.33  3.70  3.71   
4          NaN     NaN         4.0                     22.0  2.71  3.15  3.05   

   IPS4  IPS5  IPS6  IPS7  Status Cuti  Status Bekerja  \
0  3.59  3.75  3.50  3.62            0             NaN   
1  3.73  3.85  3.70  3.85            0             NaN   
2  3.77  3.75  3.15  4.00            0             1.0   
3  3.64  3.75  3.60  3.69            0             NaN   
4  3.27  2.60  2.75  3.31            0             NaN   

   Keikutsertaan Organisasi  status_kelulusan  
0                       0.0                 1  
1                       1.0                 1  
2                   