<h1>Import Library</h1>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

<h1>Baca Dataset</h1>

In [2]:
data_path = 'content/cleaned_data_kelulusan.csv'
df = pd.read_csv(data_path)

df.head()

Unnamed: 0,nim,nama,jk,tahun_masuk_d4,masa_studi,ips1,ips2,ips3,ips4,ips5,ips6,ips7,ipk8,status_cuti,lulus_tepat_waktu
0,13090001,IRFAN NUR YORITA,L,2013,4 tahun,3.57,3.75,3.29,3.59,3.75,3.5,3.62,3.6,Tidak,1
1,13090002,ARUM FATMAWATI,P,2013,4 tahun,3.48,3.75,3.43,3.73,3.85,3.7,3.85,3.66,Tidak,1
2,13090004,FAJAR RASIA ABADI,L,2013,4 tahun,3.48,3.65,3.62,3.77,3.75,3.15,4.0,3.63,Tidak,1
3,13090005,KHUSNUL MA'MUROH,P,2013,4 tahun,3.33,3.7,3.71,3.64,3.75,3.6,3.69,3.65,Tidak,1
4,13090007,IMAM MUNANDAR PRATAMA,L,2013,4 tahun,2.71,3.15,3.05,3.27,2.6,2.75,3.31,2.98,Tidak,1


<h1>Buang Kolom yang Tidak Diperlukan</h1>

In [3]:
df.drop(['nim', 'nama'], axis=1, inplace=True)

# Tampilkan nama kolom yang tersisa
df.columns

Index(['jk', 'tahun_masuk_d4', 'masa_studi', 'ips1', 'ips2', 'ips3', 'ips4',
       'ips5', 'ips6', 'ips7', 'ipk8', 'status_cuti', 'lulus_tepat_waktu'],
      dtype='object')

<h1>Preprocessing Kolom 'masa_studi'</h1>

In [4]:
if 'masa_studi' in df.columns:
    df['masa_studi'] = df['masa_studi'].astype(str).str.replace(' tahun', '', regex=False)
    df['masa_studi'] = pd.to_numeric(df['masa_studi'], errors='coerce')

# Tampilkan hasil setelah preprocessing
df['masa_studi'].head()


0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
Name: masa_studi, dtype: float64

<h1>Tentukan Fitur dan Target</h1>

In [5]:
feature_cols = ['ips1', 'ips2', 'ips3', 'ips4', 'ips5', 'ips6', 'ips7', 'status_cuti']

target_col = 'lulus_tepat_waktu'

X = df[feature_cols].copy()
y = df[target_col].copy()

print("Shape X:", X.shape)
print("Shape y:", y.shape)


Shape X: (663, 8)
Shape y: (663,)


<h1>Tangani Missing Values</h1>

In [6]:
from sklearn.impute import SimpleImputer

categorical_cols = ['status_cuti']
numerical_cols = ['ips1', 'ips2', 'ips3', 'ips4', 'ips5', 'ips6', 'ips7']

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

X.head()


Unnamed: 0,ips1,ips2,ips3,ips4,ips5,ips6,ips7,status_cuti
0,3.57,3.75,3.29,3.59,3.75,3.5,3.62,Tidak
1,3.48,3.75,3.43,3.73,3.85,3.7,3.85,Tidak
2,3.48,3.65,3.62,3.77,3.75,3.15,4.0,Tidak
3,3.33,3.7,3.71,3.64,3.75,3.6,3.69,Tidak
4,2.71,3.15,3.05,3.27,2.6,2.75,3.31,Tidak


<h1>Encoding Kolom Kategoris</h1>

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

X.head()


Unnamed: 0,ips1,ips2,ips3,ips4,ips5,ips6,ips7,status_cuti
0,3.57,3.75,3.29,3.59,3.75,3.5,3.62,1
1,3.48,3.75,3.43,3.73,3.85,3.7,3.85,1
2,3.48,3.65,3.62,3.77,3.75,3.15,4.0,1
3,3.33,3.7,3.71,3.64,3.75,3.6,3.69,1
4,2.71,3.15,3.05,3.27,2.6,2.75,3.31,1


<h1>Split Data</h1>

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data training:", X_train.shape)
print("Data testing:", X_test.shape)


Data training: (530, 8)
Data testing: (133, 8)


<h1>Scaling Kolom Numerik</h1>

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tampilkan beberapa data setelah scaling
pd.DataFrame(X_train_scaled).head()


Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.321202,-1.859088,-0.760437,-1.560383,-0.588412,-1.137743,-2.794786,0.532253
1,-0.042353,0.927835,1.085891,0.678573,0.980395,-0.2375,-0.768257,0.532253
2,-2.633705,-2.277126,-1.480192,-1.130994,-0.845593,-1.243654,-1.547691,-1.878807
3,-0.210623,0.37045,-1.073374,0.249184,1.340449,0.768653,1.04003,0.532253
4,0.193224,-0.186934,0.14708,-0.45624,-0.356949,0.397965,1.04003,0.532253


<h1>Buat dan Latih Model SVM</h1>

In [10]:
from sklearn.model_selection import cross_val_score

svm_model = SVC(kernel='rbf', C=1.0, gamma='auto', random_state=42)

cross_val_scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=5)

print("Cross-validation scores (5 folds):", cross_val_scores)
print("Mean cross-validation score:", cross_val_scores.mean())

svm_model.fit(X_train_scaled, y_train)

print("Model SVM telah dilatih.")


Cross-validation scores (5 folds): [1.         1.         1.         1.         0.99056604]
Mean cross-validation score: 0.9981132075471699
Model SVM telah dilatih.


<h1>Evaluasi Model</h1>

In [11]:
y_pred = svm_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9924812030075187
Confusion Matrix:
 [[ 24   1]
 [  0 108]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        25
           1       0.99      1.00      1.00       108

    accuracy                           0.99       133
   macro avg       1.00      0.98      0.99       133
weighted avg       0.99      0.99      0.99       133



<h1>Simpan Model dan Scaler</h1>

In [12]:
joblib.dump(svm_model, 'model_trained/svm_model.pkl')
joblib.dump(scaler, 'model_trained/scaler.pkl')

print("Model dan scaler berhasil disimpan!")


Model dan scaler berhasil disimpan!
