In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# Attach Google Drive to the Colab filesystem; the dataset path used by the
# loading cell lives under this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# The source file is semicolon-delimited and stored on the mounted Drive.
csv_path = '/content/drive/MyDrive/DATA MINING/datakelulusanmahasiswa.csv'
dataset = pd.read_csv(csv_path, sep=';')

In [None]:
# Trim leading/trailing whitespace from every column label so that later
# lookups by exact name (e.g. 'STATUS KELULUSAN') match.
dataset.columns = [label.strip() for label in dataset.columns]

# Quick confirmation of what was loaded: dimensions and the cleaned header.
n_rows_cols = dataset.shape
print("Dataset berhasil dimuat")
print("Jumlah baris & kolom:", n_rows_cols)
print("\n Nama kolom:")
print(list(dataset.columns))

Dataset berhasil dimuat
Jumlah baris & kolom: (379, 15)

 Nama kolom:
['NAMA', 'JENIS KELAMIN', 'STATUS MAHASISWA', 'UMUR', 'STATUS NIKAH', 'IPS 1', 'IPS 2', 'IPS 3', 'IPS 4', 'IPS 5', 'IPS 6', 'IPS 7', 'IPS 8', 'IPK', 'STATUS KELULUSAN']


In [None]:
# Show the first five rows (the default for head()) as a rich table.
# NOTE(review): the rendered preview includes the NAMA column; consider
# clearing this output before sharing the notebook.
first_rows = dataset.head(5)
display(first_rows)

Unnamed: 0,NAMA,JENIS KELAMIN,STATUS MAHASISWA,UMUR,STATUS NIKAH,IPS 1,IPS 2,IPS 3,IPS 4,IPS 5,IPS 6,IPS 7,IPS 8,IPK,STATUS KELULUSAN
0,ANIK WIDAYANTI,PEREMPUAN,BEKERJA,28,BELUM MENIKAH,276,28,32,317,298,3,303,0,307,TERLAMBAT
1,DWI HESTYNA PRIHASTANTY,PEREMPUAN,MAHASISWA,32,BELUM MENIKAH,3,33,314,314,284,313,325,0,317,TERLAMBAT
2,MURYA ARIEF BASUKI,PEREMPUAN,BEKERJA,29,BELUM MENIKAH,35,33,37,329,353,372,373,0,354,TERLAMBAT
3,NANIK SUSANTI,PEREMPUAN,MAHASISWA,27,BELUM MENIKAH,317,341,361,336,348,363,346,0,341,TERLAMBAT
4,RIFKA ISTIQFARINA,PEREMPUAN,BEKERJA,29,BELUM MENIKAH,29,289,33,285,298,3,308,0,309,TERLAMBAT


In [None]:
# Schema overview: dtype and non-null count for every column.
print("Info Dataset:")
dataset.info()

# Number of missing entries in each column.
missing_per_column = dataset.isna().sum()
print("\nJumlah Missing Value per Kolom:")
print(missing_per_column)

Info Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   NAMA              379 non-null    object
 1   JENIS KELAMIN     379 non-null    object
 2   STATUS MAHASISWA  379 non-null    object
 3   UMUR              379 non-null    int64 
 4   STATUS NIKAH      379 non-null    object
 5   IPS 1             379 non-null    object
 6   IPS 2             379 non-null    object
 7   IPS 3             379 non-null    object
 8   IPS 4             379 non-null    object
 9   IPS 5             379 non-null    object
 10  IPS 6             379 non-null    object
 11  IPS 7             379 non-null    object
 12  IPS 8             372 non-null    object
 13  IPK               376 non-null    object
 14  STATUS KELULUSAN  379 non-null    object
dtypes: int64(1), object(14)
memory usage: 44.5+ KB

Jumlah Missing Value per Kolom:
NAMA          

In [None]:
# The grade columns use a decimal comma and are therefore loaded as text.
# Parse them to float *before* selecting numeric columns; otherwise they are
# not part of `num_cols` and their missing values are never imputed.
grade_cols = ['IPS 1', 'IPS 2', 'IPS 3', 'IPS 4', 'IPS 5', 'IPS 6', 'IPS 7', 'IPS 8', 'IPK']
for col in grade_cols:
    # Only touch columns that exist and are still text, so re-running is safe.
    if col in dataset.columns and dataset[col].dtype == 'object':
        dataset[col] = dataset[col].str.replace(',', '.', regex=False).astype(float)

num_cols = dataset.select_dtypes(include=['number']).columns
if len(num_cols) > 0:
    # Replace every NaN in the numeric columns with that column's mean.
    # NOTE(review): the mean is computed on the full dataset, i.e. before the
    # train/test split further down — confirm this leakage is acceptable.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    dataset[num_cols] = imputer.fit_transform(dataset[num_cols])
    print("\nMissing value pada kolom numerik sudah diisi dengan nilai rata-rata.")
else:
    print("\nTidak ada kolom numerik untuk imputasi.")


Missing value pada kolom numerik sudah diisi dengan nilai rata-rata.


In [None]:
# Label column that is separated from the feature columns.
target_col = 'STATUS KELULUSAN'

# Fail early with a readable message when the label column is absent.
if target_col not in dataset.columns:
    raise KeyError("Kolom 'STATUS KELULUSAN' tidak ditemukan. Cek kembali nama kolom di dataset kamu!")

# y is the label series; X is every remaining column.
y = dataset[target_col]
X = dataset.drop(columns=[target_col])

In [None]:
# X is a DataFrame on the first run and a NumPy array once it has been encoded
# below; the guard lets this cell be re-run without failing on the array.
if isinstance(X, pd.DataFrame):
    # Grade columns use a decimal comma; turn the text into floats.
    cols_to_convert_to_float = ['IPS 1','IPS 2','IPS 3','IPS 4','IPS 5','IPS 6','IPS 7','IPS 8','IPK']
    for col in cols_to_convert_to_float:
        if col in X.columns and X[col].dtype == 'object':
            X[col] = X[col].str.replace(',', '.', regex=False).astype(float)

    # One-hot encode every column that is still non-numeric. Without this step
    # a fresh "Restart & Run All" reaches the scaling cell with string columns
    # and `X_train.astype(float)` raises.
    # NOTE(review): NAMA is encoded as well, giving one column per distinct
    # name; it looks like an identifier — confirm whether it should be dropped.
    categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist()
    if categorical_cols:
        ct = ColumnTransformer(
            transformers=[('onehot', OneHotEncoder(), categorical_cols)],
            remainder='passthrough',   # keep the numeric columns unchanged
            sparse_threshold=0,        # always return a dense array (StandardScaler cannot centre sparse input)
        )
        X = ct.fit_transform(X)
else:
    print("X sudah berbentuk array hasil encoding, bagian konversi IPS/IPK dilewati.")

X sudah berbentuk array hasil encoding, bagian konversi IPS/IPK dilewati.


In [None]:
# Turn the class labels in y into integer codes. The fitted encoder stays
# available as `le` (its learned classes are stored on the object).
le = LabelEncoder()
le.fit(y)
y = le.transform(y)


In [None]:
# Hold out 20% of the rows for testing; random_state=0 makes the shuffle repeatable.
split_kwargs = {'test_size': 0.2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Report how many rows ended up in each partition.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print("\nDataset dibagi menjadi data latih dan data uji.")
print("Jumlah data latih:", n_train)
print("Jumlah data uji:", n_test)


Dataset dibagi menjadi data latih dan data uji.
Jumlah data latih: 303
Jumlah data uji: 76


In [None]:
# Learn mean/variance from the training split only, then apply those same
# statistics to both splits so the test data does not influence the scaler.
train_features = X_train.astype(float)

sc = StandardScaler()
sc.fit(train_features)
X_train = sc.transform(train_features)
X_test = sc.transform(X_test.astype(float))
print("\nStandarisasi fitur selesai.")



Standarisasi fitur selesai.


In [None]:
# Final summary: dimensions of each of the four arrays produced by the pipeline.
print("\nHASIL AKHIR:")
shape_report = [
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
]
for label, arr in shape_report:
    print(label, arr.shape)

print("\nPreprocessing selesai tanpa error.")



HASIL AKHIR:
X_train shape: (303, 388)
X_test shape: (76, 388)
y_train shape: (303,)
y_test shape: (76,)

Preprocessing selesai tanpa error.
