# Orbit Future Academy
Ahmad Ramadhan\
2020410012\
ID Kegiatan Kampus  Merdeka: 9338023\
Kelas Code

python 3.11.9

In [71]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Data Processing

In [35]:
# Path to the raw payment spreadsheet, relative to the notebook's working directory.
DATASET_PATH = 'dataset/02_payment_20170101_20191231.xlsx'

# Load the spreadsheet into the working DataFrame used by the preprocessing cells.
df = pd.read_excel(io=DATASET_PATH)

Mengecek jumlah baris duplikat

In [36]:
# Count the rows that repeat an earlier row in every column.
df.duplicated().sum()

15

Menghapus baris duplikat

In [37]:
# Discard repeated rows; the first occurrence of each row is the one that is kept
# (this is the default behaviour of drop_duplicates).
df = df.drop_duplicates()

# Re-count duplicates as a sanity check; the cell output should now be 0.
df.duplicated().sum()

0

Menghapus currency USD

In [38]:
# Keep only the rows whose currency is not USD.
# The explicit .copy() makes the result an independent frame: the following cells
# drop and add columns on `df`, and doing that on a filtered selection of another
# frame is what pandas flags with SettingWithCopyWarning.
df = df[df['Currency'] != 'USD'].copy()

Menghapus kolom yang tidak perlu

In [39]:
# Columns that the later cells of this notebook do not read.
# The twelve DUEDATE_* / AMOUNTDUE_* columns are generated rather than typed out
# one by one, which removes the risk of a typo or a skipped index.
installment_slots = range(1, 13)
unused_columns = [
    'payment_voucher',
    'voucher',
    'booking_date',
    'COB',
    'payment_OC',
    'policyno',
    'INSTALLMENT',
    *(f'DUEDATE_{slot}' for slot in installment_slots),
    *(f'AMOUNTDUE_{slot}' for slot in installment_slots),
    'Currency',
]

# Rebind `df` to the reduced frame instead of using inplace=True: the result is a
# fresh DataFrame, so the column assignments in the next cells act on an
# independent object rather than on a frame derived from a row filter.
df = df.drop(columns=unused_columns)

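Membuat kolom jumlah hari tunggakan (`days_late` = `BILLDATE` − `pdate`; nilai negatif berarti pembayaran dilakukan setelah tanggal tagihan)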

In [40]:
# Parse both date columns so they can be subtracted from each other.
for date_column in ('pdate', 'BILLDATE'):
    df[date_column] = pd.to_datetime(df[date_column])

# Signed whole-day difference, BILLDATE minus pdate: the value is negative when
# pdate falls after BILLDATE.
df['days_late'] = df['BILLDATE'].sub(df['pdate']).dt.days

## Membuat klasifikasi kualitas kredit pada setiap transaksi
Status kolektibilitas dalam dunia perbankan:
- Kol-1 (LANCAR) = 0 (Pembayaran Tepat Waktu)
- Kol-2 (DALAM PERHATIAN KHUSUS) = 1 (Terlambat 1-90 hari sejak tanggal jatuh tempo)
- Kol-3 (KURANG LANCAR) = 2 (Terlambat 91-120 hari sejak tanggal jatuh tempo)
- Kol-4 (DIRAGUKAN) = 3 (Terlambat 121-180 hari sejak tanggal jatuh tempo)
- Kol-5 (MACET) = 4 (Terlambat lebih dari 180 hari sejak tanggal jatuh tempo)

In [41]:
def kolektibilitas(days_late):
    """Map a signed day difference to a collectibility class from 0 to 4.

    Parameters
    ----------
    days_late : number
        Signed number of days; negative values mean the payment was late.

    Returns
    -------
    int
        0 when ``days_late`` is zero or positive,
        1 for -1 down to -90,
        2 for -91 down to -120,
        3 for -121 down to -180,
        4 for anything below -180. A value for which every comparison is
        false (for example NaN) also falls through to 4.
    """
    # (lower bound, class) pairs, ordered from the most lenient bound downwards;
    # the first bound that the value reaches decides the class.
    tiers = ((0, 0), (-90, 1), (-120, 2), (-180, 3))
    for lower_bound, kol_class in tiers:
        if days_late >= lower_bound:
            return kol_class
    return 4

In [42]:
# Label every row by passing its days_late value through kolektibilitas().
df['kolektibilitas'] = df['days_late'].map(kolektibilitas)

In [43]:
# Preview the first rows, including the new days_late and kolektibilitas columns.
df.head()

Unnamed: 0,Branch,pdate,BILLDATE,Nominal,days_late,kolektibilitas
0,5,2017-06-05,2017-04-07,3076131.0,-59,1
1,5,2017-06-05,2017-05-24,3676000.0,-12,1
2,5,2017-06-05,2017-05-04,2965025.0,-32,1
3,17,2017-06-09,2017-05-30,3130500.0,-10,1
4,3,2017-06-08,2017-04-10,103000.0,-59,1


## Export to csv

In [45]:
# Destination of the preprocessed data; the same path is read back in the "Load new dataset" cell.
CLEANED_DATASET = "dataset/cleaned_dataset.csv"

In [46]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv(CLEANED_DATASET, index=False)

# Load new dataset

In [47]:
# Reload the cleaned data from disk so the modelling section starts from the saved file.
# NOTE(review): no parse_dates is given, so pdate / BILLDATE are presumably read back
# as plain text rather than datetimes; only days_late and kolektibilitas are used below.
new_df = pd.read_csv(CLEANED_DATASET, low_memory=False)

# Machine learning

In [51]:
# Feature matrix: the single column days_late (kept two-dimensional for scikit-learn).
# NOTE(review): kolektibilitas was derived from days_late by fixed thresholds, so
# this feature fully determines the label. Scores close to 1.0 further down show
# that the models re-learned those thresholds, not that they generalise.
X = new_df.loc[:, ['days_late']]

# Target vector: the collectibility class assigned during preprocessing.
y = new_df.loc[:, 'kolektibilitas']

In [52]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Standardisasi fitur X (StandardScaler: rata-rata 0, simpangan baku 1)

In [53]:
# Learn the scaling statistics from the training rows only, so the held-out
# rows do not influence them.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Inisialisasi model

In [56]:
# Candidate classifiers, keyed by class name and all created with the same seed.
candidate_classes = (RandomForestClassifier, LogisticRegression, SVC)
models = {
    estimator_cls.__name__: estimator_cls(random_state=42)
    for estimator_cls in candidate_classes
}

## Mengecek model terbaik

In [65]:
# Per-model metrics, keyed by model name.
results = {}

# Every key is declared up front and the starting score is below any attainable
# value, so the first evaluated model always becomes the initial best. The cells
# that read best_model['name'] / best_model['model'] therefore cannot hit a
# KeyError, even if every candidate scores 0.
best_model = {
    "score": float("-inf"),
    "model": None,
    "name": None,
}

for name, model in models.items():
    # Train on the scaled training split and predict the held-out split.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 all use macro averaging, as before.
    precision, recall, f1 = (
        metric(y_test, y_pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )

    # Single ranking figure: the plain mean of the four metrics.
    total_score = (accuracy + precision + recall + f1) / 4

    results[name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "Total Score (Avg)": total_score
    }

    # Strict comparison: on a tie the earlier model keeps the lead.
    if total_score > best_model['score']:
        best_model['score'] = total_score
        best_model['model'] = model
        best_model['name'] = name

### Menampilkan hasil evaluasi

In [66]:
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
results_df

Unnamed: 0,accuracy,precision,recall,f1_score,Total Score (Avg)
RandomForestClassifier,1.0,1.0,1.0,1.0,1.0
LogisticRegression,0.99544,0.991175,0.983535,0.987322,0.989368
SVC,0.963353,0.961178,0.987212,0.972185,0.970982


In [67]:
# Report the name and averaged score of the model selected by the evaluation loop.
print(f"Model Terbaik adalah: {best_model['name']} dengan score: {best_model['score']}")

Model Terbaik adalah: RandomForestClassifier dengan score: 1.0


## Fit Model dari best model

In [68]:
# Take the winning estimator and fit it on the scaled training data. It is the
# same object that the evaluation loop already fitted on these inputs; fit()
# returns the estimator itself, so `model` refers to that object.
model = best_model['model'].fit(X_train_scaled, y_train)
model

### Perbandingan prediksi dan data sebenarnya

In [70]:
# Predict the held-out rows with the selected model.
y_pred = model.predict(X_test_scaled)

# Put predictions next to the true labels, keeping y_test's original row index.
data_pred = y_test.to_frame(name='y_test')
data_pred.insert(0, 'y_pred', y_pred)

# Show the side-by-side comparison.
data_pred

Unnamed: 0,y_pred,y_test
3495,0,0
33114,1,1
54460,0,0
44129,1,1
56466,2,2
...,...,...
2495,3,3
59672,1,1
36241,1,1
18295,0,0


### Save Model

In [72]:
# Destination of the serialized (scaler, model) pair.
MODEL = "models/model.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no error if it is already there).
Path(MODEL).parent.mkdir(parents=True, exist_ok=True)

# Store the scaler together with the model so that inference can apply the same
# scaling that was used during training.
joblib.dump((scaler, model), MODEL)

['src/models/model.pkl']

### Test Predict Model

In [73]:
# Read the saved tuple back and unpack it in the order it was dumped: (scaler, model).
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
scaler_load, model_load = joblib.load(MODEL)

In [109]:
def predict_kolektibilitas(scaler, model, days_late):
    """Predict collectibility classes for one or more ``days_late`` values.

    Parameters
    ----------
    scaler : object
        Fitted transformer exposing ``transform``.
    model : object
        Fitted estimator exposing ``predict``.
    days_late : scalar or array-like
        A single number, or a list, tuple, NumPy array or pandas Series of
        numbers. Any of these is flattened into one column of samples.

    Returns
    -------
    The value returned by ``model.predict`` for the scaled input, one
    prediction per supplied value.
    """
    # np.asarray accepts scalars, sequences, arrays and Series alike, and
    # reshape(-1, 1) turns each of them into an (n_samples, 1) column. This
    # replaces the per-type branches, which failed for tuples and Series.
    values = np.asarray(days_late).reshape(-1, 1)

    # Use the column name 'days_late', matching the frame the scaler was fitted
    # on in this notebook.
    days_late_df = pd.DataFrame(values, columns=['days_late'])

    days_late_scaled = scaler.transform(days_late_df)
    return model.predict(days_late_scaled)

In [110]:
# Smoke test with the reloaded artifacts, using two days_late values: -91 and 1.
predict = predict_kolektibilitas(scaler_load, model_load, [-91,1])
print(predict)

[2 0]
