In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Data preparation
# Generate a synthetic classification dataset: 1000 samples with 20 features,
# 10 of them informative and 5 redundant.
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Split into a training set (70%) and a test set (30%); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 2. A simple pipeline
# Combines feature standardization and a Logistic Regression model into a
# single estimator, so both steps are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression()),
    ]
)

# Fit on the training data, then predict the test data. fit() returns the
# pipeline itself, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class metrics on the test data.
print("\nHasil Pipeline Logistic Regression:")
print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 3. Pipeline with polynomial features
# Adds degree-2 polynomial features (without the bias column) between the
# scaler and the classifier to capture non-linear relationships in the data.
# max_iter is raised to 500 for the logistic regression step.
poly_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('logreg', LogisticRegression(max_iter=500)),
    ]
)

# Fit on the training data and predict the test data in one chained call.
y_poly_pred = poly_pipeline.fit(X_train, y_train).predict(X_test)

print("\nHasil Pipeline Logistic Regression dengan Polynomial Features:")
print("Akurasi:", accuracy_score(y_test, y_poly_pred))
print(classification_report(y_test, y_poly_pred))

# 4. Grid search over the pipeline
# Parameters of a pipeline step are addressed as "<step>__<parameter>".
# Only the regularization strength C has several candidates; the scaler
# and the penalty each list a single option.
param_grid = {
    'scaler': [StandardScaler()],
    'logreg__C': [0.1, 1, 10],
    'logreg__penalty': ['l2'],
}

# 5-fold cross-validated search on the training data, scored by accuracy.
# fit() returns the fitted search object, so it is chained onto the
# constructor call.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# GridSearchCV tunes the pipeline parameters (here the Logistic Regression
# regularization) to improve accuracy; show the winning combination and its
# mean cross-validation score.
print("\nHasil GridSearchCV:")
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# 5. Pipeline with a Random Forest
# make_pipeline builds the pipeline without explicit step names.
rf_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=42),
)

# Fit the Random Forest pipeline on the training data and predict the test
# data in one chained call.
y_rf_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

print("\nHasil Pipeline Random Forest:")
print("Akurasi:", accuracy_score(y_test, y_rf_pred))
print(classification_report(y_test, y_rf_pred))

# Conclusions
# Print the heading and the four takeaways with a single call; sep="\n"
# places each item on its own line, just like separate print() calls.
print(
    "\nKesimpulan:",
    "1. Pipeline menyederhanakan alur preprocessing dan pelatihan model menjadi satu kesatuan.",
    "2. Penambahan fitur polinomial dapat meningkatkan performa pada data dengan hubungan non-linear.",
    "3. GridSearchCV membantu mengoptimalkan hyperparameter dalam pipeline untuk performa terbaik.",
    "4. Pipeline dapat digunakan dengan berbagai jenis model, termasuk Logistic Regression dan Random Forest.",
    sep="\n",
)



Hasil Pipeline Logistic Regression:
Akurasi: 0.8366666666666667
              precision    recall  f1-score   support

           0       0.88      0.81      0.84       160
           1       0.80      0.87      0.83       140

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300


Hasil Pipeline Logistic Regression dengan Polynomial Features:
Akurasi: 0.9
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       160
           1       0.90      0.89      0.89       140

    accuracy                           0.90       300
   macro avg       0.90      0.90      0.90       300
weighted avg       0.90      0.90      0.90       300


Hasil GridSearchCV:
Best Parameters: {'logreg__C': 1, 'logreg__penalty': 'l2', 'scaler': StandardScaler()}
Best Cross-Validation Score: 0.8428571428571429

Hasil Pipeline Random Forest:
Akurasi: 0.916666