# Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load Data

In [2]:
df = pd.read_csv('data/wbc.csv')

# 1. Pisahkan antara variabel yang dapat digunakan dan variabel yang tidak dapat digunakan.

In [3]:
df = df.drop(['id', 'Unnamed: 32'], axis=1)

# 2. Lakukan proses encoding pada kolom "diagnosis".

In [4]:
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

## Pisahkan fitur Y dari X

In [5]:
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']


# 3. Lakukan proses standardisasi pada semua kolom yang memiliki nilai numerik.

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# 4. Lakukan proses seleksi fitur. Anda dapat menggunakan SelectKBest.

## Split data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

## Seleksi Fitur dengan SelectKBest

In [8]:
selector = SelectKBest(score_func=f_classif, k=10)  # contoh k=10
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask].tolist()
print('Fitur terpilih:', selected_features)

Fitur terpilih: ['radius_mean', 'perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean', 'radius_worst', 'perimeter_worst', 'area_worst', 'concavity_worst', 'concave points_worst']


# 5. Lakukan proses pengujian dengan model Logistic Regression seperti pada praktikum 1.

# 6. Anda dapat menggunakan model pipeline untuk mempermudah perkejaan Anda.

In [14]:
pipe = Pipeline([
    ('select', SelectKBest(score_func=f_classif, k=20)),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe.fit(X_train, y_train)
y_pred_pipe = pipe.predict(X_test)
print("=== Filter (ANOVA) + LR ===")
print("Accuracy:", accuracy_score(y_test, y_pred_pipe))
print(classification_report(y_test, y_pred_pipe))


=== Filter (ANOVA) + LR ===
Accuracy: 0.9912280701754386
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.98      0.99        42

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



# 7. Berdasarkan hasil analisa Anda, berapa jumlah fitur terbaik yang dapat digunakan? Apa saja fitur tersebut?

In [10]:
best_acc = 0
best_k = 0
best_features = []
for k in range(5, X.shape[1]+1, 5):
    pipe_k = Pipeline([
        ('select', SelectKBest(score_func=f_classif, k=k)),
        ('clf', LogisticRegression(max_iter=1000))
    ])
    pipe_k.fit(X_train, y_train)
    y_pred_k = pipe_k.predict(X_test)
    acc = accuracy_score(y_test, y_pred_k)
    if acc > best_acc:
        best_acc = acc
        best_k = k
        mask = pipe_k.named_steps['select'].get_support()
        best_features = X.columns[mask].tolist()
    print(f'Jumlah fitur: {k}, Akurasi: {acc}')
print('\n=== Kesimpulan ===')
print(f'Jumlah fitur terbaik: {best_k}')
print('Nama fitur terbaik:', best_features)
print(f'Akurasi terbaik: {best_acc}')




Jumlah fitur: 5, Akurasi: 0.9649122807017544
Jumlah fitur: 10, Akurasi: 0.9649122807017544
Jumlah fitur: 15, Akurasi: 0.9736842105263158
Jumlah fitur: 20, Akurasi: 0.9912280701754386
Jumlah fitur: 25, Akurasi: 0.9736842105263158
Jumlah fitur: 30, Akurasi: 0.9649122807017544

=== Kesimpulan ===
Jumlah fitur terbaik: 20
Nama fitur terbaik: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'concave points_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst']
Akurasi terbaik: 0.9912280701754386
