In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
df = pd.read_csv('wbc.csv')

# Tampilkan informasi dataset
print("Shape dataset:", df.shape)
print("\n5 baris pertama:")
print(df.head())
print("\nInformasi kolom:")
print(df.info())
print("\nStatistik deskriptif:")
print(df.describe())

Shape dataset: (569, 33)

5 baris pertama:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_wor

In [3]:
# Pisahkan variabel
# Variabel yang tidak dapat digunakan
unusable_features = ['id']

# Variabel yang dapat digunakan
usable_features = [col for col in df.columns if col not in unusable_features]

print("Variabel yang TIDAK dapat digunakan:")
print(unusable_features)
print(f"\nJumlah: {len(unusable_features)}")

print("\n" + "="*50)
print("\nVariabel yang DAPAT digunakan:")
print(usable_features)
print(f"\nJumlah: {len(usable_features)}")

# Drop kolom id
df_clean = df.drop(columns=unusable_features)
print(f"\nShape dataset setelah menghapus kolom 'id': {df_clean.shape}")

Variabel yang TIDAK dapat digunakan:
['id']

Jumlah: 1


Variabel yang DAPAT digunakan:
['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

Jumlah: 32

Shape dataset setelah menghapus kolom 'id': (569, 32)


In [4]:
# Cek nilai unik pada kolom diagnosis sebelum encoding
print("Nilai unik pada kolom 'diagnosis' sebelum encoding:")
print(df_clean['diagnosis'].value_counts())
print("\nTipe data:", df_clean['diagnosis'].dtype)

# Lakukan encoding menggunakan LabelEncoder
label_encoder = LabelEncoder()
df_clean['diagnosis'] = label_encoder.fit_transform(df_clean['diagnosis'])

print("\n" + "="*50)
print("\nNilai setelah encoding:")
print(df_clean['diagnosis'].value_counts())
print("\nTipe data:", df_clean['diagnosis'].dtype)

# Mapping encoding
print("\nMapping encoding:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {label} → {label_encoder.transform([label])[0]}")

print(f"\n5 baris pertama setelah encoding:")
print(df_clean.head())

Nilai unik pada kolom 'diagnosis' sebelum encoding:
diagnosis
B    357
M    212
Name: count, dtype: int64

Tipe data: object


Nilai setelah encoding:
diagnosis
0    357
1    212
Name: count, dtype: int64

Tipe data: int64

Mapping encoding:
  B → 0
  M → 1

5 baris pertama setelah encoding:
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974   

In [5]:
# Identifikasi kolom numerik (kecuali diagnosis yang sudah di-encode)
numeric_columns = df_clean.columns.drop('diagnosis')

print("Kolom numerik yang akan di-standardisasi:")
print(list(numeric_columns))
print(f"\nJumlah kolom: {len(numeric_columns)}")

# Tampilkan statistik sebelum standardisasi
print("\n" + "="*50)
print("\nStatistik SEBELUM standardisasi (5 kolom pertama):")
print(df_clean[numeric_columns[:5]].describe())

Kolom numerik yang akan di-standardisasi:
['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

Jumlah kolom: 31


Statistik SEBELUM standardisasi (5 kolom pertama):
       radius_mean  texture_mean  perimeter_mean    area_mean  smoothness_mean
count   569.000000    569.000000      569.000000   569.000000       569.000000
mean     14.127292     19.289649       91.969033   654.889104         0.096360
std       3.524049      4.301036       24.298981   351.914129         0.014064
min       

In [6]:
# Lakukan standardisasi menggunakan StandardScaler
scaler = StandardScaler()
df_clean[numeric_columns] = scaler.fit_transform(df_clean[numeric_columns])

# Tampilkan statistik setelah standardisasi
print("Statistik SETELAH standardisasi (5 kolom pertama):")
print(df_clean[numeric_columns[:5]].describe())

print("\n" + "="*50)
print("\nPerhatikan bahwa mean ≈ 0 dan std ≈ 1 setelah standardisasi")

# Tampilkan beberapa baris data setelah standardisasi
print("\n" + "="*50)
print("\n5 baris pertama setelah standardisasi:")
print(df_clean.head())

Statistik SETELAH standardisasi (5 kolom pertama):
        radius_mean  texture_mean  perimeter_mean     area_mean  \
count  5.690000e+02  5.690000e+02    5.690000e+02  5.690000e+02   
mean  -1.373633e-16  6.868164e-17   -1.248757e-16 -2.185325e-16   
std    1.000880e+00  1.000880e+00    1.000880e+00  1.000880e+00   
min   -2.029648e+00 -2.229249e+00   -1.984504e+00 -1.454443e+00   
25%   -6.893853e-01 -7.259631e-01   -6.919555e-01 -6.671955e-01   
50%   -2.150816e-01 -1.046362e-01   -2.359800e-01 -2.951869e-01   
75%    4.693926e-01  5.841756e-01    4.996769e-01  3.635073e-01   
max    3.971288e+00  4.651889e+00    3.976130e+00  5.250529e+00   

       smoothness_mean  
count     5.690000e+02  
mean     -8.366672e-16  
std       1.000880e+00  
min      -3.112085e+00  
25%      -7.109628e-01  
50%      -3.489108e-02  
75%       6.361990e-01  
max       4.770911e+00  


Perhatikan bahwa mean ≈ 0 dan std ≈ 1 setelah standardisasi


5 baris pertama setelah standardisasi:
   diagnosis  rad

In [7]:
print("="*60)
print("RINGKASAN PREPROCESSING WISCONSIN BREAST CANCER DATASET")
print("="*60)

print("\n1. PEMISAHAN VARIABEL:")
print(f"   - Variabel tidak digunakan: {unusable_features}")
print(f"   - Variabel digunakan: {len(usable_features)} kolom")

print("\n2. ENCODING KOLOM 'diagnosis':")
print(f"   - B (Benign/Jinak) → 0")
print(f"   - M (Malignant/Ganas) → 1")
print(f"   - Distribusi: {dict(df_clean['diagnosis'].value_counts())}")

print("\n3. STANDARDISASI KOLOM NUMERIK:")
print(f"   - Jumlah kolom di-standardisasi: {len(numeric_columns)}")
print(f"   - Metode: StandardScaler (mean=0, std=1)")

print("\n4. DATASET AKHIR:")
print(f"   - Shape: {df_clean.shape}")
print(f"   - Kolom: {list(df_clean.columns)}")

print("\n5. INFO DATASET AKHIR:")
print(df_clean.info())

print("\n" + "="*60)
print("Dataset siap digunakan untuk modeling!")
print("="*60)

RINGKASAN PREPROCESSING WISCONSIN BREAST CANCER DATASET

1. PEMISAHAN VARIABEL:
   - Variabel tidak digunakan: ['id']
   - Variabel digunakan: 32 kolom

2. ENCODING KOLOM 'diagnosis':
   - B (Benign/Jinak) → 0
   - M (Malignant/Ganas) → 1
   - Distribusi: {0: np.int64(357), 1: np.int64(212)}

3. STANDARDISASI KOLOM NUMERIK:
   - Jumlah kolom di-standardisasi: 31
   - Metode: StandardScaler (mean=0, std=1)

4. DATASET AKHIR:
   - Shape: (569, 32)
   - Kolom: ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fract