In [1]:
import pandas as pd
from sklearn.utils import resample

# 1. Load data CSV
df = pd.read_csv("data_lungsbaru.csv")

# 2. Cek distribusi target
print(df["LUNG_CANCER"].value_counts())

# 3. Pisahkan berdasarkan label target
df_yes = df[df["LUNG_CANCER"] == "YES"]
df_no = df[df["LUNG_CANCER"] == "NO"]

# 4. Tentukan target data
target_total = 1000
target_per_class = target_total // 2

# 5. Oversample masing-masing class
df_yes_upsampled = resample(df_yes, replace=True, n_samples=target_per_class, random_state=42)
df_no_upsampled = resample(df_no, replace=True, n_samples=target_per_class, random_state=42)

# 6. Gabung dan acak data
df_final = pd.concat([df_yes_upsampled, df_no_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

# 7. Simpan ke file baru
df_final.to_csv("data_lungs_1000.csv", index=False)
print("Data berhasil disimpan ke 'data_lungs_1000.csv'")

LUNG_CANCER
YES    270
NO      39
Name: count, dtype: int64
Data berhasil disimpan ke 'data_lungs_1000.csv'


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

# Load data
data = pd.read_csv("data_lungs_1000.csv")

# Encode kolom GENDER dan LUNG_CANCER
data['GENDER'] = data['GENDER'].str.strip().map({'M': 1, 'F': 0})
data['LUNG_CANCER'] = data['LUNG_CANCER'].str.strip().map({'YES': 1, 'NO': 0})

# Hapus data duplikat
data = data.drop_duplicates()

# Pisahkan fitur dan target
X = data.drop(columns=['LUNG_CANCER'])
y = data['LUNG_CANCER']

# Normalisasi fitur
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Oversampling dengan SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data untuk training dan testing
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Inisialisasi dan latih model SVM
model = SVC(kernel='rbf', probability=True, random_state=42)
model.fit(X_train, y_train)

# Evaluasi model
y_pred = model.predict(X_test)
print(f"Akurasi: {accuracy_score(y_test, y_pred):.2f}")
print("Laporan klasifikasi:\n", classification_report(y_test, y_pred))

# Simpan model dan scaler
joblib.dump(model, 'model_lung.pkl')
joblib.dump(scaler, 'scaler_lung.pkl')

# Fungsi prediksi
def predict_lung_cancer(input_data):
    input_df = pd.DataFrame([input_data])
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)
    label = prediction[0]
    return "Positif Kanker Paru" if label == 1 else "Negatif Kanker Paru"

Akurasi: 0.96
Laporan klasifikasi:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96        39
           1       0.95      0.98      0.96        41

    accuracy                           0.96        80
   macro avg       0.96      0.96      0.96        80
weighted avg       0.96      0.96      0.96        80

