In [4]:
import pandas as pd
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# --- 1. KONFIGURASI ---
PROJECT_ID = "numeric-polygon-468208-a7"
DATASET_ID = "churn_transformed_data"
TABLE_ID = "stg_telco__customers"
MODEL_FILENAME = "churn_model.pkl"

# --- 2. AMBIL DATA DARI BIGQUERY ---
print("Mengambil data dari BigQuery...")
client = bigquery.Client(project=PROJECT_ID)
query = f"SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`"
df = client.query(query).to_dataframe()
print(f"Data berhasil diambil. Jumlah baris: {len(df)}")


# --- 3. PERSIAPAN DATA DAN PEMBERSIHAN EKSPLISIT ---

# PERUBAHAN KUNCI 1: Pisahkan X dan y SEKARANG JUGA.
# Ini untuk memastikan operasi pembersihan pada X tidak akan pernah 'bocor' dan mempengaruhi y.
X = df.drop(['is_churn', 'customer_id'], axis=1)
y = df['is_churn']

print("\n--- DIAGNOSTIK DATA TARGET (y) SEBELUM PEMBERSIHAN ---")
# Kita cek kondisi 'y' sebelum kita melakukan apa pun.
# dropna=False akan menunjukkan jika ada nilai Null (NaN)
print(y.value_counts(dropna=False))


# PERUBAHAN KUNCI 2: Lakukan pembersihan HANYA pada DataFrame X.
print("\nMembersihkan DataFrame Fitur (X)...")
cols_to_convert_to_numeric = ['tenure_months', 'monthly_charges', 'total_charges']

for col in cols_to_convert_to_numeric:
    # Ubah menjadi numerik, paksa error menjadi NaN
    X[col] = pd.to_numeric(X[col], errors='coerce')
    # Isi NaN hanya di kolom ini dengan median kolom ini
    X[col].fillna(X[col].median(), inplace=True)

print("Pengecekan nilai null di X setelah dibersihkan:")
print(X.isnull().sum().sum()) # Harus mencetak 0


# PERUBAHAN KUNCI 3: Pastikan 'y' adalah tipe data integer standar.
# Tipe 'Int64' (dengan huruf besar I) bisa menyebabkan masalah. Kita ubah ke 'int' standar.
y = y.astype(int)

print("\n--- DIAGNOSTIK DATA TARGET (y) SETELAH PEMBERSIHAN ---")
print(y.value_counts(dropna=False))


# --- 4. PEMBAGIAN DATA ---
print("\nMembagi data menjadi set training dan testing...")
numerical_features = X.select_dtypes(include=['number', 'Int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'string', 'bool', 'boolean']).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\n--- DIAGNOSTIK DATA TARGET (y_train) SETELAH SPLIT ---")
# Ini adalah tes terakhir. y_train HARUS punya 2 kelas.
print(y_train.value_counts())


# --- 5. BUAT PIPELINE & LATIH MODEL ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

print("\nMemulai training model...")
model_pipeline.fit(X_train, y_train)
print("Training model selesai.")


# --- 6. EVALUASI MODEL ---
print("\nMengevaluasi performa model...")
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAkurasi model: {accuracy:.4f}")
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, y_pred))


# --- 7. SIMPAN MODEL ---
print(f"\nMenyimpan model ke file '{MODEL_FILENAME}'...")
joblib.dump(model_pipeline, MODEL_FILENAME)
print("Model berhasil disimpan.")

Mengambil data dari BigQuery...




Data berhasil diambil. Jumlah baris: 7043

--- DIAGNOSTIK DATA TARGET (y) SEBELUM PEMBERSIHAN ---
is_churn
0    5174
1    1869
Name: count, dtype: Int64

Membersihkan DataFrame Fitur (X)...
Pengecekan nilai null di X setelah dibersihkan:
0

--- DIAGNOSTIK DATA TARGET (y) SETELAH PEMBERSIHAN ---
is_churn
0    5174
1    1869
Name: count, dtype: int64

Membagi data menjadi set training dan testing...

--- DIAGNOSTIK DATA TARGET (y_train) SETELAH SPLIT ---
is_churn
0    4139
1    1495
Name: count, dtype: int64

Memulai training model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Training model selesai.

Mengevaluasi performa model...

Akurasi model: 0.7977

Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.53      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409


Menyimpan model ke file 'churn_model.pkl'...
Model berhasil disimpan.
