In [3]:
import pandas as pd
import os
test_df = pd.DataFrame({'test_column': [1, 2, 3]})
test_df.to_csv(r'E:\churn\notebooks\app\data\test_file.csv', index=False)
if os.path.exists("test_file.csv"):
    print("file saved")
else:
    print("file not saved")

file not saved


In [4]:
# --- Imports ---
import pandas as pd
import numpy as np
from pathlib import Path

# For modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# --- Paths ---
NOTEBOOK_DIR = Path.cwd()
DATA_DIR = NOTEBOOK_DIR / "data"
RAW_DATA_PATH = DATA_DIR / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
CLEANED_DATA_PATH = DATA_DIR / "cleaned_telco_customers.csv"

# --- Load & Clean Data ---
def load_and_clean_kaggle_data():
    """
    Loads Kaggle Telco Churn dataset, cleans it, saves cleaned CSV, and returns DataFrame.
    """
    print("📥 Loading Kaggle Telco Churn dataset...")
    df = pd.read_csv(RAW_DATA_PATH)

    # Replace spaces with NaN
    df = df.replace(" ", np.nan)

    # Drop missing values
    df = df.dropna()

    # Convert TotalCharges to numeric
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

    # Encode Churn column
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

    # Encode categorical variables
    for col in df.select_dtypes(include="object").columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    print(f"✅ Kaggle data loaded and cleaned successfully. Shape: {df.shape}")

    # Save cleaned dataset for Streamlit
    DATA_DIR.mkdir(exist_ok=True)
    df.to_csv(CLEANED_DATA_PATH, index=False)
    print(f"💾 Cleaned data saved to {CLEANED_DATA_PATH}")

    return df

# --- Train Model ---
def train_model(df):
    """
    Splits data, trains Logistic Regression, and prints performance.
    """
    X = df.drop("Churn", axis=1)
    y = df["Churn"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("📊 Classification Report:\n", classification_report(y_test, y_pred))
    print("🎯 ROC-AUC Score:", roc_auc_score(y_test, y_proba))

    return model

# --- Main Pipeline ---
if __name__ == "__main__":
    df_clean = load_and_clean_kaggle_data()
    model = train_model(df_clean)


📥 Loading Kaggle Telco Churn dataset...
✅ Kaggle data loaded and cleaned successfully. Shape: (7032, 21)
💾 Cleaned data saved to E:\churn\notebooks\app\data\cleaned_telco_customers.csv
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      1033
           1       0.62      0.56      0.59       374

    accuracy                           0.79      1407
   macro avg       0.73      0.72      0.72      1407
weighted avg       0.79      0.79      0.79      1407

🎯 ROC-AUC Score: 0.8322315461430547


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
