In [3]:
import pandas as pd
import sqlite3
import os
from sklearn.model_selection import StratifiedShuffleSplit

# 1. Connect to Database
# Candidate locations for the SQLite file, checked in order; the first path
# that exists on disk is used.
db_path_options = [
    "../data/churn.db"
]

DB_NAME = None
for path in db_path_options:
    if os.path.exists(path):
        DB_NAME = path
        break

if DB_NAME is None:
    # Best-effort: only report the problem. `churning` is NOT defined in this
    # branch, so any later cell that uses it will fail with a NameError.
    print("❌ Cannot find the database file. Please check the path.")
else:
    print(f"✅ Connect to database: {DB_NAME}")

    # 2. Execute SQL Query to fetch data
    # ---------------------------------------------------------
    # Join the customer table with its credit statistics and the geography
    # lookup so each row carries the customer attributes plus the `exited` flag.
    query = """
    SELECT 
        c.customer_id, c.surname, c.gender, c.age, 
        g.geography_name as Geography, 
        s.credit_score, s.tenure, s.balance, s.num_of_products, 
        s.has_cr_card, s.is_active_member, s.estimated_salary, 
        s.exited
    FROM customer_info c
    JOIN credit_stats s ON c.customer_id = s.customer_id
    JOIN lookup_geography g ON c.geo_id = g.id
    """

    conn = sqlite3.connect(DB_NAME)
    try:
        churning = pd.read_sql(query, conn)
    finally:
        # Always release the connection, even when the query raises; a sqlite3
        # connection used as a context manager would not close it for us.
        conn.close()

    print(f"Data loaded successfully! Shape: {churning.shape}")
    print(churning.head(3))

✅ Connect to database: ../data/churn.db
Data loaded successfully! Shape: (10000, 13)
   customer_id   surname  gender  age Geography  credit_score  tenure  \
0     15634602  Hargrave  Female   42    France           619       2   
1     15647311      Hill  Female   41     Spain           608       1   
2     15619304      Onio  Female   42    France           502       8   

     balance  num_of_products  has_cr_card  is_active_member  \
0       0.00                1            1                 1   
1   83807.86                1            0                 1   
2  159660.80                3            1                 0   

   estimated_salary  exited  
0         101348.88       1  
1         112542.58       0  
2         113931.57       1  


In [4]:
# 3. Explore the data to decide whether stratified splitting is needed

print("\n=== Target Variable Distribution ===")
# Relative frequency of each value in the target column.
churn_rate = churning["exited"].value_counts(normalize=True)
print(churn_rate)


# 4. Stratified Shuffle Split

# A single 80/20 split stratified on "exited"; the fixed seed makes it repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(churning, churning["exited"]))

# split() yields positional row numbers, not index labels, so rows must be
# selected with .iloc. Label-based .loc only returns the same rows while the
# frame still has its default 0..n-1 index.
strat_train_set = churning.iloc[train_index]
strat_test_set = churning.iloc[test_index]

# 5. Test the stratification results

def churn_proportions(data: pd.DataFrame) -> pd.Series:
    """Return the relative frequency of each value in the "exited" column.

    Args:
        data: Frame that contains an "exited" column.

    Returns:
        A Series indexed by the distinct "exited" values, holding each
        value's share of the rows (normalized value counts).
    """
    exited_flags = data["exited"]
    return exited_flags.value_counts(normalize=True)

# One column of "exited" proportions per dataset, aligned on the class label
# so the full data, the training split and the test split can be compared.
proportion_columns = {
    label: churn_proportions(frame)
    for label, frame in (
        ("Overall", churning),
        ("Train", strat_train_set),
        ("Test", strat_test_set),
    )
}
compare_props = pd.DataFrame(proportion_columns).sort_index()

print("\n=== Stratification Results ===")
print(compare_props)
print("\n✅ Stratified splitting completed successfully!")

# 6. Save stratified datasets to CSV files

save_dir = "../data"

# Write each split to its own CSV, leaving out the DataFrame index column.
for csv_name, split_frame in (
    ("strat_train_set.csv", strat_train_set),
    ("strat_test_set.csv", strat_test_set),
):
    split_frame.to_csv(os.path.join(save_dir, csv_name), index=False)

print("Saved strat_train_set.csv & strat_test_set.csv")


=== Target Variable Distribution ===
exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64

=== Stratification Results ===
        Overall    Train    Test
exited                          
0        0.7963  0.79625  0.7965
1        0.2037  0.20375  0.2035

✅ Stratified splitting completed successfully!
Saved strat_train_set.csv & strat_test_set.csv
