# Split Data for Training and Testing

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Read the processed churn dataset and print a quick structural overview
import pandas as pd
from pathlib import Path

# Random seed reused by later cells (e.g. as random_state for the train/test split)
SEED = 42

# Relative path: resolved against the notebook's current working directory
proc_path = Path("../data/processed/telecom_churn_processed.csv")
print("Loading:", proc_path.resolve())

df = pd.read_csv(proc_path)
print("Shape:", df.shape)
display(df.head(5))

# Structural overview: column names, dtypes, and per-column null counts
for heading, summary in (
    ("\nColumns:", list(df.columns)),
    ("\nData types:\n", df.dtypes),
    ("\nMissing values per column:\n", df.isna().sum()),
):
    print(heading, summary)

# Target overview: class counts (NaN counted too) and the distinct labels present
churn = df['Churn']
print("\nTarget distribution (Churn):\n", churn.value_counts(dropna=False), "\n")
print("Unique values in target:", churn.unique())

# Fully identical rows across all columns
print("Duplicate rows:", df.duplicated().sum())


Loading: C:\Users\arsem\Desktop\Customer_Churn_Analysis\data\processed\telecom_churn_processed.csv
Shape: (7032, 20)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1



Columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Data types:
 gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

Missing values per column:
 gender              0
SeniorCitizen       0
Partner            

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Rendered with display() rather than print() so the wide table is shown as one
# rich table instead of plain text wrapped into several column groups.
print("\nDescriptive Statistics:")
display(df.describe())


Descriptive Statistics:
             gender  SeniorCitizen      Partner   Dependents       tenure  \
count  7032.000000    7032.000000  7032.000000  7032.000000  7032.000000   
mean      0.504693       0.162400     0.482509     0.298493    32.421786   
std       0.500014       0.368844     0.499729     0.457629    24.545260   
min       0.000000       0.000000     0.000000     0.000000     1.000000   
25%       0.000000       0.000000     0.000000     0.000000     9.000000   
50%       1.000000       0.000000     0.000000     0.000000    29.000000   
75%       1.000000       0.000000     1.000000     1.000000    55.000000   
max       1.000000       1.000000     1.000000     1.000000    72.000000   

       PhoneService  PaperlessBilling  MonthlyCharges  TotalCharges  \
count   7032.000000       7032.000000     7032.000000   7032.000000   
mean       0.903299          0.592719       64.798208   2283.300441   
std        0.295571          0.491363       30.085974   2266.771362   
min  

In [4]:
# Drop exact duplicate rows, then report how many rows are still flagged as duplicates
# NOTE(review): this frame has no customer-ID column, so identical rows may be
# distinct customers rather than true duplicates — confirm dropping is intended.
df = df.drop_duplicates()
remaining_duplicates = df.duplicated().sum()
print("Duplicate rows after removal:", remaining_duplicates)

Duplicate rows after removal: 0


In [5]:
# One-hot encode the categorical columns of the de-duplicated frame.
# drop_first=True leaves out the first level of each encoded column (k-1 dummies for k levels).
df_encoded = pd.get_dummies(data=df, drop_first=True)

In [6]:
# Fraction of rows held out for the test set (0.2 = 20%)
TEST_SIZE = 0.2

# Separate the encoded frame into the feature matrix and the target column
X = df_encoded.drop(columns=['Churn'])  # Features
y = df_encoded['Churn']  # Target variable

# Stratify on y so both splits keep the same churn ratio; SEED makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

# Sanity checks: split sizes add up to the input and features stay aligned with labels
assert len(X_train) + len(X_test) == len(X), "split sizes do not add up to the input"
assert X_train.index.equals(y_train.index), "train features/labels are misaligned"
assert X_test.index.equals(y_test.index), "test features/labels are misaligned"

In [7]:
# Compare the churn class proportions of the two splits
for heading, labels in (
    ("Training set target distribution (Churn):\n", y_train),
    ("\nTesting set target distribution (Churn):\n", y_test),
):
    # normalize=True reports class shares instead of raw counts
    print(heading, labels.value_counts(normalize=True))

Training set target distribution (Churn):
 Churn
0    0.735021
1    0.264979
Name: proportion, dtype: float64

Testing set target distribution (Churn):
 Churn
0    0.735378
1    0.264622
Name: proportion, dtype: float64


In [8]:
# Persist each split as its own CSV; index=False leaves the row index out of the files
split_outputs = (
    (X_train, "../data/processed/X_train.csv"),
    (X_test, "../data/processed/X_test.csv"),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
)
for split, csv_path in split_outputs:
    split.to_csv(csv_path, index=False)