In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib
import os


def load_raw(filepath="../data/raw/Baza customer Telecom v2.csv"):
    """Read the CSV file at ``filepath`` and return its contents as a DataFrame.

    Parameters
    ----------
    filepath : str
        Location of the CSV file; defaults to the raw telecom customer export.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as ``pandas.read_csv`` produces it with
        default options.
    """
    return pd.read_csv(filepath)




In [80]:
# Overview of the frame: column dtypes, non-null counts and memory usage.
# NOTE(review): in the visible cells `df` is only assigned in the last cell
# (load_raw + basic_clean), so on a fresh kernel that cell has to run before
# this one — confirm the intended cell order.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8453 entries, 0 to 8452
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PID                     8453 non-null   object 
 1   CRM_PID_Value_Segment   8448 non-null   object 
 2   EffectiveSegment        8453 non-null   object 
 3   Billing_ZIP             8451 non-null   float64
 4   KA_name                 8453 non-null   object 
 5   Active_subscribers      8453 non-null   int64  
 6   Not_Active_subscribers  4304 non-null   float64
 7   Suspended_subscribers   352 non-null    float64
 8   Total_SUBs              8453 non-null   int64  
 9   AvgMobileRevenue        8453 non-null   float64
 10  AvgFIXRevenue           8453 non-null   float64
 11  TotalRevenue            8453 non-null   float64
 12  ARPU                    8452 non-null   float64
 13  CHURN                   8453 non-null   object 
dtypes: float64(7), int64(2), object(5)
memor

In [81]:
# Count the missing values in each column.
df.isnull().sum()

PID                          0
CRM_PID_Value_Segment        5
EffectiveSegment             0
Billing_ZIP                  2
KA_name                      0
Active_subscribers           0
Not_Active_subscribers    4149
Suspended_subscribers     8101
Total_SUBs                   0
AvgMobileRevenue             0
AvgFIXRevenue                0
TotalRevenue                 0
ARPU                         1
CHURN                        0
dtype: int64

In [82]:
# Show the column labels of the frame.
df.columns

Index(['PID', 'CRM_PID_Value_Segment', 'EffectiveSegment', 'Billing_ZIP',
       'KA_name', 'Active_subscribers', 'Not_Active_subscribers',
       'Suspended_subscribers', 'Total_SUBs', 'AvgMobileRevenue',
       'AvgFIXRevenue', 'TotalRevenue', 'ARPU', 'CHURN'],
      dtype='object')

In [83]:
# Summary statistics for every column, numeric and non-numeric alike
# (include='all' adds count/unique/top/freq for the non-numeric ones).
df.describe(include='all')

Unnamed: 0,PID,CRM_PID_Value_Segment,EffectiveSegment,Billing_ZIP,KA_name,Active_subscribers,Not_Active_subscribers,Suspended_subscribers,Total_SUBs,AvgMobileRevenue,AvgFIXRevenue,TotalRevenue,ARPU,CHURN
count,8453.0,8448,8453,8451.0,8453,8453.0,4304.0,352.0,8453.0,8453.0,8453.0,8453.0,8452.0,8453
unique,8436.0,9,6,,12,,,,,,,,,2
top,2020000000000.0,Bronze,SOHO,,VM,,,,,,,,,No
freq,5.0,3820,6301,,769,,,,,,,,,7904
mean,,,,4879.727725,,7.774636,4.163336,1.576705,9.960132,148.011956,0.821185,148.833141,24.441789,
std,,,,1061.095394,,6.680524,9.462847,1.979905,10.246648,102.570539,11.73788,103.250779,22.820585,
min,,,,1000.0,,1.0,1.0,1.0,1.0,0.0,0.0,4.67,0.0,
25%,,,,4003.0,,4.0,1.0,1.0,5.0,71.5,0.0,71.83,14.07,
50%,,,,4400.0,,6.0,2.0,1.0,7.0,113.17,0.0,113.67,19.315,
75%,,,,6000.0,,10.0,4.0,1.0,12.0,191.17,0.0,192.33,27.255,


In [84]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,PID,CRM_PID_Value_Segment,EffectiveSegment,Billing_ZIP,KA_name,Active_subscribers,Not_Active_subscribers,Suspended_subscribers,Total_SUBs,AvgMobileRevenue,AvgFIXRevenue,TotalRevenue,ARPU,CHURN
0,123759242,Bronze,SOHO,6000.0,VM,2,,,2,40.17,0.0,40.17,,No
1,126145737,Bronze,SOHO,6400.0,VM,3,,,3,40.17,0.0,40.17,13.39,No
2,123506355,Bronze,SOHO,6000.0,DI,2,3.0,,5,40.17,0.0,40.17,20.09,No
3,112595585,Bronze,SOHO,4400.0,MT,1,2.0,,3,40.17,0.0,40.17,40.17,No
4,115097935,Iron,SOHO,4000.0,AD,2,1.0,,3,40.17,0.0,40.17,20.09,No


In [85]:
def basic_clean(df):
    """Return a cleaned copy of ``df`` without modifying the caller's frame.

    Cleaning steps, in order:

    1. strip leading/trailing whitespace from the column labels,
    2. drop fully duplicated rows,
    3. replace the markers ``"NA"``, ``"null"`` and ``None`` with ``np.nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left untouched; re-running the cell that calls
        this function therefore always starts from the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    # Work on a copy: assigning to `.columns` on the argument itself would
    # rename the columns of the caller's DataFrame as a side effect.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()  # strip extra spaces
    cleaned = cleaned.drop_duplicates()
    cleaned = cleaned.replace(["NA", "null", None], np.nan)
    return cleaned


In [86]:
def prepare_features(df, target_col="CHURN"):
    """Fit a preprocessing transformer on ``df`` and return the encoded features.

    Numeric columns are mean-imputed; ``object``/``category`` columns are
    one-hot encoded (categories unseen at fit time are ignored at transform
    time). Columns of any other dtype are not passed to either transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and ``target_col``.
    target_col : str
        Name of the label column; it is removed from the features and
        returned separately.

    Returns
    -------
    X_processed : array
        Output of ``preprocessor.fit_transform`` on the feature columns.
    y : numpy.ndarray
        Values of ``target_col``.
    preprocessor : ColumnTransformer
        The fitted transformer, reusable via ``preprocessor.transform``.
    feature_names : list of str
        One name per column of ``X_processed``: numeric columns first, then
        the one-hot columns.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col].values

    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

    numeric_transformer = SimpleImputer(strategy="mean")
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    X_processed = preprocessor.fit_transform(X)

    # A transformer whose column list is empty is never fitted by the
    # ColumnTransformer, so asking it for feature names would raise; only
    # query the transformers that actually received columns.
    fitted = preprocessor.named_transformers_

    numeric_names = []
    if numeric_features:
        # Ask the fitted imputer rather than reusing `numeric_features`: the
        # imputer drops columns that were entirely missing at fit time, and
        # the names must stay aligned with the columns of X_processed.
        numeric_names = list(fitted["num"].get_feature_names_out(numeric_features))

    categorical_names = []
    if categorical_features:
        categorical_names = list(fitted["cat"].get_feature_names_out(categorical_features))

    feature_names = numeric_names + categorical_names

    return X_processed, y, preprocessor, feature_names


In [87]:
def train_test_pipeline(df, target_col="CHURN", test_size=0.2, processed_dir="../data/processed", random_state=42):
    """
    Split the data into train and test sets, preprocess features, save CSVs and the preprocessor.

    The rows are split *before* any preprocessing is fitted. The preprocessor
    is then fitted on the training rows only and merely applied to the test
    rows, so statistics of the test set (imputation means, the set of one-hot
    categories) never influence the transformation.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing the feature columns and ``target_col``.
    target_col : str
        Name of the label column.
    test_size : float
        Passed to ``train_test_split``.
    processed_dir : str
        Directory that receives ``train_data.csv``, ``test_data.csv`` and
        ``preprocessor.joblib``; created if it does not exist.
    random_state : int
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Encoded features plus the ``target_col`` column for each split.
    preprocessor : ColumnTransformer
        The transformer fitted on the training rows (also saved to disk).
    """
    os.makedirs(processed_dir, exist_ok=True)

    # Split the raw rows first; fitting on the full frame would leak
    # information from the test rows into the preprocessing step.
    train_rows, test_rows = train_test_split(df, test_size=test_size, random_state=random_state)

    # Fit on the training rows only ...
    X_train, y_train, preprocessor, feature_names = prepare_features(train_rows, target_col)

    # ... and reuse the fitted transformer, unchanged, for the test rows.
    X_test = preprocessor.transform(test_rows.drop(columns=[target_col]))
    y_test = test_rows[target_col].values

    train_df = pd.DataFrame(X_train, columns=feature_names)
    train_df[target_col] = y_train

    test_df = pd.DataFrame(X_test, columns=feature_names)
    test_df[target_col] = y_test

    train_path = os.path.join(processed_dir, "train_data.csv")
    test_path = os.path.join(processed_dir, "test_data.csv")
    prep_path = os.path.join(processed_dir, "preprocessor.joblib")

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
    joblib.dump(preprocessor, prep_path)

    print(f"Train CSV saved at: {os.path.abspath(train_path)}")
    print(f"Test CSV saved at:  {os.path.abspath(test_path)}")
    print(f"Preprocessor saved at: {os.path.abspath(prep_path)}")

    return train_df, test_df, preprocessor


# Columns that identify a customer rather than describe one. PID is an
# object column with an almost unique value per row, so leaving it in would
# make the one-hot encoder emit one indicator column per customer — a very
# wide output that carries nothing a model can generalise from.
ID_COLUMNS = ["PID"]

# Keep the raw frame under its own name so that re-running this cell always
# starts from the file contents instead of an already-cleaned frame.
raw_df = load_raw("../data/raw/Baza customer Telecom v2.csv")
df = basic_clean(raw_df)


train_df, test_df, preprocessor = train_test_pipeline(
    df.drop(columns=ID_COLUMNS), target_col="CHURN"
)





Train CSV saved at: c:\Users\Usser\Desktop\Churn-Customer\data\processed\train_data.csv
Test CSV saved at:  c:\Users\Usser\Desktop\Churn-Customer\data\processed\test_data.csv
Preprocessor saved at: c:\Users\Usser\Desktop\Churn-Customer\data\processed\preprocessor.joblib
