## Importing Libraries

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Configurations

In [2]:
# directory holding the raw training and testing datasets
# NOTE: the default is a machine-specific absolute path; set the CHURN_RAW_DATA_DIR
# environment variable to run this notebook on another machine without editing the cell
raw_data_dir = os.environ.get(
    'CHURN_RAW_DATA_DIR',
    'C:/Users/ansar/Desktop/Workspace/Personal/MLOPs/Customer Churn Prediction/api/artifacts/raw'
).rstrip('/\\')  # tolerate a trailing separator in the override

# path for the training and testing datasets
train_data_path = f'{raw_data_dir}/train.csv'
test_data_path = f'{raw_data_dir}/test.csv'

# removing the pandas display limits so every row and column is shown (no truncation)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# reading the raw CSV files into dataframes (training set first, then testing set)
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [4]:
# converting the TotalCharges feature into numeric (values that cannot be parsed become NaN)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'], errors='coerce')
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'], errors='coerce')

# dropping the customerID column
# no inplace + errors='ignore' keep this cell re-runnable: a second run no longer raises KeyError
df_train = df_train.drop(columns='customerID', errors='ignore')
df_test = df_test.drop(columns='customerID', errors='ignore')

# separating numeric, categorical, target features
# TotalCharges is handled on its own (log_feature) and SeniorCitizen is left out of the numeric list;
# Churn is excluded from both lists so they come out the same whether or not the target
# has already been mapped to integers by an earlier run of this cell
log_feature = ['TotalCharges']
numeric_features = (
    df_train.select_dtypes(include=['int64', 'float64'])
    .columns.difference(['SeniorCitizen', 'TotalCharges', 'Churn'])
    .tolist()
)
categorical_features = df_train.select_dtypes(include='object').columns.difference(['Churn']).tolist()

# mapping the target feature Yes:1, No:0
# the 1:1 and 0:0 entries make the mapping idempotent: re-running the cell keeps labels that
# are already mapped instead of turning them into NaN
target_mapping = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df_train['Churn'] = df_train['Churn'].map(target_mapping)
df_test['Churn'] = df_test['Churn'].map(target_mapping)

In [5]:
# pipeline for the skewed feature: impute -> log1p -> standardize
log_steps = [
    # missing values are replaced by the constant 0.0 before the log transform
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    # log(1 + x); output feature names are kept identical to the input names
    ('log_transform', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ('scaler', StandardScaler()),
]
log_pipeline = Pipeline(steps=log_steps)

In [6]:
# pipeline for the remaining numeric features: standardization only
numeric_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

In [7]:
# pipeline for the categorical features: one-hot encoding
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit
categorical_pipeline = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

In [8]:
# full pipeline: every group of columns is routed to its own transformation pipeline
# NOTE(review): columns that belong to none of the three groups are dropped (default
# remainder='drop'). SeniorCitizen is explicitly removed from numeric_features and is only
# picked up by categorical_features if it has object dtype, so it is presumably discarded
# here - confirm this is intended
preprocessor = ColumnTransformer(
    transformers=[
        ('log', log_pipeline, log_feature),
        ('numeric', numeric_pipeline, numeric_features),
        ('categorical', categorical_pipeline, categorical_features),
    ],
    # always return a dense array: with the default sparse_threshold=0.3 the result is sparse
    # or dense depending on the density of the data, while the dataframes built from the
    # transformed output need a dense array
    sparse_threshold=0,
)

In [9]:
# splitting each dataframe into the target vector (y) and the feature matrix (X)
target_column = 'Churn'

y_train = df_train[target_column]
y_test = df_test[target_column]

X_train = df_train.drop(columns=[target_column])
X_test = df_test.drop(columns=[target_column])

In [10]:
# learning the transformations from the training data only ...
preprocessor.fit(X_train)

# ... then applying the fitted transformations to both datasets
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [11]:
# output column names reported by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

# wrapping the processed arrays in dataframes and appending the target as the last column
# (.values is used so the target is attached by position, not by index alignment)
train_processed = pd.DataFrame(X_train_processed, columns=feature_names).assign(
    Churn=y_train.values
)
test_processed = pd.DataFrame(X_test_processed, columns=feature_names).assign(
    Churn=y_test.values
)

In [12]:
# saving the datasets
# creating the output directory first: to_csv does not create missing parent directories,
# so on a fresh checkout without ./data the save would fail
output_dir = './data'
os.makedirs(output_dir, exist_ok=True)

train_processed.to_csv(f'{output_dir}/train_processed.csv', index=False)
test_processed.to_csv(f'{output_dir}/test_processed.csv', index=False)

print('Datasets saved successfully.')

Datasets saved successfully.


In [13]:
# sanity check: (rows, columns) of the processed training set; columns = features + Churn
train_processed.shape

(5634, 45)

In [14]:
# sanity check: (rows, columns) of the processed testing set; columns = features + Churn
test_processed.shape

(1409, 45)