In [1]:
import os

In [2]:
%pwd

'd:\\NLP\\NLP Projects\\Churn-Prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\NLP\\NLP Projects\\Churn-Prediction'

In [5]:
import pandas as pd

In [6]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_datatransformation``
    from the ``data_transformation`` section of the config file and the
    ``TARGET_COLUMN`` entry of the schema file.

    NOTE(review): the fields are annotated as ``Path`` but the values are
    passed through from the parsed YAML without conversion — presumably
    plain strings; confirm before relying on ``Path`` methods.
    """
    # Directory for this stage; created before the config object is built.
    root_dir: Path
    # Raw input CSV read at the start of the transformation.
    local_data_file: Path
    # Destination (and later source) of the training split CSV.
    train_csv: Path
    # Destination (and later source) of the test split CSV.
    test_csv: Path
    # joblib dump target for the transformed training features.
    X_train_trans: Path
    # joblib dump target for the transformed test features.
    X_test_trans: Path
    # joblib dump target for the training labels.
    y_train: Path
    # joblib dump target for the test labels.
    y_test: Path
    # Name of the label column that is separated from the features.
    target_column: str

In [7]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_path=CONFIG_FILE_PATH,
        schema_path=SCHEMA_FILE_PATH,
        param_path=PARAMS_FILE_PATH,
    ):
        """Parse the three YAML files and create the artifacts root directory.

        Note that, despite their names, ``self.config_path``,
        ``self.schema_path`` and ``self.param_path`` hold the *parsed
        contents* returned by ``read_yaml``, not the file paths.
        """
        self.config_path, self.schema_path, self.param_path = [
            read_yaml(yaml_file)
            for yaml_file in (config_path, schema_path, param_path)
        ]
        create_directories([self.config_path.artifacts_root])

    def get_data_datatransformation(self) -> DataTransformationConfig:
        """Build the ``DataTransformationConfig`` for the transformation stage.

        Creates the stage's ``root_dir`` as a side effect. All path values are
        forwarded exactly as they appear in the ``data_transformation``
        section; the target column name comes from ``TARGET_COLUMN.name`` in
        the schema.
        """
        section = self.config_path.data_transformation
        target = self.schema_path.TARGET_COLUMN
        create_directories([section.root_dir])

        forwarded_fields = (
            "root_dir",
            "local_data_file",
            "train_csv",
            "test_csv",
            "X_train_trans",
            "X_test_trans",
            "y_train",
            "y_test",
        )
        return DataTransformationConfig(
            **{field: getattr(section, field) for field in forwarded_fields},
            target_column=target.name,
        )
        
        
        
        

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scipy.sparse import save_npz
import numpy as np

In [10]:
from mlProject.Logging import get_logger
logger=get_logger(__name__)
import joblib

In [11]:
class DataTransformation:
    """Turn the raw dataset into model-ready train/test artifacts.

    Stages, in the order the notebook runs them:

    1. ``train_test_split`` - read the raw CSV, drop unused columns, split,
       impute missing values and write the train/test CSVs.
    2. ``transformed`` - scale numeric features, one-hot encode categorical
       ones and dump the transformed features and the labels with joblib.
    """

    # Columns removed from the raw data before any further processing.
    _DROP_COLUMNS = ("City", "Signup_Quarter")

    # Column -> name of the pandas Series statistic used to fill its NaNs.
    _IMPUTE_STRATEGY = {
        "Age": "median",
        "Session_Duration_Avg": "mean",
        "Pages_Per_Session": "mean",
        "Wishlist_Items": "mean",
        "Days_Since_Last_Purchase": "median",
        "Discount_Usage_Rate": "mean",
        "Returns_Rate": "mean",
        "Email_Open_Rate": "mean",
        "Customer_Service_Calls": "mean",
        "Product_Reviews_Written": "mean",
        "Social_Media_Engagement_Score": "mean",
        "Mobile_App_Usage": "mean",
        "Payment_Method_Diversity": "mean",
        "Credit_Balance": "mean",
    }

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (paths and target column name)."""
        self.config = config

    def drop_columns(self) -> pd.DataFrame:
        """Read ``config.local_data_file`` and return it without the unused columns.

        Raises:
            KeyError: if one of the columns to drop is absent from the CSV.
        """
        raw = pd.read_csv(self.config.local_data_file)
        return raw.drop(columns=list(self._DROP_COLUMNS))

    def _imputation_values(self, df: pd.DataFrame) -> dict:
        """Compute the per-column fill value (mean or median) from ``df``."""
        return {
            column: getattr(df[column], statistic)()
            for column, statistic in self._IMPUTE_STRATEGY.items()
        }

    def fill_missing_data(
        self,
        df: "pd.DataFrame | None" = None,
        fill_values: "dict | None" = None,
    ) -> pd.DataFrame:
        """Fill missing values in the numeric columns listed in ``_IMPUTE_STRATEGY``.

        Args:
            df: Frame to impute. When omitted, the raw data is loaded through
                ``drop_columns`` (the original behaviour of this method).
                A supplied frame is copied, never modified in place.
            fill_values: Mapping ``column -> fill value``. When omitted, the
                statistics are computed from ``df`` itself. Pass statistics
                computed on the training split to impute a test split
                without leaking test information.

        Returns:
            The imputed DataFrame.

        Raises:
            KeyError: if a column to impute is absent from the frame.
        """
        # Copy caller-owned frames so slices produced by a split are not
        # mutated (and do not trigger chained-assignment warnings).
        df = self.drop_columns() if df is None else df.copy()
        if fill_values is None:
            fill_values = self._imputation_values(df)

        for column, value in fill_values.items():
            df[column] = df[column].fillna(value)
        return df

    def train_test_split(self):
        """Split the data 80/20 and write the imputed train and test CSVs.

        The split happens *before* imputation and the fill statistics are
        computed on the training partition only, so no information from the
        test rows leaks into the training data. The row assignment is
        unchanged, because the split depends only on the row count and the
        fixed ``random_state``.
        """
        df = self.drop_columns()
        # Inside a method body this name resolves to the module-level
        # sklearn function, not to this method.
        train, test = train_test_split(df, test_size=0.2, random_state=42)

        fill_values = self._imputation_values(train)
        train = self.fill_missing_data(train, fill_values)
        test = self.fill_missing_data(test, fill_values)

        train.to_csv(self.config.train_csv, index=False)
        test.to_csv(self.config.test_csv, index=False)

        logger.info("data splitted in to training and test sets")

    def _feature_columns(self, df: pd.DataFrame) -> tuple:
        """Return ``(numerical_cols, categorical_cols)`` of ``df`` without the target.

        The target column is removed up front, whatever its dtype, so a
        non-numeric label can neither raise nor end up among the categorical
        features. ``df`` may also be a feature-only frame.
        """
        features = df.drop(columns=[self.config.target_column], errors="ignore")
        numerical_cols = features.select_dtypes(include=['int', 'float']).columns.tolist()
        cat_cols = features.select_dtypes(include=['object']).columns.tolist()
        # NOTE(review): columns of any other dtype (e.g. bool) are in neither
        # list and are therefore dropped by the ColumnTransformer - confirm
        # that this is intended for the dataset at hand.
        return numerical_cols, cat_cols

    def get_preprocessor(self, df: pd.DataFrame):
        """Build the (unfitted) ColumnTransformer for the columns of ``df``.

        Numeric columns are standardised; object columns are one-hot encoded.

        Args:
            df: Frame whose dtypes decide which columns are numeric or
                categorical; the target column is ignored if present.

        Returns:
            An unfitted ``ColumnTransformer``.
        """
        num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
        # handle_unknown="ignore": a category seen only at transform time
        # (test split, inference) is encoded as all zeros instead of raising.
        cat_pipeline = Pipeline(
            steps=[('onehot', OneHotEncoder(handle_unknown="ignore"))]
        )

        numerical_cols, cat_cols = self._feature_columns(df)

        return ColumnTransformer(
            transformers=[
                ("num", num_pipeline, numerical_cols),
                ('cat', cat_pipeline, cat_cols),
            ]
        )

    def transformed(self):
        """Fit the preprocessor on the train CSV and dump all model inputs.

        Reads the CSVs written by ``train_test_split``, fits the preprocessor
        on the training features only, applies it to both splits and writes
        the transformed features and the labels with joblib to the paths in
        the configuration.
        """
        train_df = pd.read_csv(self.config.train_csv)
        test_df = pd.read_csv(self.config.test_csv)
        target = self.config.target_column

        preprocessor = self.get_preprocessor(train_df)

        X_train = train_df.drop(columns=[target])
        y_train = train_df[target]
        X_test = test_df.drop(columns=[target])
        y_test = test_df[target]

        # Fit on train only; the test split is transformed with train statistics.
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        # NOTE(review): the fitted preprocessor itself is not persisted, so
        # new data cannot be transformed identically later. The config has
        # no path for it yet - consider adding one.
        joblib.dump(X_train_transformed, self.config.X_train_trans)
        joblib.dump(X_test_transformed, self.config.X_test_trans)
        joblib.dump(y_train, self.config.y_train)
        joblib.dump(y_test, self.config.y_test)
       
        
        
        
    
        

In [12]:
# Resolve the transformation settings from the project's YAML configuration.
obj_manager = ConfigurationManager()
data_tranformation_config = obj_manager.get_data_datatransformation()

# Run the stage: write the train/test CSVs first, then the transformed artifacts.
obj_trans = DataTransformation(config=data_tranformation_config)
obj_trans.train_test_split()
obj_trans.transformed()

2026-01-14 12:37:34,575 | INFO | mlProject.utils.common | common.py:31 | yaml file: config\config.yaml loaded successfully
2026-01-14 12:37:34,590 | INFO | mlProject.utils.common | common.py:31 | yaml file: schema.yaml loaded successfully
2026-01-14 12:37:34,600 | INFO | mlProject.utils.common | common.py:31 | yaml file: params.yaml loaded successfully
2026-01-14 12:37:34,602 | INFO | mlProject.utils.common | common.py:51 | created directory at: artifacts
2026-01-14 12:37:34,604 | INFO | mlProject.utils.common | common.py:51 | created directory at: artifacts/data_transformation
2026-01-14 12:37:35,160 | INFO | __main__ | 2465165239.py:39 | data splitted in to training and test sets
