In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
# Work from the project root so the relative config / artifacts paths resolve.
# A bare os.chdir("../") is not idempotent: re-running this cell would climb one
# more directory each time. params.yaml is loaded through a relative path once
# the notebook sits in the root, so its presence is used as the "already there"
# marker (assumes the notebook's own folder holds no params.yaml - TODO confirm).
if not os.path.exists("params.yaml"):
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\Amit\\Notebooks\\Alianz_Assignment\\E2E_Term_Scheme'

In [35]:
# entity

from dataclasses import dataclass
from pathlib import Path

In [53]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Directory into which the stage writes train.csv and test.csv.
    root_dir: Path
    # Location of the raw ';'-separated CSV that the stage reads and splits.
    data_path: Path

In [37]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml,create_directories

In [54]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the three YAML files are read and the top-level artifacts
    directory named by ``config.artifacts_root`` is passed to
    ``create_directories``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,   # defaults come from mlproject.constants
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # read_yaml's result is used with attribute access below
        # (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        The stage's output directory is handed to ``create_directories`` as a
        side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are real ``Path``
            objects, matching the dataclass annotations, instead of the raw
            values stored in the YAML.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # NOTE(review): assumes data_path is a local filesystem path (as its
        # ``Path`` annotation says), not a URL - confirm against config.yaml.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )



In [39]:
import os
from mlproject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder,StandardScaler
#from mlproject.entity.config_entity import DataTransformationConfig

In [57]:
## Component
class DataTransformation:
    """Turn the raw dataset into model-ready train and test CSV files.

    The stage performs a seeded train/test split, label-encodes the target,
    standardises the numeric columns, one-hot encodes the categorical columns
    and keeps a fixed, hand-picked feature subset. Every fitted transformer
    (label encoder, scaler, dummy-column layout) is learned on the training
    split only and then applied unchanged to the test split.
    """

    # Hold-out fraction and seed used for the train/test split.
    _TEST_SIZE = 0.2
    _RANDOM_STATE = 42

    # Numeric columns standardised with a scaler fitted on the training split.
    _STD_COLS = ['age', 'duration', 'emp.var.rate', 'cons.price.idx',
                 'cons.conf.idx', 'euribor3m', 'nr.employed']

    # Categorical columns expanded into one-hot indicator columns.
    _CAT_COLS = ['job', 'marital', 'education', 'default',
                 'contact', 'month', 'day_of_week', 'poutcome']

    # Final column set (and order) written to train.csv / test.csv.
    _SELECTED_COLS = [
        'campaign_rec', 'job_admin.', 'job_entrepreneur', 'job_housemaid',
        'job_management', 'job_retired', 'job_services', 'job_student',
        'job_technician', 'job_unemployed', 'education_basic.4y',
        'education_basic.9y', 'education_high.school',
        'education_professional.course', 'education_unknown', 'default_unknown',
        'contact_cellular', 'month_apr', 'month_aug', 'month_jun', 'month_mar',
        'month_may', 'month_nov', 'day_of_week_mon', 'day_of_week_tue',
        'day_of_week_wed', 'poutcome_nonexistent', 'poutcome_success',
        'duration', 'emp.var.rate', 'cons.price.idx', 'euribor3m',
        'nr.employed', 'y_enc']

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``data_path`` in, ``root_dir`` out)."""
        self.config = config

    def _build_features(self, frame, scaler, dummy_schema=None):
        """Derive the selected feature columns for one split.

        Args:
            frame: Split to transform; must contain the numeric and categorical
                source columns, ``campaign`` and the encoded target ``y_enc``.
            scaler: Already-fitted scaler applied to the numeric columns.
            dummy_schema: ``dtypes`` Series of the training split's one-hot
                frame. When given, the one-hot columns of ``frame`` are aligned
                to it: categories missing from this split become all-zero
                columns and categories unseen in training are discarded.

        Returns:
            Tuple ``(features, dummy_dtypes)`` where ``features`` holds exactly
            the selected columns and ``dummy_dtypes`` is the one-hot layout of
            this split (pass the training one on to the test split).

        Raises:
            KeyError: If a selected column cannot be produced from ``frame``.
        """
        scaled = pd.DataFrame(
            scaler.transform(frame[self._STD_COLS]),
            columns=self._STD_COLS,
            index=frame.index,
        )

        dummies = pd.get_dummies(frame[self._CAT_COLS])
        if dummy_schema is not None:
            # Re-use the training layout so both files share identical columns
            # and dtypes even when a rare category is absent from this split.
            dummies = dummies.reindex(
                columns=dummy_schema.index, fill_value=0
            ).astype(dummy_schema.to_dict())

        # Reciprocal of the contact count. A value of 0 would yield inf;
        # presumably ``campaign`` is always >= 1 - TODO confirm.
        campaign_rec = (1 / frame['campaign']).rename('campaign_rec')

        features = pd.concat(
            [campaign_rec, dummies, scaled, frame['y_enc']], axis=1
        )
        return features[self._SELECTED_COLS], dummies.dtypes

    def processor(self):
        """Split, transform and persist the dataset.

        Reads the ';'-separated CSV at ``config.data_path``, writes
        ``train.csv`` and ``test.csv`` into ``config.root_dir`` and reports
        the resulting shapes through the logger and stdout.

        Raises:
            ValueError: If the test split contains a target label that never
                occurs in the training split.
        """
        data = pd.read_csv(self.config.data_path, sep=";")

        # Split the data into training and test sets (0.8 / 0.2 split).
        train, test = train_test_split(
            data, test_size=self._TEST_SIZE, random_state=self._RANDOM_STATE
        )

        # Fit the target encoder on the training labels only and re-use that
        # mapping for the test labels; re-fitting on test could silently
        # produce a different label -> integer mapping. ``assign`` returns new
        # frames, so the split slices are never mutated in place.
        le = LabelEncoder()
        train = train.assign(y_enc=le.fit_transform(train['y']))
        test = test.assign(y_enc=le.transform(test['y']))

        ss = StandardScaler().fit(train[self._STD_COLS])

        train, dummy_schema = self._build_features(train, ss)
        test, _ = self._build_features(test, ss, dummy_schema)

        # Persist only after both splits were built, so a failure never leaves
        # a lone train.csv behind.
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)


    #def preprocessor(self):
        


In [58]:
try:
    # The verdict is taken as the last whitespace-separated token of the status
    # file (presumably written by the data-validation stage - verify). Splitting
    # on any whitespace keeps a trailing newline from turning "True" into
    # "True\n", and an empty file is treated as "not valid".
    with open(Path("artifacts/data_validation/status.txt"), "r") as f:
        status_tokens = f.read().split()
    status = status_tokens[-1] if status_tokens else ""

    if status == "True":
        config_manager = ConfigurationManager()
        data_transformation_config = config_manager.get_data_transformation_config()
        data_transformation = DataTransformation(config=data_transformation_config)
        data_transformation.processor()
    else:
        raise Exception("You data schema is not valid")
except Exception as e:
    # Log the full traceback and propagate: printing only the message hid the
    # failure location and let the notebook continue as if the stage succeeded.
    logger.exception(e)
    raise

[2024-05-20 20:59:53,178: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-20 20:59:53,178: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-20 20:59:53,178: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-20 20:59:53,186: INFO: common: created directory at: artifacts]
[2024-05-20 20:59:53,187: INFO: common: created directory at: artifacts/data_transformation]
[2024-05-20 20:59:53,867: INFO: 3591753637: Splited data into training and test sets]
[2024-05-20 20:59:53,871: INFO: 3591753637: (32950, 34)]
[2024-05-20 20:59:53,871: INFO: 3591753637: (8238, 34)]
(32950, 34)
(8238, 34)
