In [1]:
import os

In [None]:
%pwd

In [3]:
# Step one directory up from the kernel's current working directory.
# Presumably this lands on the project root so the `src.` imports in later
# cells resolve -- confirm against the `%pwd` output of the next cell.
# NOTE: not idempotent -- every re-run of this cell climbs one more level.
os.chdir("../")

In [None]:
%pwd

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Filesystem locations consumed by the data-transformation stage."""

    # Directory that receives the stage's output files.
    root_dir: Path
    # Location of the input dataset that the stage reads.
    data_path: Path
    

In [6]:
from src.irisdataprediction.constants import *
from src.irisdataprediction.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects.

    The three YAML files (config, params, schema) are read once at
    construction time and kept on the instance as ``config``, ``params`` and
    ``schema``. The artifacts root named in the main config is passed to
    ``create_directories`` (presumably creating it if missing -- the helper
    lives outside this notebook).
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and prepare the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the main config, passes
        its ``root_dir`` to ``create_directories`` and returns the section's
        paths wrapped in a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are
            ``pathlib.Path`` objects.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        # The YAML loader yields plain values; coerce them so the dataclass
        # fields really hold the ``Path`` objects their annotations promise.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [17]:
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from src.irisdataprediction import logger
from src.irisdataprediction.exception import IrisPredictionException


In [24]:
class DataTransformation:
    """Standardise a CSV dataset, prune correlated columns and write train/test splits.

    Intended call order: ``data_standardization()`` first, then optionally
    ``remove_correlated_columns()``, then ``train_test_spliting()``. The last
    two operate on ``transformed_dataset``, which only exists after
    standardisation.

    Attributes:
        config: Stage configuration; ``data_path`` is read, ``root_dir`` is written to.
        threshold: Absolute-correlation cut-off used by ``remove_correlated_columns``.
        dataset: The raw frame loaded from ``config.data_path`` (never mutated).
        dataset_columns: Column index of the raw frame.
        transformed_dataset: Standardised frame, or ``None`` before
            ``data_standardization()`` has run.
    """

    def __init__(self, config: DataTransformationConfig, threshold=0):
        """Load the dataset named by ``config.data_path``.

        Args:
            config: Paths for this stage.
            threshold: Columns whose absolute correlation with an earlier
                column is strictly greater than this value are dropped by
                ``remove_correlated_columns``. The default of 0 drops every
                column that has any non-zero correlation with an earlier one.
        """
        self.config = config
        self.threshold = threshold
        self.dataset = pd.read_csv(self.config.data_path)
        self.dataset_columns = self.dataset.columns
        self.transformed_dataset = None

    def _require_transformed(self, caller):
        """Fail with an actionable message when standardisation has not run yet."""
        if self.transformed_dataset is None:
            raise RuntimeError(
                f"{caller}() needs the standardised dataset; "
                "call data_standardization() first."
            )

    def data_standardization(self):
        """Impute, scale and encode the raw dataset into ``transformed_dataset``.

        Numeric columns are median-imputed and standard-scaled. Object columns
        are imputed with the most frequent value and ordinal-encoded. Columns
        of any other dtype are not selected by either branch and therefore do
        not appear in the result.

        The raw ``dataset`` is left untouched, so the method can be re-run.
        """
        numeric_selector = make_column_selector(dtype_include=np.number)
        categorical_selector = make_column_selector(dtype_include=object)

        num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
        cat_pipeline = make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
        )
        preprocessing = make_column_transformer(
            (num_pipeline, numeric_selector),
            (cat_pipeline, categorical_selector),
        )

        # NOTE(review): the transformers are fitted on the full dataset before
        # the train/test split, so test rows influence the scaling statistics.
        transformed_values = preprocessing.fit_transform(self.dataset)

        # The column transformer emits its outputs grouped by transformer
        # (numeric block first, then categorical), NOT in the source column
        # order. Derive the labels from the same selectors so every name
        # stays attached to its own data.
        output_columns = numeric_selector(self.dataset) + categorical_selector(self.dataset)
        self.transformed_dataset = pd.DataFrame(data=transformed_values, columns=output_columns)

    def remove_correlated_columns(self):
        """Drop every column that is highly correlated with an earlier column.

        For each pair of numeric columns, the later one (in column order) is
        removed when the absolute value of their correlation exceeds
        ``self.threshold``. The absolute value makes strongly positive and
        strongly negative correlations count alike.

        Raises:
            RuntimeError: If ``data_standardization()`` has not been called.
        """
        self._require_transformed("remove_correlated_columns")

        # NOTE(review): after standardisation the ordinal-encoded columns are
        # numeric too, so an encoded label column would take part in this
        # pruning and could itself be dropped -- confirm that is intended.
        corr_matrix = self.transformed_dataset.select_dtypes(include=[np.number]).corr()
        logger.info("<<<<<< Data Transformation: correlation between different columns >>>>>>")
        logger.info(corr_matrix)
        logger.info("<<<<<< =========================== >>>>>>")

        # Only the strict lower triangle matters: entry (i, j) with j < i
        # compares column i against an earlier column j. NaN correlations
        # compare as False and are therefore never flagged.
        exceeds = corr_matrix.abs().to_numpy() > self.threshold
        below_diagonal = np.tril(exceeds, k=-1)
        redundant = corr_matrix.columns[below_diagonal.any(axis=1)].tolist()

        self.transformed_dataset = self.transformed_dataset.drop(columns=redundant)

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Split the transformed dataset and write ``train.csv`` / ``test.csv``.

        Both files are written into ``config.root_dir`` without the index.

        Args:
            test_size: Share of rows placed in the test set (0.25 matches the
                library default that was used implicitly before).
            random_state: Seed for the shuffle so the split is reproducible;
                pass ``None`` for a different split on every call.

        Raises:
            RuntimeError: If ``data_standardization()`` has not been called.
        """
        self._require_transformed("train_test_spliting")

        train, test = train_test_split(
            self.transformed_dataset,
            test_size=test_size,
            random_state=random_state,
        )
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splitted data into training and test sets")

    

    
        

In [None]:
# Run the data-transformation stage end to end; any failure is re-raised as
# the project's IrisPredictionException.
try:
    config=ConfigurationManager()
    data_transformation_config= config.get_data_transformation_config()
    data_transformation=DataTransformation(config=data_transformation_config,threshold=0.87)
    # Standardise first: remove_correlated_columns() reads
    # `transformed_dataset`, which is None until data_standardization() has run.
    data_transformation.data_standardization()
    data_transformation.remove_correlated_columns()
    data_transformation.train_test_spliting()
except Exception as e:
    # Chain explicitly so the original traceback stays attached as the cause.
    raise IrisPredictionException(e, sys) from e