In [1]:
import os

In [2]:
%pwd

'd:\\Full Stack Data Science\\Time Series Analysis\\MAJOR PROJECT\\SMDF\\research'

In [3]:
cd ..

d:\Full Stack Data Science\Time Series Analysis\MAJOR PROJECT\SMDF


In [4]:
%pwd

'd:\\Full Stack Data Science\\Time Series Analysis\\MAJOR PROJECT\\SMDF'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    sales_data: Path

In [6]:
from SMDF.constants import *
from SMDF.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            sales_data=config.sales_data
        )

        return data_transformation_config

In [12]:
import os
from SMDF.logging import logger
import pandas as pd

In [13]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    
    def get_data_transform(self):
        data_1 = pd.read_csv(self.config.data_path)
        sales = pd.read_csv(self.config.sales_data)
        sales[["datum","M01AB","M01AE","N02BA","N02BE","N05B","N05C","R03","R06"]] = sales[["datum","mo1ab","mo1ae","no2ba","no2be","no5b","no5c","r06","ro3"]]
        data=pd.concat([data_1,sales])
        missing = data.isnull().sum()
        logger.info(missing)
        num_column =["M01AB","M01AE","N02BA","N02BE","N05B","N05C","R03","R06"]
        data[num_column].fillna("median",inplace=True)
        logger.info("filling Missing value done sucessfully !")
        data.set_index("datum",inplace=True)
        dict_lower = {}
        dict_upper = {}
        targets = ["M01AB","M01AE","N02BA","N02BE","N05B","N05C","R03","R06"]
        for i,var in enumerate(targets):
            irq = data[var].quantile(0.75) - data[var].quantile(0.25)
            lower_bridge = data[var].quantile(0.25) -(irq*1.5)
            upper_bridge = data[var].quantile(0.75) + (irq*1.5)
            print(f"Lower bound of {targets[i]}:{lower_bridge}")
            print(f"Upper bound of {targets[i]}:{upper_bridge}")
            print("**"*20)
            dict_lower[targets[i]] = [lower_bridge]
            dict_upper[targets[i]] = [upper_bridge]
        
        logger.info(f"suceesfully find upper and Lower bound")
        outlier_clmn = [] 
        for i in targets:
            x = [int(x) for x in dict_lower[i]]
            y = [int(y) for y in dict_upper[i]]
            
            if (x[0] and y[0]) > data[i].min():
                outlier_clmn.append(i)
        for i in outlier_clmn:
            x = [int(x) for x in dict_upper[i]]
            data.loc[data[i] >= x[0],i] = int(x[0])

        return data.to_csv("artifacts/data_transformation/salesDaily.csv",index=True)
        
        
    
    


In [15]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.get_data_transform()
except Exception as e:
    raise e

[2023-07-05 11:02:44,053: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-07-05 11:02:44,068: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-05 11:02:44,079: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-07-05 11:02:44,082: INFO: common: created directory at: artifacts]
[2023-07-05 11:02:44,085: INFO: common: created directory at: artifacts/data_transformation]


[2023-07-05 11:02:44,298: INFO: 3265334080: datum              0
M01AB              0
M01AE              0
N02BA              0
N02BE              0
N05B               0
N05C               0
R03                0
R06                0
Year               1
Month              1
Hour               1
Weekday Name       1
mfpid           2106
mo1ab           2106
mo1ae           2106
no2ba           2106
no2be           2106
no5b            2106
no5c            2106
r06             2106
ro3             2106
user_id         2106
dtype: int64]
[2023-07-05 11:02:44,304: INFO: 3265334080: filling Missing value done sucessfully !]
Lower bound of M01AB:-2.505
Upper bound of M01AB:12.175
****************************************
Lower bound of M01AE:-1.8539999999999992
Upper bound of M01AE:9.329999999999998
****************************************
Lower bound of N02BA:-2.8000000000000007
Upper bound of N02BA:10.0
****************************************
Lower bound of N02BE:-9.949999999999996
Upper b

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[num_column].fillna("median",inplace=True)
