In [1]:
import os

os.chdir("../../")
%pwd

'c:\\Users\\anfe1\\OneDrive\\Escritorio\\Instaleap\\Instamarket'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreparationConfig:
    root_dir: Path
    data_path: Path

In [5]:
from instamarket.constants import CONFIG_FILE_PATH
from instamarket.utils.common import read_yaml, create_directories

class ConfigurationManager:
    def __init__(self) -> None:
        config_file_path = CONFIG_FILE_PATH

        self.config = read_yaml(config_file_path)

        create_directories([self.config.artifacts_root])
    
    def get_data_preparation_config(self) -> DataPreparationConfig:
        config = self.config.data_preparation

        create_directories([config.root_dir])

        data_preparation_config = DataPreparationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path
        )

        return data_preparation_config

In [12]:
import pandas as pd

from sklearn.model_selection import train_test_split

from instamarket.logging import logger

class DataPreparation:
    def __init__(self, config:DataPreparationConfig) -> None:
        self.config = config

    def prepare_data(self) -> pd.DataFrame:
        logger.info("Read the dataset as dataframe")
        df = pd.read_csv(self.config.data_path)

        logger.info("Converting picking times to datetime DType")
        datetime_cols = ["optimal_start_time_picking",
                         "optimal_end_time_picking"]
        df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime, format="mixed")

        df_prepared = df[["store_id","optimal_total_time","start_delay","end_delay"]]

        df_prepared['optimal_start_day'] = df['optimal_start_time_picking'].dt.day
        df_prepared['optimal_start_hour'] = df['optimal_start_time_picking'].dt.hour
        df_prepared['optimal_start_minute'] = df['optimal_start_time_picking'].dt.minute
        df_prepared['optimal_start_weekday'] = df['optimal_start_time_picking'].dt.weekday
        df_prepared['optimal_start_is_weekend'] = df_prepared['optimal_start_weekday'].isin([5, 6])

        df_prepared['optimal_end_day'] = df['optimal_end_time_picking'].dt.day
        df_prepared['optimal_end_hour'] = df['optimal_end_time_picking'].dt.hour
        df_prepared['optimal_end_minute'] = df['optimal_end_time_picking'].dt.minute
        df_prepared['optimal_end_weekday'] = df['optimal_end_time_picking'].dt.weekday
        df_prepared['optimal_end_is_weekend'] = df_prepared['optimal_end_weekday'].isin([5, 6])

        return df_prepared

    def split_data(self):
        df = self.prepare_data()

        logger.info("Train Test split initiated")
        train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

        train_set.to_csv(os.path.join(self.config.root_dir,"train.csv"), index=False, header=True)
        test_set.to_csv(os.path.join(self.config.root_dir,"test.csv"), index=False, header=True)
        logger.info("Train Test split saved")

In [13]:
try:
    config = ConfigurationManager()
    data_preparation_config = config.get_data_preparation_config()
    data_preparation = DataPreparation(config=data_preparation_config)
    data_preparation.split_data()
except Exception as e:
    raise e

[2024-04-20 01:44:28,759] 28 common - INFO - yaml file config\config.yml loaded successfully
[2024-04-20 01:44:28,759] 46 common - INFO - Created directory at: artifacts
[2024-04-20 01:44:28,759] 46 common - INFO - Created directory at: artifacts/data_preparation
[2024-04-20 01:44:28,759] 12 2142308596 - INFO - Read the dataset as dataframe
[2024-04-20 01:44:29,292] 15 2142308596 - INFO - Converting picking times to datetime DType
[2024-04-20 01:44:29,708] 39 2142308596 - INFO - Train Test split initiated


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prepared.loc[:, 'optimal_start_day'] = df.loc[:, 'optimal_start_time_picking'].dt.day


[2024-04-20 01:44:32,077] 44 2142308596 - INFO - Train Test split saved
