In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
import logging
import os, sys
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
# Set max rows and columns to display
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Configure logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

logger.info("Imported libraries and configured logging.")

2024-09-24 19:16:13,400 - INFO - Imported libraries and configured logging.


In [6]:
# Import load_data
from load_data import load_data
if __name__ == "__main__":
    # Define file paths and names
    zip_file_path = '../Data/rossmann-store-sales.zip'
    extract_to_folder = '../Data/'
    train = 'train.csv'  # Replace with the actual file name if different
    test = 'test.csv'
    store = 'store.csv'

    # Load the dataset
    try:
        train_data = load_data(zip_file_path, train, extract_to_folder)
        test_data = load_data(zip_file_path, test, extract_to_folder)
        store_data =  load_data(zip_file_path, store, extract_to_folder)
        print("Data successfully loaded.")
        display(train_data.head())
    except FileNotFoundError as e:
        print(e)
logger.info("Data loaded successfully.")

  return pd.read_csv(file_path, index_col=0)


Data successfully loaded.


Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5,2015-07-31,5263,555,1,1,0,1
2,5,2015-07-31,6064,625,1,1,0,1
3,5,2015-07-31,8314,821,1,1,0,1
4,5,2015-07-31,13995,1498,1,1,0,1
5,5,2015-07-31,4822,559,1,1,0,1


2024-09-24 19:17:28,961 - INFO - Data loaded successfully.


In [7]:
# Explore the testing data
test_data.head()

Unnamed: 0_level_0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,4,2015-09-17,1.0,1,0,0
2,3,4,2015-09-17,1.0,1,0,0
3,7,4,2015-09-17,1.0,1,0,0
4,8,4,2015-09-17,1.0,1,0,0
5,9,4,2015-09-17,1.0,1,0,0


In [8]:
# Explore the store dataset
store_data.head()

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,c,a,1270.0,9.0,2008.0,0,,,
2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
4,c,c,620.0,9.0,2009.0,0,,,
5,a,a,29910.0,4.0,2015.0,0,,,


In [9]:
logger.info("Merge the store data with train and test dataset")
# Merge store and train data
_train_data = train_data.merge(store_data, on='Store', how='left')
# Merege store and test data
_test_data = test_data.merge(store_data, on='Store', how='left')

2024-09-24 19:17:59,594 - INFO - Merge the store data with train and test dataset


In [10]:
# Info the train data
_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1017209 entries, 1 to 1115
Data columns (total 17 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   DayOfWeek                  1017209 non-null  int64  
 1   Date                       1017209 non-null  object 
 2   Sales                      1017209 non-null  int64  
 3   Customers                  1017209 non-null  int64  
 4   Open                       1017209 non-null  int64  
 5   Promo                      1017209 non-null  int64  
 6   StateHoliday               1017209 non-null  object 
 7   SchoolHoliday              1017209 non-null  int64  
 8   StoreType                  1017209 non-null  object 
 9   Assortment                 1017209 non-null  object 
 10  CompetitionDistance        1014567 non-null  float64
 11  CompetitionOpenSinceMonth  693861 non-null   float64
 12  CompetitionOpenSinceYear   693861 non-null   float64
 13  Promo2              

In [23]:

logger.info("Checking statistical summary of numerical data.")
# Statistical summary of numerical data
display(train_data.describe())
display(test_data.describe())

2024-09-24 19:46:19,252 - INFO - Checking statistical summary of numerical data.


Unnamed: 0,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1014567.0,693861.0,693861.0,1017209.0,509178.0,509178.0
mean,3.998341,5773.819,633.1459,0.8301067,0.3815145,0.1786467,5430.086,7.222866,2008.690228,0.5005638,23.269093,2011.752774
std,1.997391,3849.926,464.4117,0.3755392,0.4857586,0.3830564,7715.324,3.211832,5.992644,0.4999999,14.095973,1.66287
min,1.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,1.0,2009.0
25%,2.0,3727.0,405.0,1.0,0.0,0.0,710.0,4.0,2006.0,0.0,13.0,2011.0
50%,4.0,5744.0,609.0,1.0,0.0,0.0,2330.0,8.0,2010.0,1.0,22.0,2012.0
75%,6.0,7856.0,837.0,1.0,1.0,0.0,6890.0,10.0,2013.0,1.0,37.0,2013.0
max,7.0,41551.0,7388.0,1.0,1.0,1.0,75860.0,12.0,2015.0,1.0,50.0,2015.0


Unnamed: 0,Store,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
count,41088.0,41088.0,41077.0,41088.0,41088.0,40992.0,25872.0,25872.0,41088.0,23856.0,23856.0
mean,555.899533,3.979167,0.854322,0.395833,0.443487,5088.583138,7.03525,2008.641929,0.580607,24.426559,2011.820926
std,320.274496,2.015481,0.352787,0.489035,0.496802,7225.487467,3.143015,6.8624,0.493466,14.161312,1.692166
min,1.0,1.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,1.0,2009.0
25%,279.75,2.0,1.0,0.0,0.0,720.0,4.0,2006.0,0.0,13.0,2011.0
50%,553.5,4.0,1.0,0.0,0.0,2425.0,7.0,2010.0,1.0,22.0,2012.0
75%,832.25,6.0,1.0,1.0,1.0,6480.0,9.0,2012.0,1.0,37.0,2013.0
max,1115.0,7.0,1.0,1.0,1.0,75860.0,12.0,2015.0,1.0,49.0,2015.0


In [24]:
# Check the shape of the dataset
print(f'Shape of training dataset:{_train_data.shape}')
print(f'Shape of testing dataset:{_test_data.shape} ')

Shape of training dataset:(1017209, 17)
Shape of testing dataset:(41088, 16) 


In [25]:
logger.info("Checking the types of both test and train data")
from data_processing import DataProcessing

# Create instance of the class
train_data = _train_data.copy()
test_data = _test_data.copy()
process = DataProcessing(test_data, train_data)
# Check the types of both test and train data
process.check_data_types()

2024-09-24 19:46:25,043 - INFO - Checking the types of both test and train data


TypeError: DataProcessing.__init__() takes 2 positional arguments but 3 were given