# Data Cleaning
The data cleaning process will involve building pipelines to detect and handle outlier and missing data. This is particularly important because you don’t want to skew our analysis. 

In [3]:
#import
import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt


In [38]:
#configuration
sys.path.append(os.path.abspath(os.path.join('..')))
sns.set()

In [28]:
from src.data.manipulate import Manipulate

In [22]:
#read raw store data set
store_df = pd.read_csv('../data/raw/store.csv')
store_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


In [23]:
#read raw test data set
test_df = pd.read_csv('../data/raw/test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date           41088 non-null  object 
 4   Open           41077 non-null  float64
 5   Promo          41088 non-null  int64  
 6   StateHoliday   41088 non-null  object 
 7   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB


In [24]:
#read train data set
train_df = pd.read_csv('../data/raw/train.csv', low_memory=False)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date           41088 non-null  object 
 4   Open           41077 non-null  float64
 5   Promo          41088 non-null  int64  
 6   StateHoliday   41088 non-null  object 
 7   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB


In [33]:
manipulate_store = Manipulate(store_df)
manipulate_train  = Manipulate(train_df)
manipulate_test = Manipulate(test_df)

## Handle missing data

1. Handle missing data in store data sets

In [30]:
#missing data count
store_df.isna().sum()

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [34]:
# Fill missing numeric values
manipulate_store.fill_columns_with_max(store_df.select_dtypes(exclude=['object']).columns.tolist())
#Fill missing non-numeric (categorical values)
manipulate_store.fill_columns_with_most_frequent(store_df.select_dtypes(include=['object']).columns.tolist())

In [35]:
store_df.isna().sum()

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
dtype: int64

2. Handle missing data in test data sets

In [17]:
# missing data count
test_df.isna().sum()

Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

In [36]:

# Fill missing numeric values
manipulate_test.fill_columns_with_max(test_df.select_dtypes(exclude=['object']).columns.tolist())
#Fill missing non-numeric (categorical values)
manipulate_test.fill_columns_with_most_frequent(test_df.select_dtypes(include=['object']).columns.tolist())

In [37]:
test_df.isna().sum()

Id               0
Store            0
DayOfWeek        0
Date             0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

3. Handle missing data in the train data sets

In [18]:
train_df.isna().sum()

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

There no missing data in the train data sets

## Save clean data

In [39]:
store_df.to_csv('../data/clean/store.csv')
train_df.to_csv('../data/clean/train.csv')
test_df.to_csv('../data/clean/test.csv')