In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np
import warnings
import sys

In [None]:
# Silence every warning for the rest of the kernel session. With no category
# or module argument the 'ignore' filter matches all warnings, so deprecation
# and dtype notices from the imported libraries are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Make the project's helper modules importable. '../scripts' is a relative
# entry, so it is resolved against the notebook's working directory.
sys.path.insert(1, '../scripts')

from data_viz import Data_Viz
from data_cleaning import DataCleaner
from data_transformation import DataTransformer

# One shared instance of each helper, reused by the cells below:
#   DV - visualisation / summaries, DC - cleaning, DT - transformation.
DV, DC, DT = Data_Viz(), DataCleaner(), DataTransformer()


In [None]:
# Load the four raw CSV files from ../data (paths are relative to the
# notebook's working directory). The files are read in the order listed and
# bound to `sample`, `store`, `test` and `train` respectively.
sample, store, test, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/sample_submission.csv",
        "../data/store.csv",
        "../data/test.csv",
        "../data/train.csv",
    )
)

In [None]:
# Inspect the sample-submission frame: print its (rows, columns) shape, then
# show the summary produced by the project's Data_Viz.summ_columns helper
# (presumably a per-column overview - see ../scripts/data_viz).
print(sample.shape)
DV.summ_columns(sample)

In [None]:
# Inspect the raw store frame: shape first, then the Data_Viz column summary.
print(store.shape)
DV.summ_columns(store)

In [None]:
# Check whether the missing Promo2 date fields occur only for stores where
# Promo2 is 0: keep just the stores with Promo2 == 1 and summarise them.

promo2_1_df = store.loc[store['Promo2'].eq(1)]
DV.summ_columns(promo2_1_df)

In [None]:
# Frequency of each PromoInterval value (value_counts leaves NaN out by default).
store['PromoInterval'].value_counts()

In [None]:
# Build `store_clean` as a new frame (the raw `store` is left untouched) in
# which the missing Promo2 fields are filled: 0 for the week / year columns
# and the string 'NA' for the interval column.
# NOTE(review): 'NA' is one of the default missing-value markers of
# pd.read_csv, so after this data is written to CSV and read back these cells
# load as NaN again unless the reader passes keep_default_na=False - confirm
# how downstream notebooks load the file.
store_clean = store.assign(
    Promo2SinceWeek=store['Promo2SinceWeek'].fillna(0),
    Promo2SinceYear=store['Promo2SinceYear'].fillna(0),
    PromoInterval=store['PromoInterval'].fillna('NA'),
)

In [None]:
# Remove columns with more than 30% missing values.
# NOTE(review): the second argument (30) is presumably the missing-value
# percentage threshold - confirm against DataCleaner.reduce_dim_missing.
# `store_clean` is rebound to the helper's return value.

store_clean = DC.reduce_dim_missing(store_clean, 30)

In [None]:
# Re-run the column summary on the cleaned store frame to verify the result
# of the fill and column-reduction steps above.

DV.summ_columns(store_clean)

In [None]:
# Attach the cleaned store attributes to the train and test frames.
# A left join on "Store" keeps every row of the left-hand frame, including
# any whose store id has no match in `store_clean`.
train_store = pd.merge(left=train, right=store_clean, on="Store", how='left')
test_store = pd.merge(left=test, right=store_clean, on='Store', how='left')

In [None]:
# Inspect the merged training frame (train joined with store_clean):
# shape first, then the Data_Viz column summary.
print(train_store.shape)
DV.summ_columns(train_store)

In [None]:
# Inspect the merged test frame (test joined with store_clean):
# shape first, then the Data_Viz column summary.
print(test_store.shape)
DV.summ_columns(test_store)

In [None]:
# Check the 'Sales' and 'Customers' columns of the merged training frame for
# outliers using the project's plot_box2 helper (presumably one box plot per
# listed column - see ../scripts/data_viz).

DV.plot_box2(train_store, ['Sales', 'Customers'])

**Observation**
- `Sales` and `Customers` both contain outliers; they are handled by the cleaning pipeline below.

In [None]:
# Cleaning pipeline built from DataCleaner helpers wrapped in
# FunctionTransformer, so each step is simply called on the frame it receives.
# Steps run in the order listed:
#   1. fill missing values in numerical columns (fill_missing_by_median)
#   2. fill missing values in categorical columns (fill_missing_by_mode)
#   3. treat outliers in 'Sales' and 'Customers' (fill_outliers_mean)
# NOTE(review): the exact fill / replacement rules live in
# ../scripts/data_cleaning - the descriptions above follow the helper names.
pipe = Pipeline([
    ("Handle numerical missing values", FunctionTransformer(func=DC.fill_missing_by_median)),
    ("Handle categorical missing values", FunctionTransformer(func=DC.fill_missing_by_mode)),
    ("Handle outlier", FunctionTransformer(func=DC.fill_outliers_mean,
                                           kw_args=dict(cols=['Sales', 'Customers']))),
])

In [None]:
# Run the merged training frame through the cleaning pipeline and keep the
# result under a new name, so `train_store` is not rebound here.
# NOTE(review): whether the DataCleaner helpers modify their input in place is
# not visible from this notebook - confirm if `train_store` is reused later.

train_store_clean = pipe.fit_transform(train_store)

# Summarise the cleaned frame to confirm the missing values were filled.
DV.summ_columns(train_store_clean)

In [None]:
# Re-plot 'Sales' and 'Customers' on the cleaned training frame to check that
# the outlier step of the pipeline had the intended effect.

DV.plot_box2(train_store_clean, ['Sales', 'Customers'])

In [None]:
# Saving cleaned dataframes.
#
# The test split gets its own cleaned frame. Previously train_store_clean was
# written to both paths, which made ../data/test_store.csv a copy of the
# training data and left `test_store` unsaved.
#
# Only the two missing-value steps are applied to the test split. The outlier
# step is left out because it is configured for the 'Sales' and 'Customers'
# columns - NOTE(review): these are assumed to be train-only columns, confirm
# against test.csv. FunctionTransformer just calls the wrapped helper on the
# frame it is given, so the fill values are presumably derived from
# `test_store` itself.
test_pipe = Pipeline(steps=[("Handle numerical missing values", FunctionTransformer(DC.fill_missing_by_median)),
                            ("Handle categorical missing values", FunctionTransformer(DC.fill_missing_by_mode))])
test_store_clean = test_pipe.fit_transform(test_store)

# index = False keeps the DataFrame index out of the files.
train_store_clean.to_csv("../data/train_store.csv", index = False)
test_store_clean.to_csv("../data/test_store.csv", index = False)