__Project:__ Simple DNN and CNN <br>
__Sub-project:__ DNN Regression with Keras <br>
__Experiment:__ Data Preparation & Regression on Tabular Data (Big Mart Sales Prediction) <br>
__Developed by:__ Amir Hossini <br>
__Development date:__ Oct 9, 2021 <br>

![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)

#### Libraries

In [14]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Record the run environment next to the results: the scikit-learn and
# TensorFlow versions, and how many GPU devices TensorFlow lists.
print(f'SciKitLearn Version: {sklearn.__version__}')
print(f'Tensorflow Version: {tf.__version__}')
print('Number of available GPUs: {}'.format(len(tf.config.list_physical_devices('GPU'))))

SciKitLearn Version: 1.0
Tensorflow Version: 2.5.0
Number of available GPUs: 1


#### I/O Files & Params

In [18]:
# Input CSV locations (relative to the notebook's working directory).
fl_train = './datasets/big_mart_sales/train.csv'
fl_test = './datasets/big_mart_sales/test.csv'

# Column roles used by the preprocessing helpers.
excl_list = ['Item_Identifier']
ordinal_list = ['Outlet_Location_Type']
nominal_list = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Type',
]

# Per-column {label: rank} tables for the ordinal columns; the rank of each
# tier is simply its position in the ordered tuple below.
ordinal_labels_dict = {
    'Outlet_Location_Type': {
        tier: rank for rank, tier in enumerate(('Tier 1', 'Tier 2', 'Tier 3'))
    },
}

# Collapses the spelling variants of the fat-content label to two codes.
item_fat_content_map = {
    'Low Fat': 'LF',
    'Regular': 'Reg',
    'low fat': 'LF',
    'LF': 'LF',
    'reg': 'Reg',
}

# Regression target column(s).
target_var = ['Item_Outlet_Sales']

# Proportion of the training data held out for validation.
val_split = 0.3

#### Functions

In [16]:
def prep_ordinal(dataframe,columns,ordinal_labels_dict):
    """Encode ordinal categorical columns as numeric ranks.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied; the caller's object is not modified.
    columns : iterable of str
        Names of the ordinal columns to encode.
    ordinal_labels_dict : dict
        Maps each column name to a ``{label: rank}`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` in which every listed column holds the ranks
        looked up from its label dictionary.

    Raises
    ------
    KeyError
        If a column is missing from ``dataframe`` or has no entry in
        ``ordinal_labels_dict``, or if it contains a value (including a
        missing value) that is absent from its label dictionary.
    """
    df = dataframe.copy()
    for col in columns:
        label_dict = ordinal_labels_dict[col]

        # Fail loudly on labels without a rank instead of letting ``map``
        # turn them into NaN silently.
        unknown = ~df[col].isin(list(label_dict))
        if unknown.any():
            bad_values = df.loc[unknown, col].unique().tolist()
            raise KeyError(
                f'Column {col!r} contains values with no ordinal label: {bad_values}'
            )

        # Replace the whole column rather than writing through
        # ``df.loc[:, col]``: an in-place write keeps the original (object)
        # dtype, whereas a column replacement takes the dtype of the ranks.
        df[col] = df[col].map(label_dict)
    return df
    
def prep_nominal(dataframe,columns):
    """One-hot encode nominal columns, omitting one level per column.

    For each name in ``columns`` the column is expanded with
    ``pd.get_dummies`` (prefix = column name), the last dummy column is
    discarded, the source column is removed, and the remaining dummies are
    appended on the right. The input frame itself is left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the nominal columns.
    columns : iterable of str
        Names of the columns to encode, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``dataframe``.
    """
    encoded = dataframe.copy()
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # Keep every indicator except the last one.
        kept = indicators.iloc[:, :-1]
        remainder = encoded.drop(columns=[name])
        encoded = pd.concat([remainder, kept], axis=1)
    return encoded

def prep_normalize(dataframe):
    """Rescale every column with a freshly fitted ``MinMaxScaler``.

    A new ``MinMaxScaler`` (default settings) is fitted on the given frame
    on each call, so the scaling parameters come from this frame alone.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns can all be handled by ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels and the same index as
        ``dataframe``. The input frame is not modified.
    """
    df = dataframe.copy()
    scaled_values = MinMaxScaler().fit_transform(df)
    # ``fit_transform`` returns a bare array; re-attach both the columns and
    # the original index so rows stay aligned with any data keyed on it.
    df_scaled = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    return df_scaled

def split_data(dataframe,target_col,test_split_prop=0.2,random_state=None):
    """Split a frame into train/test features and targets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the feature columns and the target column(s).
    target_col : str or list of str
        Column label(s) to use as the target; all other columns are features.
    test_split_prop : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split`` as ``random_state``. Supply an integer
        to get the same split on every run; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # ``drop`` and column selection already return new objects, so the input
    # frame is left untouched without an explicit copy.
    X = dataframe.drop(columns=target_col)
    y = dataframe[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split_prop, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

#### Read In & Stats

In [4]:
# Load the raw train and test tables from the configured CSV paths.
train = pd.read_csv(filepath_or_buffer=fl_train)
test = pd.read_csv(filepath_or_buffer=fl_test)

# Number of rows in each table.
print(f'Train Size: {len(train)}')
print(f'Test Size: {len(test)}')

# Per-column share of missing values, expressed in percent.
print('\nPercentage of missing: Train')
print(train.isna().sum().div(len(train)).mul(100))

print('\nPercentage of missing: Test')
print(test.isna().sum().div(len(test)).mul(100))

Train Size: 8523
Test Size: 5681

Percentage of missing: Train
Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

Percentage of missing: Test
Item_Identifier               0.000000
Item_Weight                  17.180074
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.269671
Outlet_Location_Type          0.000000
Outlet_Type                   0.000

#### Missing data handling & cleaning

In [5]:
def _clean_frame(frame, item_weight_fill):
    """Return a cleaned copy of ``frame``.

    Fills missing ``Item_Weight`` with ``item_weight_fill`` and missing
    ``Outlet_Size`` with ``'Unknown'``, normalises ``Item_Fat_Content`` via
    ``item_fat_content_map``, and removes the columns named in ``excl_list``.
    An ``Item_Fat_Content`` value that is neither a key nor a value of
    ``item_fat_content_map`` raises ``KeyError``.
    """
    # Let the canonical codes map to themselves so that running this cell a
    # second time (when labels are already normalised) does not fail.
    fat_labels = {code: code for code in item_fat_content_map.values()}
    fat_labels.update(item_fat_content_map)

    cleaned = frame.copy()
    cleaned['Item_Weight'] = cleaned['Item_Weight'].fillna(item_weight_fill)
    cleaned['Outlet_Size'] = cleaned['Outlet_Size'].fillna('Unknown')
    cleaned['Item_Fat_Content'] = cleaned['Item_Fat_Content'].map(lambda x: fat_labels[x])
    # errors='ignore' keeps the cell re-runnable once the columns are gone.
    return cleaned.drop(columns=excl_list, errors='ignore')


# The imputation value is learned from the training data only and reused for
# the test set, so no test-set statistic leaks into preprocessing.
item_weight_mean = train['Item_Weight'].mean()

train = _clean_frame(train, item_weight_mean)
test = _clean_frame(test, item_weight_mean)

#### Feature Engineering

In [7]:
# Apply the three preprocessing stages to each table, innermost call first:
# ordinal encoding -> one-hot encoding -> min-max scaling.
# NOTE(review): the two tables are processed independently, so the dummy
# columns and the scaling parameters are derived separately for train and
# test -- confirm this is intended and that the resulting columns line up.
train = prep_normalize(
    prep_nominal(
        prep_ordinal(train, ordinal_list, ordinal_labels_dict),
        nominal_list,
    )
)

test = prep_normalize(
    prep_nominal(
        prep_ordinal(test, ordinal_list, ordinal_labels_dict),
        nominal_list,
    )
)

In [23]:
# Hold out `val_split` of the prepared training table for validation.
X_train, X_val, y_train, y_val = split_data(
    train,
    target_col=target_var,
    test_split_prop=val_split,
)
# Number of input features = number of columns in the feature matrix.
n_features = X_train.shape[1]