# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set(style="whitegrid")
sns.set_context("notebook")

In [2]:
from IPython.display import Markdown, display

OUT = "output"
import os
os.makedirs(OUT, exist_ok=True)
SHOW_PLOTS = True
TEST_TRAIN_SEED = 666

def summarise_dataframe(df):
    display(Markdown("\n**Dataframe (%s x %s)**\n" % df.shape), df.head())
    display(Markdown("\n**Missing Values**"), df.isna().sum())
    display(Markdown("\n**dtypes**"), df.dtypes)

In [3]:
display_fraction = lambda n,d: (n/d*100, n, d)

# Load and Prepare the Data

In [4]:
FORCE = True 
DEBUG = False
if not os.path.isdir("data"): os.makedirs("data", exist_ok=True)
    
df_test = pd.read_csv("src/df_test.csv")
summarise_dataframe(df_test)


**Dataframe (4000 x 13)**


Unnamed: 0,index,Product_Code,Product_Weight,Product_Fat_Content,Product_Salt_Content,Product_Visibility,Product_Type,Product_MRP,Store_Code,Store_Opened,Store_Size,Store_Location_Type,Store_Type
0,2194,FOOD-56-36,7.72,Regular,Regular,0.228457,Fruits and Vegetables,88.49825,S-0522,2009,Medium,Tier 3,Supermarket Type2
1,2916,FOOD-63-02,,High Salt,Regular,0.315803,Fruits and Vegetables,212.42125,S-0551,1985,Small,Tier 1,Grocery Store
2,4012,FOOD-02-32,8.155,r,High Salt,0.183367,Frozen Foods,84.41925,S-1305,2002,,2,Supermarket Type1
3,2966,NCON-38-15,9.06,,Regular,0.098834,Others,254.32,S-0377,1987,High,Tier 3,Supermarket Type1
4,2866,FOOD-64-02,15.7,High Salt,High Salt,0.137695,Fruits,66.317,S-1421,1999,Medium,Tier 1,Supermarket Type1



**Missing Values**

index                      0
Product_Code               0
Product_Weight           675
Product_Fat_Content      778
Product_Salt_Content       0
Product_Visibility         0
Product_Type               0
Product_MRP                0
Store_Code                 0
Store_Opened               0
Store_Size              1149
Store_Location_Type        0
Store_Type                 0
dtype: int64


**dtypes**

index                     int64
Product_Code             object
Product_Weight          float64
Product_Fat_Content      object
Product_Salt_Content     object
Product_Visibility      float64
Product_Type             object
Product_MRP             float64
Store_Code               object
Store_Opened              int64
Store_Size               object
Store_Location_Type      object
Store_Type               object
dtype: object

# Pre-EDA EDA and Data cleaning

In [5]:
# From previous section 3 columns have missing data, and there are 8 categorical variables
# Store_Type, Store_Location_Type has at least 3 different types
# Store_Size -> small, medium, high and nan

# Product_Fat_Content depending on Product_Type == others won't have fat 

index = [
    "Product_Code",
    "Product_Weight",
    "Product_Fat_Content",
    "Product_Salt_Content",
    "Product_Visibility",
    "Product_Type",
    "Product_MRP",
    "Store_Code",
    "Store_Opened",
    "Store_Size",
    "Store_Location_Type",
    "Store_Type"
]
target = "Product_Store_Sales"

## Store_Type

In [6]:
df_test.Store_Type.unique()

array(['Supermarket Type2', 'Grocery Store', 'Supermarket Type1',
       'Supermarket Type3'], dtype=object)

## Store_Location_Type

# Needs to be fixed to three instead of six
df_test.Store_Location_Type.unique()

In [8]:
df_test.Store_Location_Type = df_test.Store_Location_Type.apply(lambda x: x.split()[-1] if x in ["Tier 1", "Tier 2", "Tier 3"] else x.strip())

In [9]:
df_test.Store_Location_Type.unique()

array(['3', '1', '2'], dtype=object)

## Product_Fat_Content

In [10]:
# Needs to be fixed to low, regular, high, nan
df_test.Product_Fat_Content.unique()

array(['Regular', 'High Salt', 'r', nan, 'low Salt', 'regular',
       'Low Salt', 'Lw salt', 'LS', 'Low'], dtype=object)

In [11]:
def replaceRow(row):
    row = row.lower()
    if row in ["regular", "r"]:
        row = "r"
    elif row == "high salt":
        row = "h"
    else:
        row = "l"
    return row

df_test.Product_Fat_Content = df_test.Product_Fat_Content.apply(lambda x: replaceRow(x) if type(x) == str else x)

In [12]:
df_test.Product_Fat_Content.unique()

array(['r', 'h', nan, 'l'], dtype=object)

## Product_Type

In [32]:
# contains duplicates
# temp = df_test.Product_Type.unique()
# types = []
# for c in temp:
#    types.append(c.replace(" ", "_"))
# types
df_test.Product_Type = df_test.Product_Type.apply(lambda x: x.replace(" ", "_"))
df_test.Product_Type.unique()

array(['Fruits_and_Vegetables', 'Frozen_Foods', 'Others', 'Fruits',
       'Health_and_Hygiene', 'Baking_Goods', 'Household', 'Vegetables',
       'Snack_Foods', 'Breads', 'Canned', 'Hard_Drinks', 'Soft_Drinks',
       'Meat', 'Dairy', 'Starchy_Foods', 'Breakfast', 'Snacks', 'Health',
       'Cosmetics', 'Seafood'], dtype=object)

In [30]:
# df = male_trips.groupby('start_station_id').size()
for t in temp:
    print(df_test.groupby(t).size())

KeyError: 'Fruits and Vegetables'

## Product_Salt_Content

In [14]:
df_test.Product_Salt_Content.unique()

array(['Regular', 'High Salt', 'Low Salt'], dtype=object)

## NAN

### Product_Weight

In [None]:
# Product_Weight -> Product_Type

### Product_Fat_Content 

### Store_Size