In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

## Loading Data 

In [2]:
bigmart_train_df = pd.read_csv('train_v9rqX0R.csv')
bigmart_train_df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [3]:
bigmart_train_df['Item_Identifier'].value_counts()

FDW13    10
FDG33    10
FDW26     9
NCF42     9
FDD38     9
         ..
FDO33     1
FDQ60     1
FDK57     1
FDN52     1
FDE52     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [4]:
for i in bigmart_train_df[bigmart_train_df['Item_Identifier']=='FDE52'].Item_Weight:
    print(i)

nan


In [5]:
bigmart_train_df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [6]:
# feature_df = bigmart_train_df[['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
#        'Item_Type', 'Item_MRP', 'Outlet_Identifier',
#        'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
#        'Outlet_Type']]
# X = np.asarray(feature_df)
# y=np.asarray(bigmart_train_df['Item_Outlet_Sales'])

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(bigmart_train_df, test_size=0.2,random_state=200)

In [8]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1180
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1939
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [9]:
item_id = train.Item_Identifier.unique()

In [10]:
dict_weight={}
for i in item_id:
    temp = (train[train['Item_Identifier']==i].Item_Weight.unique())
    for j in temp:
        if np.isnan(j)==False:
            if i not in dict_weight:
                dict_weight[i]=j

In [11]:
# feature_df = train[['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
#        'Item_Type', 'Item_MRP', 'Outlet_Identifier',
#        'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
#        'Outlet_Type']]
X = np.asarray(train)
# y=np.asarray(bigmart_train_df['Item_Outlet_Sales'])
X    

array([['FDA20', 6.78, 'Low Fat', ..., 'Tier 2', 'Supermarket Type1',
        5406.295999999999],
       ['FDV28', 16.1, 'Regular', ..., 'Tier 3', 'Supermarket Type2',
        203.7348],
       ['FDJ60', 19.35, 'Regular', ..., 'Tier 2', 'Supermarket Type1',
        2807.0128],
       ...,
       ['NCK07', 10.65, 'Low Fat', ..., 'Tier 3', 'Supermarket Type2',
        1973.4312],
       ['FDO45', 13.15, 'Regular', ..., 'Tier 2', 'Supermarket Type1',
        1757.7120000000002],
       ['FDY28', 7.47, 'Regular', ..., 'Tier 3', 'Supermarket Type1',
        1496.0526]], dtype=object)

In [12]:
for i in X:
    if np.isnan(i[1]):
        if i[0] in dict_weight:
            i[1] = dict_weight[i[0]]
        else:
            print(i[0])

FDE52
FDN52
NCT53
DRI59
NCT53
FDK57
FDK32
NCU29


### Seems that the missing values are not random. The Outlet Identifier seems to be highly correlated.

In [13]:
train.loc[train['Item_Identifier'].isin(['FDE52','FDN52','NCT53','DRI59','NCT53','FDK57','NCU29'])]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
4187,FDE52,,Regular,0.029742,Dairy,88.9514,OUT027,1985,Medium,Tier 3,Supermarket Type3,3453.5046
927,FDN52,,Regular,0.130933,Frozen Foods,86.9198,OUT027,1985,Medium,Tier 3,Supermarket Type3,1569.9564
7387,NCT53,,Low Fat,0.084245,Health and Hygiene,164.6526,OUT019,1985,Small,Tier 1,Grocery Store,657.8104
5954,DRI59,,Low Fat,0.040637,Hard Drinks,224.6088,OUT027,1985,Medium,Tier 3,Supermarket Type3,4474.176
4734,NCT53,,low fat,0.0,Health and Hygiene,164.5526,OUT027,1985,Medium,Tier 3,Supermarket Type3,2302.3364
1922,FDK57,,Low Fat,0.079904,Snack Foods,120.044,OUT027,1985,Medium,Tier 3,Supermarket Type3,4434.228
7333,NCU29,,Low Fat,0.044608,Health and Hygiene,145.976,OUT019,1985,Small,Tier 1,Grocery Store,585.904


In [14]:
train = pd.DataFrame(X,columns=['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'])

In [15]:
# deleting rows with Item Weight == nan 
train = train.dropna(axis=0, subset=['Item_Weight'])

#### Outlet_Type -- Tier 2 and Tier 3 have missing Outlet_Size values
### Outlet_Location_Type = Tier 2 and Outlet_Type = Supermarket Type1 --> Outlet_Size = Small
### Outlet_Location_Type = Tier 3 and Outlet_Type = Supermarket Type1 --> Outlet_Size = High
### Outlet_Location_Type = Tier 3 and Outlet_Type = Grocery Store --> Outlet_Size = Small
'Supermarket Type1', 'Grocery Store'
Outlet_Location_Type = Tier 3 and Outlet_Type = Supermarket Type3 --> Outlet_Size = Medium


In [16]:
# train.loc[(train['Outlet_Location_Type']=='Tier 1')&(train['Outlet_Type']=='Grocery Store')].Outlet_Size
# train.loc[(train['Outlet_Size'].isnull()==True)].Outlet_Type.unique()
# train.loc[(train['Outlet_Type']=='Grocery Store')].Outlet_Size.unique()

In [17]:
print(type(X[0][8]))
for i in X:
    if i[8] not in ['Small','Medium','High']:
        print("yes")
        if i[9] == 'Tier 2' and i[10] == 'Supermarket Type1':
            i[8] = 'Small'
        elif i[9] == 'Tier 3' and i[10] == 'Supermarket Type1':
            i[8] = 'High'
        elif i[9] == 'Tier 3' and i[10] == 'Grocery Store':
            i[8] = 'Small'

<class 'str'>
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
ye

In [18]:
X

array([['FDA20', 6.78, 'Low Fat', ..., 'Tier 2', 'Supermarket Type1',
        5406.295999999999],
       ['FDV28', 16.1, 'Regular', ..., 'Tier 3', 'Supermarket Type2',
        203.7348],
       ['FDJ60', 19.35, 'Regular', ..., 'Tier 2', 'Supermarket Type1',
        2807.0128],
       ...,
       ['NCK07', 10.65, 'Low Fat', ..., 'Tier 3', 'Supermarket Type2',
        1973.4312],
       ['FDO45', 13.15, 'Regular', ..., 'Tier 2', 'Supermarket Type1',
        1757.7120000000002],
       ['FDY28', 7.47, 'Regular', ..., 'Tier 3', 'Supermarket Type1',
        1496.0526]], dtype=object)

In [19]:
train = pd.DataFrame(X,columns=['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'])

In [20]:
train = train.dropna(axis=0, subset=['Item_Weight'])

In [21]:
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [22]:
train.Item_Fat_Content.unique()

array(['Low Fat', 'Regular', 'low fat', 'reg', 'LF'], dtype=object)