In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

In [3]:
DATA_DIR = "../../data/bigmart/"
df_raw = pd.read_csv(DATA_DIR + "train.csv", low_memory=False)
test_df = pd.read_csv(DATA_DIR + "test.csv", low_memory=False)

In [4]:
df_raw.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [6]:
display_all(df_raw.tail())

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.38,Regular,0.046982,Baking Goods,108.157,OUT045,2002,,Tier 2,Supermarket Type1,549.285
8520,NCJ29,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,765.67


In [7]:
display_all(df_raw.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Item_Identifier,8523,1559.0,FDG33,10.0,,,,,,,
Item_Weight,7060,,,,12.8576,4.64346,4.555,8.77375,12.6,16.85,21.35
Item_Fat_Content,8523,5.0,Low Fat,5089.0,,,,,,,
Item_Visibility,8523,,,,0.066132,0.0515978,0.0,0.0269895,0.0539309,0.0945853,0.328391
Item_Type,8523,16.0,Fruits and Vegetables,1232.0,,,,,,,
Item_MRP,8523,,,,140.993,62.2751,31.29,93.8265,143.013,185.644,266.888
Outlet_Identifier,8523,10.0,OUT027,935.0,,,,,,,
Outlet_Establishment_Year,8523,,,,1997.83,8.37176,1985.0,1987.0,1999.0,2004.0,2009.0
Outlet_Size,6113,3.0,Medium,2793.0,,,,,,,
Outlet_Location_Type,8523,3.0,Tier 3,3350.0,,,,,,,


In [8]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

Item_Fat_Content             0.000000
Item_Identifier              0.000000
Item_MRP                     0.000000
Item_Outlet_Sales            0.000000
Item_Type                    0.000000
Item_Visibility              0.000000
Item_Weight                  0.171653
Outlet_Establishment_Year    0.000000
Outlet_Identifier            0.000000
Outlet_Location_Type         0.000000
Outlet_Size                  0.282764
Outlet_Type                  0.000000
dtype: float64

In [9]:
ifc_dict = { "low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular" }
df_raw['Item_Fat_Content'] = df_raw['Item_Fat_Content'].apply(lambda x: ifc_dict.get(x, x))
df_raw['Item_Fat_Content'].unique()

ifc_dict = { "low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular" }
df_raw['Outlet_Size'] = df_raw['Outlet_Size'].apply(lambda x: x if not pd.isna(x) else "Unknown" )
df_raw['Outlet_Size'].unique()

min_year = df_raw['Outlet_Establishment_Year'].min()
df_raw['Outlet_Establishment_Year'] = df_raw['Outlet_Establishment_Year'].apply(lambda x: x - min_year)

# df_raw['Item_Weight'] = df_raw['Item_Weight'].apply(lambda x: 0 if pd.isna(x) else x )
df_raw['Item_Visibility'] = df_raw['Item_Visibility'].apply(lambda x: 0 if pd.isna(x) else x )
df_raw['Item_MRP'] = df_raw['Item_MRP'].apply(lambda x: 0 if pd.isna(x) else x )
# df_raw['Outlet_Establishment_Year'] = df_raw['Outlet_Establishment_Year'].apply(lambda x: 0 if pd.isna(x) else x )

In [10]:
train_cats(df_raw)

In [11]:
df_raw.Item_Fat_Content.cat.categories

Index(['Low Fat', 'Regular'], dtype='object')

In [12]:
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bigmart-raw')

In [13]:
df_raw = pd.read_feather('tmp/bigmart-raw')

  return feather.read_dataframe(path, nthreads=nthreads)


In [14]:
df, y, nas = proc_df(df_raw, 'Item_Outlet_Sales')

In [44]:
df.shape

(8523, 12)

In [45]:
set_rf_samples(3000)

In [46]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=5, n_jobs=-1, oob_score=True)
m.fit(df, y)
m.score(df,y)

0.6972087394824066

In [47]:
reset_rf_samples()

In [48]:
test_df = pd.read_csv(DATA_DIR + "test.csv", low_memory=False)

In [49]:
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [50]:
apply_cats(test_df, df_raw)

In [51]:
test_df.dtypes

Item_Identifier              category
Item_Weight                   float64
Item_Fat_Content             category
Item_Visibility               float64
Item_Type                    category
Item_MRP                      float64
Outlet_Identifier            category
Outlet_Establishment_Year       int64
Outlet_Size                  category
Outlet_Location_Type         category
Outlet_Type                  category
dtype: object

In [52]:
df_raw.dtypes

Item_Identifier              category
Item_Weight                   float64
Item_Fat_Content             category
Item_Visibility               float64
Item_Type                    category
Item_MRP                      float64
Outlet_Identifier            category
Outlet_Establishment_Year       int64
Outlet_Size                  category
Outlet_Location_Type         category
Outlet_Type                  category
Item_Outlet_Sales             float64
dtype: object

In [53]:
t_df, _, _ = proc_df(test_df)

In [54]:
y

array([3735.138 ,  443.4228, 2097.27  , ..., 1193.1136, 1845.5976,  765.67  ])

In [55]:
# ??proc_df

In [56]:
y_pred= m.predict(t_df)

In [57]:
t_df["Item_Outlet_Sales"] = y_pred
t_df.shape

(5681, 13)

In [58]:
test_df["Item_Outlet_Sales"] = y_pred
t_df.shape

(5681, 13)

In [59]:
submission = test_df[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
submission.to_csv('submission2.csv', index=False) # writing data to a CSV file