# A South American grocery store needs predictions of how many units of each item it will sell on given dates (metric: normalized weighted RMSLE, NWRMSLE).

In [45]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os
import math
import re
import pandas as pd
import numpy as np
from sklearn import metrics
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [46]:
# Directory prefix for the input data; train.csv is read from here.
PATH = "data/groceries/"

In [47]:
# Load the first `nrows` rows of train.csv, parsing the 'date' column as datetimes.
# NOTE(review): nrows is 10,000,000 here, but the outputs printed further down
# (last index 999999, 900000 + 100000 split) correspond to 1,000,000 rows —
# confirm which value was actually used.
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False,
                    parse_dates = ['date'], nrows = 10000000)

### *Note: since my computer can't handle the full training set of over 100 million rows, I load only the first 10 million.

In [48]:
# Show the last five loaded rows.
df_raw.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
999995,999995,2013-01-26,43,716250,6.0,
999996,999996,2013-01-26,43,723184,2.0,
999997,999997,2013-01-26,43,724498,3.0,
999998,999998,2013-01-26,43,730258,2.0,
999999,999999,2013-01-26,43,730259,3.0,


In [49]:
# Show the first three loaded rows.
df_raw.head(3)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,


## Here we have only 6 columns, but hundreds of thousands of item records (too big for most CSV readers).

## The main purpose of this project is to learn how to separate the data into multiple trees and then test for at least 50% accuracy

In [50]:
# Check that 'date' was parsed as datetime64 rather than left as strings.
df_raw['date'].head(3)

0   2013-01-01
1   2013-01-01
2   2013-01-01
Name: date, dtype: datetime64[ns]

# Feature Engineering
## Starting with df_raw['date']. 

In [51]:
def add_datepart(df, fldname):
    """Expand the datetime column `fldname` of `df` into date-part columns, in place.

    For each of Year, Month, Week, Day, Dayofweek, Dayofyear, Is_month_end,
    Is_month_start, Is_quarter_end, Is_year_end and Is_year_start a new column
    is added, plus an 'Elapsed' column holding the number of days since the
    earliest date in the column. New column names are prefixed with `fldname`
    minus a trailing 'date'/'Date'. The original column is then dropped.

    Args:
        df: DataFrame to modify in place.
        fldname: name of a datetime-typed column of `df`.

    Returns:
        None; `df` is mutated.
    """
    # Use df[fldname], not df.fldname: attribute access would look for a
    # column literally called 'fldname'.
    fld = df[fldname]
    targ_pre = re.sub(r'[Dd]ate$', '', fldname)  # strip a trailing 'date' from the prefix
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_year_end', 'Is_year_start'):
        attr = n.lower()
        if attr == 'week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
            # isocalendar().week is its replacement. Cast to the dtype of the
            # other integer date parts so all new columns share one dtype.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            # Look up the accessor attribute with the same (lower-cased) name.
            values = getattr(fld.dt, attr)
        df[targ_pre + n] = values
    df[targ_pre + 'Elapsed'] = (fld - fld.min()).dt.days
    df.drop(fldname, axis=1, inplace=True)

In [52]:
# Keep a reference to the date column before add_datepart drops it from df_raw.
# NOTE(review): `fld` is not used again in the visible cells.
fld = df_raw.date

In [53]:
# Replace df_raw['date'] with its derived date-part columns (modifies df_raw in place).
add_datepart(df_raw, 'date')

In [54]:
def isHoliday(day, month):
    """Return True if (day, month) is one of the hard-coded fixed-date holidays.

    The year is not considered yet (planned for later).
    """
    # Each entry is [day, month].
    holiday_dates = ([1, 1], [7, 9], [25, 12], [28, 7], [20, 7], [12, 11], [8, 12])
    candidate = [day, month]
    return any(candidate == holiday for holiday in holiday_dates)

## Preparing for Prediction

In [98]:
def train_cats(df):
    """Convert every string-typed column of `df` to an ordered categorical, in place.

    Columns that `is_string_dtype` does not recognise as strings are left untouched.
    """
    for name in df.columns:
        column = df[name]
        if not is_string_dtype(column):
            continue
        # Mark the categories as ordered so their codes have a defined ordering.
        df[name] = column.astype('category').cat.as_ordered()

In [99]:
# Turn any string columns of df_raw into ordered categoricals (in place).
train_cats(df_raw)

In [100]:
# Create the tmp/ directory if it does not exist yet, then save df_raw to
# tmp/raw in Feather format.
os.makedirs('tmp', exist_ok =True)
df_raw.to_feather('tmp/raw')

In [101]:
def proc_df(df, y_fld, skip_flds =None, do_scale = False,
           preproc_fn = None, max_n_cat = None, subset = None):
    """Split `df` into a numeric feature frame and a target array.

    Works on a copy of `df`, so the caller's frame is not modified.

    Args:
        df: source DataFrame.
        y_fld: name of the target column; its values are returned separately
            and the column is removed from the features.
        skip_flds: optional list of additional column names to drop.
        do_scale: when true, `scale_vars(df)` is called and its result is
            appended to the return value.
        preproc_fn: optional callable invoked as `preproc_fn(df)` on the copy
            before the target is extracted.
        max_n_cat: forwarded to `numericalize`; non-numeric columns with at
            most this many distinct values are left for `pd.get_dummies`.
        subset: when truthy, `get_sample(df, subset)` is applied first.

    Returns:
        `[features, y]`, or `[features, y, mapper]` when `do_scale` is true.

    NOTE(review): `get_sample` and `scale_vars` are not defined in the visible
    part of this notebook; passing `subset` or `do_scale=True` presumably
    raises NameError unless they are defined elsewhere — confirm.
    """
    if not skip_flds:# normalise a missing/empty skip list to []
        skip_flds = []
    if subset:
        df= get_sample(df, subset)
    df = df.copy()
    if preproc_fn:
        preproc_fn(df)
    # Grab the target values, then drop the target and any skipped columns.
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace =True)
    
    # Fill missing numeric values (may add '<name>_na' indicator columns).
    for n, c in df.items():
        fix_missing(df, c, n)
    if do_scale:
        mapper = scale_vars(df)
    # Replace non-numeric columns with integer codes (subject to max_n_cat).
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    # One-hot encode whatever non-numeric columns remain, including a NaN indicator.
    res = [pd.get_dummies(df, dummy_na = True), y]
    if not do_scale: return res
    return res + [mapper]

In [102]:
def fix_missing(df, col, name):
    """Fill missing values of a numeric column with its median, in place.

    When `col` is numeric and contains missing values, a boolean column
    `name + '_na'` recording which rows were missing is added to `df`, and
    `df[name]` is replaced by `col` with the gaps filled by the column median.
    Non-numeric columns and numeric columns without missing values are left
    unchanged.

    Args:
        df: DataFrame to modify in place.
        col: the column's current values (normally `df[name]`).
        name: the column's name in `df`.

    Returns:
        None; `df` is mutated.
    """
    if not is_numeric_dtype(col):
        return
    missing = pd.isnull(col)
    if not missing.any():
        # Nothing to fill; skip the median computation and reassignment.
        return
    df[name + '_na'] = missing
    fill_value = col.median()
    if pd.isnull(fill_value):
        # An all-missing column has no median; fall back to 0 so that no NaN
        # survives. The '_na' column still records that every row was missing.
        fill_value = 0
    df[name] = col.fillna(fill_value)

In [103]:
def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric categorical column of `df` with its integer codes plus one.

    Numeric columns are left alone. When `max_n_cat` is given, columns whose
    number of distinct values does not exceed it are also left alone.
    `col` must expose the `.cat` accessor when it is converted.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and not (col.nunique() > max_n_cat):
        return
    # Missing values have code -1, so after the shift they become 0.
    df[name] = col.cat.codes + 1
    

In [104]:
# Split df_raw into the feature frame `df` and the target array `y` (unit_sales).
df, y =proc_df(df_raw, 'unit_sales')

In [105]:
# Display the target values.
y

array([7., 1., 2., ..., 3., 2., 3.])

In [106]:
def rmse(x, y):
    """Return the root-mean-squared error between `x` and `y` as a Python float.

    Both arguments must support element-wise subtraction, and the squared
    difference must provide a `.mean()` method (e.g. NumPy arrays).
    """
    residuals = x - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)
def print_score(m):
    """Print a list of scores for the fitted model `m`.

    The list is, in order: RMSE on the training set, RMSE on the validation
    set, `m.score` on the training set, `m.score` on the validation set, and
    `m.oob_score_` when the model has that attribute. Reads the module-level
    X_train, Y_train, X_valid and Y_valid.
    """
    splits = ((X_train, Y_train), (X_valid, Y_valid))
    scores = [rmse(m.predict(features), target) for features, target in splits]
    scores.extend(m.score(features, target) for features, target in splits)
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

# Implementing a Random Forest Regressor

In [108]:
def split_vals(a, n):
    """Split `a` at position `n` and return independent copies of both parts.

    The first element holds `a[:n]`, the second `a[n:]`.
    """
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()

# Hold out the last n_valid rows as the validation set; everything before
# them is used for training.
n_valid = 100000 #same as Kaggle's test size
n_trn = len(df)- n_valid
# Apply the same positional split to the raw frame, the features and the target.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
Y_train, Y_valid = split_vals(y, n_trn)

# Display the resulting shapes as a sanity check.
X_train.shape, Y_train.shape, X_valid.shape

((900000, 16), (900000,), (100000, 16))

In [109]:
# Small, shallow forest: 20 trees of depth at most 3, trained on all CPU cores.
# NOTE(review): bootstrap=False means every tree is fit on the full training
# set; together with the default max_features the trees presumably differ very
# little from one another — confirm this is intended. No random_state is set,
# so results may vary between runs.
m = RandomForestRegressor(n_estimators = 20, max_depth = 3, bootstrap = False, n_jobs = -1)
# %time is an IPython magic that reports how long the fit takes.
%time m.fit(X_train, Y_train)

Wall time: 5.17 s


RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [110]:
# Prints [train RMSE, valid RMSE, train score, valid score] for the fitted model.
print_score(m)

[19.929884211916004, 14.485042046163748, 0.029105503771927776, 0.007274564362931946]


# Posting the metrics