# A South American Grocery Store needs prediction of how many items they would sell on certain dates. (NW RMSLE)

In [1]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os
import math
import re
import pandas as pd
import numpy as np
from sklearn import metrics
from pandas.api.types import is_string_dtype, is_numeric_dtype

  from numpy.core.umath_tests import inner1d


In [2]:
PATH = "data/groceries/"

In [3]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False,
                    parse_dates = ['date'], nrows = 10000000)

### *note, since my computer can't handle over 100 million units of food, I must break it to 10 million.

In [4]:
df_raw.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
9999995,9999995,2013-08-21,1,999547,12.0,
9999996,9999996,2013-08-21,1,1001305,3.0,
9999997,9999997,2013-08-21,1,1004550,15.0,
9999998,9999998,2013-08-21,1,1004551,25.0,
9999999,9999999,2013-08-21,1,1009512,1.0,


In [5]:
df_raw.head(3)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,


## Here we have 6 columns but for hundreds of millions of items(too big for most csv readers).

## The main purpose of this project is to learn how to separate the data into multiple trees and then test for at least 50% accuracy

In [6]:
df_raw['date'].head(3)

0   2013-01-01
1   2013-01-01
2   2013-01-01
Name: date, dtype: datetime64[ns]

# Feature Engineering
## Starting with df_raw['date']. 

In [7]:
def add_datepart(df, fldname):
    fld= df[fldname] 
    #not df.fldname, that would just grab a field that states 'fldname'. also safer and should do more than this.
    targ_pre = re.sub('[Dd]ate$', '', fldname)#regex removes 'date' from fldname
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek','Dayofyear',#goes through each string
              'Is_month_end','Is_month_start', 'Is_quarter_end', 'Is_year_end', 'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
        #tries to find an attribute with the same name inside each object(like Year attr)
    df[targ_pre+'Elapsed'] = (fld - fld.min()).dt.days
    df.drop(fldname, axis=1, inplace=True)

In [8]:
fld = df_raw.date

In [9]:
add_datepart(df_raw, 'date')

In [10]:
def isHoliday(day, month):#later add year to this.
    fixedHolDates = ([1,1], [7,9],[25,12],[28,7],[20,7],[12,11],[8,12])#in [day, month]
    if [day, month] in fixedHolDates:
        return True;
    return False;

## Preparing for Prediction

In [13]:
df_raw['unit_sales'] = np.log(df_raw['unit_sales'])

  """Entry point for launching an IPython kernel.


In [14]:
def train_cats(df):
    for n, c in df.items():#row, object
        if is_string_dtype(c):#checks if string.
            df[n] = c.astype('category').cat.as_ordered()#sets the categorical value for ordering.

In [15]:
train_cats(df_raw)

In [16]:
os.makedirs('tmp', exist_ok =True)
df_raw.to_feather('tmp/raw')

In [17]:
def proc_df(df, y_fld, skip_flds =None, do_scale = False,
           preproc_fn = None, max_n_cat = None, subset = None):
    if not skip_flds:#copies df, grabs y values, drops the original, and then does fix_missing
        skip_flds = []
    if subset:
        df= get_sample(df, subset)
    df = df.copy()
    if preproc_fn:
        preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace =True)
    
    for n, c in df.items():
        fix_missing(df, c, n)
    if do_scale:
        mapper = scale_vars(df)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    res = [pd.get_dummies(df, dummy_na = True), y]#discuss later
    if not do_scale: return res
    return res + [mapper]

In [18]:
def fix_missing(df, col, name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum():
            df[name+'_na'] = pd.isnull(col)
        df[name] = col.fillna(col.median())# replaced with median, and replaces with new column telling what's missing

In [19]:
def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1# if it isn't numeric, we replace with its codes.
    

In [20]:
df, y =proc_df(df_raw, 'unit_sales')

In [21]:
y

array([1.94591015, 0.        , 0.69314718, ..., 2.7080502 , 3.21887582,
       0.        ])

In [26]:
def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())
def print_score(m):
    res = [rmse(m.predict(X_train), Y_train), rmse(m.predict(X_valid), Y_valid),
          m.score(X_train, Y_train), m.score(X_valid, Y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [35]:
df.head()

Unnamed: 0,id,store_nbr,item_nbr,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_year_end,Is_year_start,Elapsed,onpromotion_na
0,0,25,103665,2013,1,1,1,1,1,False,True,False,False,True,0,True
1,1,25,105574,2013,1,1,1,1,1,False,True,False,False,True,0,True
2,2,25,105575,2013,1,1,1,1,1,False,True,False,False,True,0,True
3,3,25,108079,2013,1,1,1,1,1,False,True,False,False,True,0,True
4,4,25,108701,2013,1,1,1,1,1,False,True,False,False,True,0,True


# Implementing R.Forest Regressor

In [31]:
def split_vals(a, n):
    return a[:n].copy(), a[n: ].copy()

n_valid = 100000 #same as Kaggle's test size
n_trn = len(df)- n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
Y_train, Y_valid = split_vals(y, n_trn)

X_train.shape, Y_train.shape, X_valid.shape

((9900000, 16), (9900000,), (100000, 16))

In [36]:
m = RandomForestRegressor(n_estimators = 20, max_depth = 3, bootstrap = False, n_jobs = -1)
%time m.fit(X_train, Y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [33]:
print_score(m)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

# Posting the metrics