# 1 Initialization

In [29]:
#from fastai.imports import *
#from fastai.structured import *
import os
import math
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import forest
from IPython.display import display
import re
from sklearn import metrics
import numpy as np

In [4]:
PATH = "../2/data/groceries/"
!ls {PATH}

holidays_events.csv
items.csv
oil.csv
sample_submission.csv
test.csv
train.csv
transactions.csv


# 2 Reading Data
## to ensure a header is assigned the correct datatypes, define the dictionary you want it to be mapped to.

In [5]:
types = {'id': 'int64',
        'item_nbr': 'int32',
        'store_nbr': 'int8',
        'unit_sales': 'float32',
        'onpromotion': 'object'}

In [6]:
%%time
df_all = pd.read_csv(f'{PATH}train.csv', parse_dates = ['date'], dtype = types,
                    infer_datetime_format = True)

Wall time: 4min 54s


### in practice, you don't need to read the whole thing in. use the linux command 'shuf'. This will parse a sample and the sample is what you use to work with instead of waiting for the entire data to come in.

## Now we will fill in missing values with false, since we know that there is na in the 'onpromotion' section

In [7]:
df_all['onpromotion'].fillna(False, inplace = True)# changes all NaN entries to False, and keeps anything that is an entry.
df_all['onpromotion'] = df_all['onpromotion'].map({'False' : False, 'True': True})#maps the entry 'False' to boolean False
df_all['onpromotion'] = df_all['onpromotion'].astype(bool)#changes the dtype to boolean for the entire column

In [8]:
%time df_all.to_feather('tmp/raw_groceries')

Wall time: 9.46 s


In [9]:
%time df_all.describe(include = 'all')# the param 'include' allows describing EVERYTHING. Useful for stats items

Wall time: 30.5 s


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,125497000.0,125497040,125497000.0,125497000.0,125497000.0,125497040
unique,,1684,,,,2
top,,2017-07-01 00:00:00,,,,False
freq,,118194,,,,96028767
first,,2013-01-01 00:00:00,,,,
last,,2017-08-15 00:00:00,,,,
mean,62748520.0,,27.46458,972769.2,5.319669,
std,36227880.0,,16.33051,520533.6,23.06714,
min,0.0,,1.0,96995.0,-15372.0,
25%,31374260.0,,12.0,522383.0,2.0,


In [10]:
df_test = pd.read_csv(f'{PATH}test.csv', parse_dates = ['date'], dtype = types,
                     infer_datetime_format = True)
df_test['onpromotion'].fillna(False, inplace = True)
df_test['onpromotion'] = df_test['onpromotion'].map({'False': False, 'True': True})
df_test['onpromotion'] = df_test['onpromotion'].astype(bool)
df_test.describe(include = 'all')

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
count,3370464.0,3370464,3370464.0,3370464.0,3370464
unique,,16,,,2
top,,2017-08-27 00:00:00,,,False
freq,,210654,,,3171867
first,,2017-08-16 00:00:00,,,
last,,2017-08-31 00:00:00,,,
mean,127182300.0,,27.5,1244798.0,
std,972969.3,,15.58579,589836.2,
min,125497000.0,,1.0,96995.0,
25%,126339700.0,,14.0,805321.0,


## Now that the data has been analyzed, we understand we have 4 years of training data to test on 2 weeks of data.
### Now what to infer from this?
### We should grab the most recent since data from 2013 is not as relevant. That doesn't mean it's not relevant, but it will be weighted much less. For the base case, we should focus only on the most recent,then narrow down to the most important peices

In [11]:
df_all.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
125497035,125497035,2017-08-15,54,2089339,4.0,False
125497036,125497036,2017-08-15,54,2106464,1.0,True
125497037,125497037,2017-08-15,54,2110456,192.0,False
125497038,125497038,2017-08-15,54,2113914,198.0,True
125497039,125497039,2017-08-15,54,2116416,2.0,False


In [12]:
df_all = pd.read_feather('tmp/raw_groceries')

TypeError: read_feather() got an unexpected keyword argument 'nthreads'

In [13]:
df_all['unit_sales'] = np.log1p(np.clip(df_all['unit_sales'], 0, None))#the evaluation is RMSL+1E, since log(0) is not real.

### The competition says to count negatives in unit sales as 0, which is what np.cllip allows us to do, we set the minimum as 0, and put None as the maximum since there is no ceiling.

In [14]:
def add_datepart(df, fldname):
    fld= df[fldname] 
    #not df.fldname, that would just grab a field that states 'fldname'. also safer and should do more than this.
    targ_pre = re.sub('[Dd]ate$', '', fldname)#regex removes 'date' from fldname
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek','Dayofyear',#goes through each string
              'Is_month_end','Is_month_start', 'Is_quarter_end', 'Is_year_end', 'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
        #tries to find an attribute with the same name inside each object(like Year attr)
    df[targ_pre+'Elapsed'] = (fld - fld.min()).dt.days
    df.drop(fldname, axis=1, inplace=True)
        

In [15]:
%time add_datepart(df_all, 'date')

Wall time: 1min 37s


### no need for train_cats because everything is numeric already

In [21]:
def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and (max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1# if it isn't numeric, we replace with its codes.
    

In [22]:
def fix_missing(df, col, name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum():
            df[name+'_na'] = pd.isnull(col)
        df[name] = col.fillna(col.median())# replaced with median, and replaces with new column telling what's missing

In [16]:
def proc_df(df, y_fld, skip_flds =None, do_scale = False,
           preproc_fn = None, max_n_cat = None, subset = None):
    if not skip_flds:#copies df, grabs y values, drops the original, and then does fix_missing
        skip_flds = []
    if subset:
        df= get_sample(df, subset)
    df = df.copy()
    if preproc_fn:
        preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace =True)
    
    for n, c in df.items():
        fix_missing(df, c, n)
    if do_scale:
        mapper = scale_vars(df)
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
    res = [pd.get_dummies(df, dummy_na = True), y]#discuss later
    if not do_scale: return res
    return res + [mapper]

In [17]:
def split_vals(a, n):
    return a[:n].copy(), a[n: ].copy()

In [18]:
n_valid = len(df_test)
n_trn = len(df_all)- n_valid
train, valid = split_vals(df_all, n_trn)
train.shape, valid.shape

((122126576, 17), (3370464, 17))

In [19]:
#If I needed to run train_cats:
#train_cats(raw_train)
#apply_cats(raw_valid, raw_train)

In [23]:
%%time
trn, y = proc_df(train, 'unit_sales')
val, y_val = proc_df(valid, 'unit_sales')

Wall time: 6min 42s


## 3 Models

In [24]:
def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())
def print_score(m):
    res = [rmse(m.predict(X_train), Y_train), rmse(m.predict(X_valid), Y_valid),
          m.score(X_train, Y_train), m.score(X_valid, Y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [26]:
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))


In [30]:
set_rf_samples(1_000_000)

### Instead, the trees are created from 1 million random pieces of data

In [31]:
%time x = np.array(trn, dtype=np.float32)

MemoryError: 

In [32]:
m = RandomForestRegressor(n_estimators = 20, min_samples_leaf=100, n_jobs = -1)
%time m.fit(x,y)

NameError: name 'x' is not defined