## Supervised learning ->Regression/Support vector Machine/Decission Tree
### Gabapathi Ekambaram 

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
%matplotlib inline

In [13]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }
df_sales = pd.read_csv('./646/sales-forecast/sell_prices.csv')
prices =pd.read_csv('./646/sales-forecast/sell_prices.csv', dtype = PRICE_DTYPES)
df_calendar = pd.read_csv('./646/sales-forecast/calendar.csv')
cal = pd.read_csv("./646/sales-forecast/calendar.csv", dtype = CAL_DTYPES)
df_sales_training = pd.read_csv('./646/sales-forecast/sales_train_validation.csv')

In [14]:
df_calendar.head()
df_sales_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


In [15]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

pd.options.display.max_columns = 50
h = 28 
max_lags = 70
tr_last = 1913
fday = datetime(2016,4, 25) 


In [16]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    #prices =pd.read_csv('./646/sales-forecast/sell_prices.csv', dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    #cal = pd.read_csv("./646/sales-forecast/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt =pd.read_csv("C:/Users/vivek/.conda/envs/646salesproject/data/sales-forecast/sales_train_validation.csv", nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt 

In [17]:
def create_fea(dt):
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }
    
#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [18]:
FIRST_DAY = 1200

In [19]:
%%time
df = create_dt(is_train=True, nrows =5000,first_day= FIRST_DAY)
df.shape
create_fea(df)
df.shape

Wall time: 18.2 s


(3486224, 31)

In [20]:
from sklearn.model_selection import train_test_split
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]

useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
df[:] = np.nan_to_num(df)
X_train = df[train_cols]
y_train = df['sales']
y_train.shape


(3486224,)

In [21]:
X_traindr, X_testdr, y_traindr, y_testdr = train_test_split(X_train, y_train, test_size=0.33, random_state=42)


In [23]:
from sklearn.impute import SimpleImputer
def replace_missing_value(df, number_features):
    imputer = SimpleImputer(strategy="median")
    df_num = df[number_features]
    imputer.fit(df_num)
    X = imputer.transform(df_num)
    res_def = pd.DataFrame(X, columns=df_num.columns)
    return res_def

In [24]:
from sklearn.model_selection import StratifiedKFold
X_trainnan =  replace_missing_value(X_traindr,cat_feats)
Y_trainnan =  y_traindr.fillna(0)
#predictors = X_trainnan.columns.values.tolist()[2:]

train_data = lgb.Dataset(X_traindr, label = y_traindr, categorical_feature=cat_feats, free_raw_data=False)
fake_valid_inds = np.random.choice(len(X_traindr), 1000000)
fake_valid_data = lgb.Dataset(X_traindr.iloc[fake_valid_inds], label = y_traindr.iloc[fake_valid_inds],categorical_feature=cat_feats,
                             free_raw_data=False)   # This is just a subsample of the training set, not a real validation set !

In [195]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    cols, names = list(), list()
    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += [('%s(t-%d)' % (col, i)) for col in data.columns]
    # Current timestep (t=0)
    cols.append(data)
    names += [('%s(t)' % (col)) for col in data.columns]
    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += [('%s(t+%d)' % (col, lag)) for col in data.columns]
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [27]:
print(X_traindr)
print(y_traindr)

         item_id  dept_id  store_id  cat_id  state_id  wday  month  year  \
197640      1892        5         0       2         0     7      6  2014   
2084445      831        2         0       1         0     4      7  2015   
101832      2202        5         0       2         0     1      5  2014   
344564      1359        3         1       1         0     1      7  2014   
2285134     2009        3         1       1         0     3      8  2015   
...          ...      ...       ...     ...       ...   ...    ...   ...   
1692743     1261        3         1       1         0     1      4  2015   
2356330     1705        4         1       2         0     2      9  2015   
2229084     2019        3         0       1         0     2      8  2015   
2768307      772        2         1       1         0     1     11  2015   
2219110      157        0         0       0         0     3      8  2015   

         event_name_1  event_type_1  event_name_2  event_type_2  snap_CA  \
197640     

In [196]:
window = 29
lag = 1
series = series_to_supervised(X_trainnan,window=window, lag=lag)
series.head()

Unnamed: 0,item_id(t-29),dept_id(t-29),store_id(t-29),cat_id(t-29),state_id(t-29),event_name_1(t-29),event_name_2(t-29),event_type_1(t-29),event_type_2(t-29),item_id(t-28),dept_id(t-28),store_id(t-28),cat_id(t-28),state_id(t-28),event_name_1(t-28),event_name_2(t-28),event_type_1(t-28),event_type_2(t-28),item_id(t-27),dept_id(t-27),store_id(t-27),cat_id(t-27),state_id(t-27),event_name_1(t-27),event_name_2(t-27),...,store_id(t-1),cat_id(t-1),state_id(t-1),event_name_1(t-1),event_name_2(t-1),event_type_1(t-1),event_type_2(t-1),item_id(t),dept_id(t),store_id(t),cat_id(t),state_id(t),event_name_1(t),event_name_2(t),event_type_1(t),event_type_2(t),item_id(t+20),dept_id(t+20),store_id(t+20),cat_id(t+20),state_id(t+20),event_name_1(t+20),event_name_2(t+20),event_type_1(t+20),event_type_2(t+20)
29,1892.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,831.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2202.0,5.0,0.0,2.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,523.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2467.0,6.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
30,831.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2202.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1359.0,3.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,338.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,945.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
31,2202.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1359.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2009.0,3.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1416.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1510.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
32,1359.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2009.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1478.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2182.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,2009.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1478.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1627.0,4.0,0.0,2.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1645.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,528.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
labels_col = 'sales(t+%d)' % lag
labels = series[labels_col]
series = series.drop(labels_col, axis=1)
X_train, X_valid, Y_train, Y_valid = train_test_split(series, labels.values, test_size=0.4, random_state=0)

KeyError: 'sales(t+20)'

In [None]:
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(X_train_series.shape[1], X_train_series.shape[2])))
model_lstm.add(Dense(1))
model_lstm.compile(loss='mse', optimizer=adam)
model_lstm.summary()

In [None]:
lstm_history = model_lstm.fit(X_train_series, Y_train, validation_data=(X_valid_series, Y_valid), epochs=epochs, verbose=2)

In [166]:
  param = {
        'num_leaves': (5, 20),
        'max_bin': 63,
        'min_data_in_leaf': (5, 20),
        'learning_rate': (0.01, 0.3),
        'min_sum_hessian_in_leaf':(0.00001, 0.01),
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': (0.05, 0.5),
        'lambda_l1': (0, 5.0),
        'lambda_l2': (0, 5.0),
        'min_gain_to_split': (0, 1.0),
        'max_depth': (3,15),
        'save_binary': True, 
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False, 
    }    

In [167]:
%%time

m_lgb = lgb.train(params,train_data,valid_sets = [fake_valid_data], verbose_eval=100)



[100]	valid_0's rmse: 2.33269
[200]	valid_0's rmse: 2.27768
[300]	valid_0's rmse: 2.24893
[400]	valid_0's rmse: 2.22733
[500]	valid_0's rmse: 2.20884
[600]	valid_0's rmse: 2.19269
[700]	valid_0's rmse: 2.17757
[800]	valid_0's rmse: 2.16672
[900]	valid_0's rmse: 2.15547
[1000]	valid_0's rmse: 2.1448
[1100]	valid_0's rmse: 2.13568
[1200]	valid_0's rmse: 2.12658
[1300]	valid_0's rmse: 2.11825
[1400]	valid_0's rmse: 2.11232
[1500]	valid_0's rmse: 2.10698
[1600]	valid_0's rmse: 2.10286
[1700]	valid_0's rmse: 2.09826
[1800]	valid_0's rmse: 2.09304
[1900]	valid_0's rmse: 2.08855
[2000]	valid_0's rmse: 2.08456
[2100]	valid_0's rmse: 2.08087
[2200]	valid_0's rmse: 2.07773
[2300]	valid_0's rmse: 2.07417
[2400]	valid_0's rmse: 2.06986
[2500]	valid_0's rmse: 2.06702
[2600]	valid_0's rmse: 2.06347
[2700]	valid_0's rmse: 2.06034
[2800]	valid_0's rmse: 2.05749
[2900]	valid_0's rmse: 2.05355
[3000]	valid_0's rmse: 2.05064
Wall time: 14min 18s


In [52]:
m_lgb.save_model("model200.lgb")

<lightgbm.basic.Booster at 0x1b2cccf4b20>

In [172]:
from sklearn import metrics
predictions = m_lgb.predict(X_testdr)   


NameError: name 'prediction' is not defined

In [174]:
score = metrics.roc_auc_score(X_trainnan,predictions)
print(score)

ValueError: multiclass-multioutput format is not supported

In [53]:
#Prediction
#y_predict=m_lgb.predict(X_testdr)
#test_data = lgb.Dataset(X_testdr, label = y_testdr, categorical_feature=cat_feats, free_raw_data=False)
from sklearn.metrics import accuracy_score, recall_score, precision_score 
#print('Accuracy score is: {}%'.format(np.round(accuracy_score(y_predict, y_testdr), 3) * 100))
#print('Recall score is: {}%'.format(np.round(recall_score(y_predict, y_testdr), 3) * 100))
#print('Precision score is: {}%'.format(np.round(precision_score(y_predict, y_testdr), 3) * 100))

In [54]:
from sklearn.impute import SimpleImputer
def replace_missing_value(df, number_features):

    imputer = SimpleImputer(strategy="median")
    df_num = df[number_features]
    imputer.fit(df_num)
    X = imputer.transform(df_num)
    res_def = pd.DataFrame(X, columns=df_num.columns)
    return res_def

<h4>DecisionTreeRegressor: Checking Accuracy </h4>

In [55]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.metrics import fbeta_score, make_scorer

from sklearn.metrics import r2_score, mean_squared_error, make_scorer

X_trainnan =  replace_missing_value(X_traindr,cat_feats)

dsr = DecisionTreeRegressor(random_state = 0, min_samples_split = 15,  max_depth = 10)
scores = cross_val_score(dsr, X_trainnan, y_traindr, cv = 15)
display(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
dsr.fit(X_trainnan, y_traindr)
pre_y_by_dsr = dsr.predict(X_trainnan)

array([0.2302149 , 0.25439868, 0.24296708, 0.23458013, 0.2479363 ,
       0.25096393, 0.22459944, 0.24480849, 0.22788208, 0.23638752,
       0.22265713, 0.23971071, 0.21959619, 0.24558649, 0.23327382])

Accuracy: 0.24 (+/- 0.02)


<h4>RandomForestRegressor : Checking Accuracy </h4>

In [25]:
rfr = RandomForestRegressor(n_estimators = 10)

scores = cross_val_score(rfr, X_trainnan, y_traindr, cv = 10)
display(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
rfr.fit(X_trainnan, y_traindr)
pre_y_by_rfr = rfr.predict(X_trainnan)

array([0.49620788, 0.50239638, 0.49854324, 0.50993515, 0.47877865,
       0.49307778, 0.47588261, 0.49088518, 0.50176263, 0.49662274])

Accuracy: 0.49 (+/- 0.02)


<h4>XGBoost: Checking Accuracy </h4>

In [95]:
import matplotlib.pyplot as plt
from pandas import read_csv
import math
# reshape into X=t and Y=t+1
import numpy as np
from pandas import read_csv
from pandas import datetime
from pandas import DataFrame
from pandas import concat
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


def timeseries_to_supervised(data, lag=1):
   df = DataFrame(data)
   columns = [df.shift(i) for i in range(1, lag+1)]
   columns.append(df)
   df = concat(columns, axis=1)
   df.fillna(0, inplace=True)
   return df


  from pandas import datetime


<h2>Model stacking </h2>