In [81]:
import numpy as np
import pandas as pd
import category_encoders as ce
from IPython.display import display, clear_output

import matplotlib.pyplot as plt
#import seaborn as sb

import time
import lightgbm as lgb
import sklearn as skl
from sklearn import svm
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import metrics
from sklearn import model_selection
from sklearn import set_config
from sklearn import preprocessing

# Show estimators and pipelines as diagrams when they are displayed.
set_config(display='diagram')

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Report the versions of the main libraries used by this notebook.
for label, module in (
    ("Pandas  ", pd),
    ("Sklearn ", skl),
    ("lightgbm", lgb),
    ("Cat Enc.", ce),
):
    print(label, module.__version__)

Pandas   1.2.4
Sklearn  0.24.1
lightgbm 3.2.1
Cat Enc. 2.2.2


In [64]:
# Load the training sales records into a DataFrame.
df = pd.read_csv('../../../predict-future-sales/sales_train.csv')

# Preview the first and last five rows. display() renders each frame as its
# own table, whereas a bare `df.head(), df.tail()` tuple is shown as a
# plain-text tuple repr.
display(df.head(), df.tail())

(         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
 0  02.01.2013               0       59    22154      999.00           1.0
 1  03.01.2013               0       25     2552      899.00           1.0
 2  05.01.2013               0       25     2552      899.00          -1.0
 3  06.01.2013               0       25     2554     1709.05           1.0
 4  15.01.2013               0       25     2555     1099.00           1.0,
                date  date_block_num  shop_id  item_id  item_price  \
 2935844  10.10.2015              33       25     7409       299.0   
 2935845  09.10.2015              33       25     7460       299.0   
 2935846  14.10.2015              33       25     7459       349.0   
 2935847  22.10.2015              33       25     7440       299.0   
 2935848  03.10.2015              33       25     7460       299.0   
 
          item_cnt_day  
 2935844           1.0  
 2935845           1.0  
 2935846           1.0  
 2935847           1.0  
 2

In [65]:
# Summarise the data: column dtypes and memory usage, then descriptive statistics.

# info() prints its report and returns None, so it is called as a statement;
# putting it in a tuple with describe() would display a stray `None`.
df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


(None,
        date_block_num       shop_id       item_id    item_price  item_cnt_day
 count    2.935849e+06  2.935849e+06  2.935849e+06  2.935849e+06  2.935849e+06
 mean     1.456991e+01  3.300173e+01  1.019723e+04  8.908532e+02  1.242641e+00
 std      9.422988e+00  1.622697e+01  6.324297e+03  1.729800e+03  2.618834e+00
 min      0.000000e+00  0.000000e+00  0.000000e+00 -1.000000e+00 -2.200000e+01
 25%      7.000000e+00  2.200000e+01  4.476000e+03  2.490000e+02  1.000000e+00
 50%      1.400000e+01  3.100000e+01  9.343000e+03  3.990000e+02  1.000000e+00
 75%      2.300000e+01  4.700000e+01  1.568400e+04  9.990000e+02  1.000000e+00
 max      3.300000e+01  5.900000e+01  2.216900e+04  3.079800e+05  2.169000e+03)

In [66]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [95]:
# Aggregate the records per (date_block_num, item_id) pair by summing.
# NOTE(review): this also sums shop_id and item_price, whose totals are
# probably not meaningful - confirm which columns are actually needed.
sales_by_block_item = df.groupby(['date_block_num', 'item_id'])
df_group = sales_by_block_item.sum()

In [67]:
# Features: every column except 'date' and the target column.
x_train = df.drop(columns=['date', 'item_cnt_day'])

# Target: the 'item_cnt_day' column.
y_train = df['item_cnt_day']

# Load the test set.
x_test = pd.read_csv('../../../predict-future-sales/test.csv')

In [68]:
# Replace 'date_block_num' with a 'month' feature computed as date_block_num % 12.
# NOTE(review): presumably date_block_num is a consecutive month counter, so
# the remainder is the month of the year - confirm against the data description.
#
# The guard keeps this cell re-runnable: without it, a second execution raises
# KeyError because 'date_block_num' has already been dropped from x_train.
if 'date_block_num' in x_train.columns:
    x_train = (
        x_train
        .assign(month=x_train['date_block_num'] % 12)
        .drop(columns='date_block_num')
    )

x_train.head()

Unnamed: 0,shop_id,item_id,item_price,month
0,59,22154,999.0,0
1,25,2552,899.0,0
2,25,2552,899.0,0
3,25,2554,1709.05,0
4,25,2555,1099.0,0


In [69]:
# month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# x['month'] = [month_abbr[x.month[i]] for i in range(x.shape[0])]

Unnamed: 0,date_block_num,shop_id,item_id,item_price,month
0,0,59,22154,999.0,0
1,0,25,2552,899.0,0
2,0,25,2552,899.0,0
3,0,25,2554,1709.05,0
4,0,25,2555,1099.0,0


In [70]:
# Columns handled by the categorical preprocessing pipelines.
cat_vars = ["shop_id", "item_id", "month"]
# Columns handled by the numeric preprocessing pipeline.
num_vars = ["item_price"]



In [77]:
# Preprocessing for the tree-based models.
# A numeric branch (One-Class SVM outlier detection on num_vars) was sketched
# here but is disabled, so only the categorical columns are passed through.

# Integer-encode each categorical column; categories not seen during fit
# are encoded as -9999.
ordinal_encoder = preprocessing.OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-9999,
)
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', ordinal_encoder)])

# Apply the categorical pipeline to cat_vars and drop every other column.
tree_prepro = compose.ColumnTransformer(
    transformers=[('cat_t', cat_4_treeModels, cat_vars)],
    remainder='drop',
)

tree_prepro

In [78]:
# Preprocessing for the non-tree ("multiplicative") models.
# The One-Class SVM outlier-detection step sketched for the numeric branch is
# disabled, so numeric columns only go through the quantile transform.

# Numeric branch: quantile-transform the values to a normal output distribution.
quantile_transformer = preprocessing.QuantileTransformer(
    output_distribution='normal',
    random_state=73,
)
num_4_multModels = pipeline.Pipeline(steps=[('quant', quantile_transformer)])

# Categorical branch: one-hot encode, ignoring categories not seen during fit.
onehot_encoder = preprocessing.OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
)
cat_4_multModels = pipeline.Pipeline(steps=[('onehot', onehot_encoder)])

# Route num_vars and cat_vars to their branches and drop every other column.
mult_prepro = compose.ColumnTransformer(
    transformers=[
        ('num_m', num_4_multModels, num_vars),
        ('cat_m', cat_4_multModels, cat_vars),
    ],
    remainder='drop',
)

mult_prepro

In [79]:
# Candidate regressors, each wrapped in a pipeline with its matching preprocessor.
from sklearn.base           import clone

# Tree models
from sklearn.tree           import DecisionTreeRegressor
from sklearn.ensemble       import RandomForestRegressor
from sklearn.ensemble       import ExtraTreesRegressor
from sklearn.ensemble       import AdaBoostRegressor
from sklearn.ensemble       import GradientBoostingRegressor
from sklearn.experimental   import enable_hist_gradient_boosting # Necessary for HistGradientBoostingRegressor
from sklearn.ensemble       import HistGradientBoostingRegressor
from xgboost                import XGBRegressor
from lightgbm               import LGBMRegressor
from catboost               import CatBoostRegressor

# Multiplicative models
from sklearn.svm            import SVR
from sklearn.linear_model   import Lasso
from sklearn.linear_model   import SGDRegressor
from sklearn.linear_model   import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors      import KNeighborsRegressor

# Bare estimators, keyed by the label shown in the results table.
# Every stochastic estimator gets an explicit random_state so that repeated
# runs of the notebook produce the same scores.
tree_models = {
  "Decision Tree": DecisionTreeRegressor(random_state=0),
  "Extra Trees": ExtraTreesRegressor(n_estimators=200, random_state=0),
  "Random Forest": RandomForestRegressor(n_estimators=200, random_state=0),
  "AdaBoost": AdaBoostRegressor(n_estimators=200, random_state=0),
  "Skl GBM": GradientBoostingRegressor(n_estimators=200, random_state=0),
  "Skl HistGBM": HistGradientBoostingRegressor(random_state=0),
  "XGBoost": XGBRegressor(random_state=0),
  "LightGBM": LGBMRegressor(random_state=0),
  "CatBoost":CatBoostRegressor(random_state=0),
  }

mult_models = {
  "SVR": SVR(),
  "Lasso": Lasso(),
  "SDG regr": SGDRegressor(random_state=73),
  "Neural regr": MLPRegressor(random_state=73),
  "KNN regr": KNeighborsRegressor(n_neighbors=10),
  "Ridge": Ridge(alpha=0.6, random_state=73)
}

# make_pipeline() does not copy its steps, so each pipeline receives its own
# unfitted clone of the preprocessor instead of sharing (and refitting) one
# ColumnTransformer instance across every model.
tree_regressors = {name: pipeline.make_pipeline(clone(tree_prepro), model) for name, model in tree_models.items()}

mult_regressors = {name: pipeline.make_pipeline(clone(mult_prepro), model) for name, model in mult_models.items()}

# All candidate pipelines: tree-based models first, then the multiplicative ones.
all_pipelines = {**tree_regressors, **mult_regressors}

In [82]:
# Fit every candidate pipeline on a training split and score it on a hold-out split.
# NOTE(review): this is a random row-wise split; if the goal is to forecast
# future periods, a time-ordered split may be more appropriate - confirm.
x_train_1, x_valid, y_train_1, y_valid = model_selection.train_test_split(x_train, y_train, test_size=0.4, random_state=0)

result_columns = ['Model', 'MSE', 'RMSE', 'Time']
results = pd.DataFrame({'Model': [], 'MSE': [], 'RMSE': [], 'Time': []})

# Scores are collected as plain dicts and the frame is rebuilt from the list;
# DataFrame.append copies the whole frame on every call and no longer exists
# in pandas 2.x.
records = []

for name, pipe in all_pipelines.items():

    start_time = time.time()

    pipe.fit(x_train_1, y_train_1)
    preds = pipe.predict(x_valid)

    # Vectorised MSE; the builtin sum() would iterate over the validation
    # residuals one element at a time in Python.
    mse = metrics.mean_squared_error(y_valid, preds)
    rmse = np.sqrt(mse)

    # Elapsed wall-clock time covers fitting, predicting and scoring.
    total_time = time.time() - start_time

    records.append({"Model": name,
                    "MSE":   mse,
                    "RMSE":  rmse,
                    "Time":  total_time})
    results = pd.DataFrame(records, columns=result_columns)

    # Refresh the leaderboard after each model: best RMSE first, ranks from 1.
    results_ord = results.sort_values(by=['RMSE'], ascending=True, ignore_index=True)
    results_ord.index += 1
    clear_output()
    display(results_ord.style.bar(subset=['MSE', 'RMSE'], vmin=0, color='#5fba7d'))

Unnamed: 0,Model,MSE,RMSE,Time
1,Decision Tree,9.45877,3.075511,6.535294


KeyboardInterrupt: 