<a href="https://colab.research.google.com/github/YanivRefaelovich/ML-Projects/blob/master/Predict%20Sales/Predict_Future_Sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('drive/My Drive/Predict Future Sales')

print(os.listdir())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['Data', 'Predict Future Sales.ipynb', 'Predictions', 'Submissions', 'Predict Future Sales_Stacking.ipynb']


In [0]:
!pip install googletrans;






---


**Plan**
1. Read all csv files
2. describe all information
3. translate categories and shops names
4. extract cities names
5. combine similar categories
6. remove outliers 
7. agregate sales over days
8. transform date as year and month
8. integrate data- shop_id, item_id, city, category, price, item_cnt- per month
9. remove data that is not in test
10. split data into sub periods of 6 months
11. calculate additional features
12. train
13. test


---






In [0]:
import os
import pandas as pd
from googletrans import Translator
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import timeit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout 
from keras.callbacks import Callback, EarlyStopping
from keras.regularizers import l2
from keras import optimizers
from sklearn.model_selection import KFold, train_test_split
import gc
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
import xgboost as xgb
from xgboost import plot_importance

Using TensorFlow backend.


In [0]:
# read all necessary files

items = pd.read_csv('Data/items.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')
shops = pd.read_csv('Data/shops.csv')
sales_train = pd.read_csv('Data/sales_train_v2.csv')
item_categories = pd.read_csv('Data/item_categories.csv')
test_csv = pd.read_csv('Data/test.csv')

In [0]:
print('items:', items.shape)
print('shops:', shops.shape)
print('sales_train:', sales_train.shape)
print('item_categories:', item_categories.shape)
print('test:', test_csv.shape)

items: (22170, 3)
shops: (60, 2)
sales_train: (2935849, 6)
item_categories: (84, 2)
test: (214200, 3)


In [0]:
def basic_eda(df):
    print("-----------HEAD----------------")
    print(df.head(5))
    print("----------INFO-----------------")
    print(df.info())
    print("----------Describe-------------")
    print(df.describe())
    print("----------Columns--------------")
    print(df.columns)
    print("----------Data Types-----------")
    print(df.dtypes)
    print("-------Missing Values----------")
    print(df.isnull().sum())
    print("-------NULL values-------------")
    print(df.isna().sum())
    print("-----Shape Of Data-------------")
    print(df.shape)

In [0]:
print('============sales_train===============')
basic_eda(sales_train)
print('============items===============')
basic_eda(items)
print('============shops===============')
basic_eda(shops)
print('============item_categories===============')
basic_eda(item_categories)
print("==============test=========================")
basic_eda(test_csv)



-----------HEAD----------------
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
----------INFO-----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB
None
----------Describe-------------
       date_block_num       shop_id       item_id    item_price  item_cnt_day
count    2.935849e+06  2.935849e+06  2.935849e+06  2

# Preprocessing

In [0]:
# compare test ad train items and shops

train_item_id = sales_train['item_id'].unique()
test_item_id = test_csv['item_id'].unique()

train_shop_id = sales_train['shop_id'].unique()
test_shop_id = test_csv['shop_id'].unique()

print(test_item_id.shape)
print(test_csv.shape)
print(np.sum(np.isin(test_item_id, train_item_id)==False))
print(np.sum(np.isin(test_shop_id, train_shop_id)==True))

(5100,)
(214200, 3)
363
42


In [0]:
# remove outliers
outliers = np.where((sales_train['item_price'] > np.percentile(sales_train['item_price'], 95)) | 
                    (sales_train['item_cnt_day'] > np.percentile(sales_train['item_cnt_day'], 95))|
                    (sales_train['item_cnt_day']<0))[0]


sales_train.drop(outliers, axis=0, inplace=True)
sales_train.reset_index(inplace=True)

print(outliers.shape,sales_train.shape)



(248793,) (2687056, 7)


In [0]:
# translating items_category

translator = Translator()
item_categories_trans = item_categories.copy()
for i in range(item_categories.shape[0]):
    a = translator.translate(item_categories.iloc[i,0], src='ru', dest='en')
    item_categories_trans.iloc[i, 0] = a.text


In [0]:
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Consoles ', case=False, regex=False)] = 'Consoles'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Books ', case=False, regex=False)] = 'Books'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Cinema ', case=False, regex=False)] = 'Cinema'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Music ', case=False, regex=False)] = 'Music'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Gifts ', case=False, regex=False)] = 'Gifts'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Programs ', case=False, regex=False)] = 'Programs'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Clean media ', case=False, regex=False)] = 'media'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Payment ', case=False, regex=False)] = 'Payment'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Accessories ', case=False, regex=False)] = 'Accessories'
item_categories_trans['item_category_name'].loc[item_categories_trans['item_category_name'].str.contains('Games ', case=False, regex=False)] = 'Game'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [0]:
print(item_categories_trans.head())


           item_category_name  item_category_id
0  PC - Headsets / Headphones                 0
1                 Accessories                 1
2                 Accessories                 2
3                 Accessories                 3
4                 Accessories                 4


In [0]:
# translating shops names and extructing cities

shops_trans = shops.copy()
shops_trans['city'] = 0
for i in range(shops.shape[0]):
    a=translator.translate(shops.iloc[i, 0], src='ru', dest='en')
    shops_trans.loc[i, 'shop_name'] = a.text
    shops_trans.loc[i, 'city'] = a.text.split(' ')[0]
#     print(a.text, '|  city: ', a.text.split(' ')[0])

In [0]:
# number of month
print('Number of month:', sales_train["date_block_num"].unique().size)
print('Number of unique items:', sales_train["item_id"].unique().size)
print('Number of unique shops:', sales_train["shop_id"].unique().size)
print('Number of unique cities:', shops_trans["city"].unique().size)

Number of month: 34
Number of unique items: 21066
Number of unique shops: 60
Number of unique cities: 32


In [0]:
monthly_data = sales_train.pivot_table(index=['shop_id', 'item_id'], columns='date_block_num',
                            values=['item_cnt_day','item_price'], fill_value=0,
                              aggfunc={'item_cnt_day': 'sum' , 'item_price':'mean'}).reset_index()

In [0]:
# adding cities and categories
monthly_data['city'] = monthly_data['shop_id']
monthly_data['category'] = monthly_data['item_id']

monthly_data['city'] = monthly_data['city'].replace(to_replace=shops_trans['shop_id'].values,
                                                   value=shops_trans['city'].values)

monthly_data['category'] = monthly_data['category'].replace(to_replace=items['item_id'].values,
                                                   value=items['item_category_id'].values)

monthly_data['category'] = monthly_data['category'].replace(to_replace=item_categories_trans['item_category_id'].values,
                                                   value=item_categories_trans['item_category_name'].values)


In [0]:
# fixing price and clipping sales
monthly_data ['item_price'] = monthly_data['item_price'].replace(to_replace=0, value=np.nan)
monthly_data['item_cnt_day'] = monthly_data['item_cnt_day'].clip(lower=0, upper=20)


## feature calculation

In [0]:
# Calculating features for training

cnt_data = monthly_data['item_cnt_day']
price_data = monthly_data['item_price']
max_price = monthly_data['item_price'].max(axis=1)
avg_price = price_data.mean(axis=1, skipna=True)
std_price = price_data.std(axis=1, skipna=True)

le_cat = LabelEncoder()
le_city = LabelEncoder()

cat = le_cat.fit_transform(monthly_data['category'])
city = le_city.fit_transform(monthly_data['city'])

cat = pd.DataFrame(data=cat,  columns=['cat'])
city = pd.DataFrame(data=city,  columns=['city'])

# cat = pd.get_dummies(monthly_data['category'])
# city = pd.get_dummies(monthly_data['city'])

num_month_back = 6
Features = pd.DataFrame()
Target = pd.DataFrame();

X = pd.DataFrame()

for i in range(cnt_data.shape[1] - num_month_back):
    
    # data over 6 months
    
    current_price_mean = price_data.iloc[:, i:i+num_month_back].mean(axis=1, skipna=True)
    current_price_std = price_data.iloc[:, i:i+num_month_back].std(axis=1, skipna=True)
    current_price_last = price_data.iloc[:, i+num_month_back-1]
    
    # dealing with null prices
    current_price_mean.loc[current_price_mean.isnull()] = avg_price.loc[current_price_mean.isnull()]
    current_price_std.loc[current_price_std.isnull()] = std_price.loc[current_price_std.isnull()]
    current_price_last.loc[current_price_last.isnull()] = avg_price.loc[current_price_last.isnull()]

    current_sales = cnt_data.iloc[:, i:i+num_month_back]
    current_sales_mean = cnt_data.iloc[:, i:i+num_month_back].mean(axis=1)
    current_sales_diff = cnt_data.iloc[:, i:i+num_month_back].max(axis=1) - cnt_data.iloc[:, i:i+num_month_back].min(axis=1)
    current_sales_std = cnt_data.iloc[:, i:i+num_month_back].std(axis=1)

    current_sales_1st_half = cnt_data.iloc[:, i:i+int(num_month_back/2)].mean(axis=1)
    current_sales_2nd_half = cnt_data.iloc[:, i+int(num_month_back/2):i+num_month_back].mean(axis=1)

    year = np.floor((i+num_month_back)/12)
    month = (i+num_month_back)%12 + 1

    features = pd.concat([current_sales,
                          current_sales_mean,
                          current_sales_diff,
                          current_sales_std,
                          current_sales_1st_half,
                          current_sales_2nd_half,
                          current_price_mean,
#                           current_price_std,
                          current_price_last,
                          cat,
                          city,
                         ], axis=1, ignore_index=True)
    
    features['month'] = month
    features['year'] = year
    
    X = pd.concat([X, features],axis=0, ignore_index=True)
    
    # sales in the following month
    Target = pd.concat([Target, cnt_data.iloc[:, i+num_month_back]], axis=0, ignore_index=True)
    

In [0]:
columns_names = [str(i) for i in range(num_month_back)]
columns_names.extend(['sales_mean','sales_diff','sales_std','sales_beginning','sales_end','price_mean','last_price','category','city','month','year'])
X.columns = columns_names
X.head()

Unnamed: 0,0,1,2,3,4,5,sales_mean,sales_diff,sales_std,sales_beginning,sales_end,price_mean,last_price,category,city,month,year
0,0,8,0,0,0,0,1.333333,8,3.265986,2.666667,0.0,265.0,265.0,4,0,7,0.0
1,0,5,0,0,0,0,0.833333,5,2.041241,1.666667,0.0,434.0,434.0,4,0,7,0.0
2,6,10,0,0,0,0,2.666667,10,4.320494,5.333333,0.0,221.0,221.0,4,0,7,0.0
3,3,3,0,0,0,0,1.0,3,1.549193,2.0,0.0,347.0,347.0,4,0,7,0.0
4,1,14,0,0,0,0,2.5,14,5.648008,5.0,0.0,247.0,247.0,4,0,7,0.0


In [0]:
# splitting to train and validation taking the last month as validation

val_inds = np.where((X['year']==2) & (X['month']==10))[0]
train_inds = np.setdiff1d(np.arange(0, X.shape[0]), val_inds)
train_inds = np.arange(0, X.shape[0])

In [0]:
X_train = X.iloc[train_inds, :]
y_train = Target.iloc[train_inds, 0]
# X_train = X.copy()
# y_train = Target.copy()

X_val = X.iloc[val_inds, :]
y_val = Target.iloc[val_inds, 0]


# X_train_base, X_train_meta, y_train_base, y_train_meta = train_test_split(X, Target, test_size=0.33, random_state=42)

# print('X_train_base: ',X_train_base.shape, 'y_train_base: ', y_train_base.shape, '\nX_train_meta: ', X_train_meta.shape, 'y_train_meta: ',y_train_meta.shape, '\nX_val: ',X_val.shape, 'y_val: ', y_val.shape)

In [0]:
gc.collect()

82

In [0]:
del sales_train,item_categories, item_categories_trans, shops, shops_trans


In [0]:
gc.collect();

In [0]:
# preparing test data for predicting

i = cnt_data.shape[1] - num_month_back
    
# data over last 12 months for testing

current_price_mean = price_data.iloc[:, i:i+num_month_back].mean(axis=1, skipna=True)
current_price_std = price_data.iloc[:, i:i+num_month_back].std(axis=1, skipna=True)
current_price_last = price_data.iloc[:, i+num_month_back-1]

# dealing with null prices
current_price_mean.loc[current_price_mean.isnull()] = avg_price.loc[current_price_mean.isnull()]
current_price_std.loc[current_price_std.isnull()] = std_price.loc[current_price_std.isnull()]
current_price_last.loc[current_price_last.isnull()] = avg_price.loc[current_price_last.isnull()]

current_sales = cnt_data.iloc[:, i:i+num_month_back]
current_sales_mean = cnt_data.iloc[:, i:i+num_month_back].mean(axis=1)
current_sales_diff = cnt_data.iloc[:, i:i+num_month_back].max(axis=1) - cnt_data.iloc[:, i:i+num_month_back].min(axis=1)
current_sales_1st_half = cnt_data.iloc[:, i:i+int(num_month_back/2)].mean(axis=1)
current_sales_2nd_half = cnt_data.iloc[:, i+int(num_month_back/2):i+num_month_back].mean(axis=1)
current_sales_std = cnt_data.iloc[:, i:i+num_month_back].std(axis=1)

year = np.floor((i+num_month_back)/12)
month = (i+num_month_back)%12 + 1

X_test = pd.concat([current_sales,
                      current_sales_mean,
                      current_sales_diff,
                      current_sales_std,
                      current_sales_1st_half,
                      current_sales_2nd_half,
                      current_price_mean,
#                           current_price_std,
                      current_price_last,
                      cat,
                      city,
                     ], axis=1, ignore_index=True)

X_test['month'] = month
X_test['year'] = year

X_test.columns=columns_names



In [0]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,sales_mean,sales_diff,sales_std,sales_beginning,sales_end,price_mean,last_price,category,city,month,year
0,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,265.0,265.0,4,0,11,2.0
1,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,434.0,434.0,4,0,11,2.0
2,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,221.0,221.0,4,0,11,2.0
3,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,347.0,347.0,4,0,11,2.0
4,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,247.0,247.0,4,0,11,2.0


In [0]:
# decribe y_train and y_val

print(np.percentile(y_train, [5, 25, 50, 75, 95, 99, 100]))
print(np.percentile(y_val, [5, 25, 50, 75, 95, 99, 100]))


[ 0.  0.  0.  0.  1.  4. 20.]
[ 0.  0.  0.  0.  1.  2. 20.]


# TRAINING ANN

In [0]:
gc.collect()
X_train_NN = X_train.drop(['category','city'], axis=1)
X_val_NN = X_val.drop(['category','city'], axis=1)

X_test_NN = X_test.drop(['category','city'], axis=1)

scaler_NN = MinMaxScaler()
scaler_NN.fit(X_train_NN)

X_train_NN_norm = scaler_NN.transform(X_train_NN)
X_val_NN_norm = scaler_NN.transform(X_val_NN)
X_test_NN_norm = scaler_NN.transform(X_test_NN)

print(X_train_NN.shape)

(11417532, 15)


In [0]:
gc.collect();
model = Sequential()

model.add(Dense(10, input_dim =X_train_NN.shape[1], activation='relu', activity_regularizer=l2(0.01)))
model.add(Dense(8, activation='relu', activity_regularizer=l2(0.00)))
model.add(Dense(1, activation='relu', activity_regularizer=l2(0.00)))

es = EarlyStopping(monitor='val_loss', min_delta=0.002, patience=2, verbose=1, mode='min', restore_best_weights=True)

# opt = optimizers.SGD(lr=0.01, momentum=0.5)
model.compile(loss='mean_squared_error', optimizer='sgd')
initial_weights = model.get_weights()

y_pred_ann = np.zeros((X_test_NN_norm.shape[0]))

score = []

K=5
KF = KFold(n_splits=K, random_state=0)

for k_train_inds, k_test_inds in KF.split(X_train_NN_norm, y_train):

    # training ANN
    model.set_weights(initial_weights)
    model.fit(X_train_NN_norm[k_train_inds, :] , y_train.iloc[k_train_inds],
              epochs=3,
              batch_size=256,
              verbose=1, 
              validation_data=(X_train_NN_norm[k_test_inds, :] , y_train.iloc[k_test_inds]),
              callbacks=[es])

    # predicting both
    y_pred = np.array([i[0] for i in model.predict(X_test_NN_norm)])

    # calculate validation score
    val_score_ann = np.sqrt(model.evaluate(X_train_NN_norm[k_test_inds, :] , y_train.iloc[k_test_inds]))

    score.append([val_score_ann])

    # predicting test 
    y_pred_ann += y_pred/K
    break
    

print(score)
pd.DataFrame(data=y_pred_ann,columns=['y_pred_ann']).to_csv('Submissions/prediction_ann.csv', index=False)

In [0]:
gc.collect();

# TRAININ RANDOM F0RE5T

In [0]:
forest = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=50, verbose=2, n_jobs=-1,
                              min_samples_split= 50, max_features='log2', bootstrap=True)


y_pred_forest=np.zeros((X_test.shape[0]))

score = []
K=5
KF = KFold(n_splits=K, random_state=0)

for k_train_inds, k_test_inds in KF.split(X_train, y_train):
   
    # trainin Random Forest
    forest.fit(X_train.iloc[k_train_inds,:], y_train.iloc[k_train_inds])

    y_pred = forest.predict(X_test)

    # calculate validation score
    val_score_forest = np.sqrt(np.mean((forest.predict(X_train.iloc[k_test_inds,:]) - y_train.iloc[k_test_inds].values)**2))
    train_score_forest = np.sqrt(np.mean((forest.predict(X_train.iloc[k_train_inds,:]) - y_train.iloc[k_train_inds].values)**2))

    score.append([val_score_forest])
    print(train_score_forest, train_score_forest)

    # predicting test 
    y_pred_forest += y_pred/K
    
    break
pd.DataFrame(data=y_pred_forest).to_csv('prediction_forest.csv', index=False)    

# TRAINING XGBoost

In [0]:
gc.collect();

params = {'max_depth':5,
          'learning_rate':0.05,
          'verbosity':0,
          'objective':'reg:squarederror',
          'n_estimators':100,
          'n_jobs':-1,
          'reg_alpha': 1.5,
          'reg_lambda': 1.5,
          'gamma': 0.8}

K=5
KF = KFold(n_splits=K, shuffle = True, random_state=1)

y_pred_xg = np.zeros((X_test.shape[0]))
for k_train_inds, k_test_inds in KF.split(X_train, y_train):

    xg = xgb.XGBRegressor(**params)
    xg.fit(X_train.iloc[k_train_inds, :], y_train.iloc[k_train_inds], 
       eval_set=[(X_train.iloc[k_train_inds, :], y_train.iloc[k_train_inds]), (X_train.iloc[k_test_inds, :], y_train.iloc[k_test_inds])],
       eval_metric='rmse')

    y_pred = xg.predict(X_test)
    y_pred_xg+=y_pred/K
        
pd.DataFrame(data=y_pred_xg).to_csv('Submissions/prediction_xgb.csv',index=False)    

  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:0.899226	validation_1-rmse:0.899023
[1]	validation_0-rmse:0.878585	validation_1-rmse:0.880896
[2]	validation_0-rmse:0.864431	validation_1-rmse:0.862397
[3]	validation_0-rmse:0.83802	validation_1-rmse:0.848182
[4]	validation_0-rmse:0.825758	validation_1-rmse:0.831697
[5]	validation_0-rmse:0.80724	validation_1-rmse:0.820339
[6]	validation_0-rmse:0.795804	validation_1-rmse:0.8065
[7]	validation_0-rmse:0.788305	validation_1-rmse:0.795244
[8]	validation_0-rmse:0.780848	validation_1-rmse:0.785392
[9]	validation_0-rmse:0.772245	validation_1-rmse:0.774802
[10]	validation_0-rmse:0.765204	validation_1-rmse:0.765637
[11]	validation_0-rmse:0.753467	validation_1-rmse:0.758743
[12]	validation_0-rmse:0.744488	validation_1-rmse:0.751147
[13]	validation_0-rmse:0.738484	validation_1-rmse:0.743938
[14]	validation_0-rmse:0.732325	validation_1-rmse:0.737051
[15]	validation_0-rmse:0.726883	validation_1-rmse:0.731962
[16]	validation_0-rmse:0.720816	validation_1-rmse:0.726668
[17]	valida

In [0]:
plot_importance(xg)

In [0]:
gc.collect()
y_pred_test = xg.predict(X_test)
y_pred_val = xg.predict(X_val)

print(np.percentile(y_val, [5, 25, 50, 75, 95, 99, 99.9, 100]))
print(np.percentile(y_pred_test, [5, 25, 50, 75, 95, 99, 99.9, 100]))
print(np.percentile(y_pred_val, [5, 25, 50, 75, 95, 99, 99.9, 100]))

# PREPARING RESULTS FOR SUBMISSION


In [0]:
avg_pred =  y_pred_forest # y_pred_xg/2  + y_pred_ann/2

y_pred1 = pd.DataFrame({'item_id': monthly_data['item_id'], 'shop_id': monthly_data['shop_id'],
                      'item_cnt_month': np.squeeze(avg_pred)}, index=range(monthly_data.shape[0]))

test_output = pd.merge(test_csv, y_pred1, on=['shop_id','item_id'], how='left')

test_output.drop(['item_id', 'shop_id'], axis=1, inplace=True)

test_output['item_cnt_month'].replace(np.nan, 0, inplace=True)
test_output.to_csv('Submissions/submission25_forest.csv',index=False)
test_output.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.583197
1,1,0.0
2,2,1.023322
3,3,0.130349
4,4,0.0


**Visualization of results**

In [0]:
y_pred_xg = xg.predict(X_val);
error = np.abs(y_pred_xg- y_val)

max_error_inds = np.where(error > np.percentile(error,99.5))[0]
min_error_inds = np.where(error < np.percentile(error, 5))[0]

for i in range(10):
    
    plt.figure()
    plt.plot(X_val.iloc[max_error_inds[i], 0:num_month_back ], marker='v', linestyle='dashed', linewidth=2, markersize=8)
    plt.plot(num_month_back, y_val.iloc[max_error_inds[i]],'p')
    plt.plot(num_month_back, y_pred_xg[max_error_inds[i]],'*')

    plt.legend(['Sales','True','Xg'])
    plt.xlabel('Month')

for i in range(10):
    
    plt.figure()
    plt.plot(X_val.iloc[min_error_inds[i], 0:num_month_back ], marker='v', linestyle='dashed', linewidth=2, markersize=8)
    plt.plot(num_month_back, y_val.iloc[min_error_inds[i]],'p')
    plt.plot(num_month_back, y_pred_xg[min_error_inds[i]],'*')
    plt.legend(['Sales','True','Xg',])
    plt.xlabel('Month')