# Problem Details

> Assume your avocado data collection is on a 1-month delay. When predicting the price of an avocado at time x, you can only use historical data from up until 1 month prior to x.

> **Setup**: We need to build a dataset from the given data to perform this type of modeling. The model should be trained on records whose features are based on historical data from at least one month prior to the current time. To create that dataset from the original one, group by region and type and shift all the numerical columns by 5 records (5 weeks, approximately one month). Each record then carries feature values from a month earlier, so at any point in time only information that is at least a month old is used.

>**Features**: Model (XGBoost) conventional and organic avocados separately, using the following features:
>> * Label encode region as numerical variable. 
>> * Add lags of price values from month prior
>> * Add long term and short term moving average to capture trend
>> * Add month  and day variable from date to capture seasonality

>**Evaluation**: Time series cross validation; the data is split into train and test using the date variable. RMSE is used for evaluation. Grid search was used to get the parameter values for the XGBoost model.


# Import Packages

In [None]:
# import packages for data manipulation
import pandas as pd
import numpy as np

# Plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import janitor
import pickle

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GroupKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Avocado type modelled in this run; later cells filter the data to this value.
avocado_type = "conventional"

In [None]:
# Load the raw avocado dataset from the CSV in the working directory.
df = pd.read_csv("avocado.csv")
# Cell output: (row count, column count) of the loaded frame.
df.shape

In [None]:
# Drop the index column that was written into the CSV
df = df.drop(columns='Unnamed: 0')

# Drop the TotalUS records, assuming they are just an aggregate of all other regions
df = df[df.region != 'TotalUS'].reset_index(drop=True)

# Parse the dates, order each region chronologically, then normalise the
# column names with pyjanitor's clean_names()
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['region', 'Date']).clean_names()

In [None]:
# Summary of the date column (range covered by the data)
df['date'].describe()

In [None]:
from itertools import product

# Placeholder rows for the four weeks to forecast: one row per
# (date, region, type) combination, with no measurements yet.
future_dates = ['2018-04-01','2018-04-08','2018-04-15','2018-04-22']
regions = list(set(df['region']))
types = list(set(df['type']))

future_rows = list(product(future_dates, regions, types))
future_df = pd.DataFrame(future_rows, columns=['date', 'region', 'type'])
future_df['date'] = pd.to_datetime(future_df['date'])

In [None]:
# Add the placeholder future rows to the history. DataFrame.append was removed
# in pandas 2.0; pd.concat keeps the same row order and original index labels.
df = pd.concat([df, future_df])

# Dataset Creation - One month Lag

In [None]:
# Build the one-month-lagged dataset, one (type, region) series at a time.
# Every numeric column is shifted by 5 weekly records so that a row only
# carries information that is at least ~1 month old.
lag_source_columns = ['total_volume','4046','4225','4770','total_bags',
                      'small_bags','large_bags','xlarge_bags','averageprice']

df_list = []
# The loop variables are deliberately NOT called `avocado_type` / `region`:
# reusing `avocado_type` here would overwrite the notebook-level setting with
# whichever type the loop visited last.
for type_name in types:
    for region_name in regions:
        temp = (df.loc[(df.region == region_name) & (df.type == type_name)]
                  .sort_values('date')
                  .reset_index(drop=True))
        for col in lag_source_columns:
            temp[f'one_month_lag_{col}'] = temp[col].shift(5)
        # Keep the un-lagged price (it is the prediction target); drop every
        # other un-lagged measurement so it cannot leak into the features.
        temp = temp.drop(columns=[c for c in lag_source_columns if c != 'averageprice'])
        # The first 5 rows of each series have no lagged values.
        temp = temp.loc[temp.one_month_lag_total_volume.notnull()].reset_index(drop=True)
        df_list.append(temp)

In [None]:
# Stack the per-(type, region) frames collected in df_list into one frame.
final_train = pd.concat(df_list)
# Cell output: (row count, column count) of the combined frame.
final_train.shape

In [None]:
# Persist the lagged dataset; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('data_with_one_month_lag.p', 'wb') as lagged_file:
    pickle.dump(final_train, lagged_file)

In [None]:
# Keep only the avocado type being modelled, ordered by date
is_selected_type = final_train['type'] == avocado_type
final_train = final_train[is_selected_type].sort_values('date').reset_index(drop=True)
final_train.head()

# Preprocessing and Feature Engineering

In [None]:
# Calendar features derived from the date, to capture seasonal patterns.
# The vectorised .dt accessor replaces a row-by-row .apply(lambda ...).
final_train['month'] = final_train['date'].dt.month
final_train['day'] = final_train['date'].dt.day

# Lagged price, grouped so shifts and rolling windows never cross from one
# (region, type) series into another.
lagged_price = final_train.groupby(['region','type'])['one_month_lag_averageprice']

# Additional lags (1-3 records) on top of the one-month-lagged price
for lag in range(1,4):
    final_train[f'one_month_lag_lag_{lag}'] = lagged_price.shift(lag)

# Long-term (52 records) and short-term (12 records) moving averages of the
# lagged price; min_periods=1 yields a value from the first record of a series.
final_train['long_term_moving_average'] = lagged_price.transform(lambda x: x.rolling(window=52,min_periods=1).mean())
final_train['short_term_moving_average'] = lagged_price.transform(lambda x: x.rolling(window=12,min_periods=1).mean())
# Flag for the short-term average being above the long-term one
final_train['is_SMA_greater'] = (final_train['short_term_moving_average'] > final_train['long_term_moving_average'])
#for lag in range(1,11):
    #final_train[f'volume_lag_{lag}'] = final_train.groupby(['region','type'])['one_month_lag_total_volume'].shift(lag)

In [None]:
# Chronological split: everything before 2018 trains the model, January-March
# 2018 is held out as the test set, and rows from April 2018 on are forecast.
row_dates = final_train['date']
in_train_period = row_dates < '2018-01-01'
in_test_period = (row_dates >= '2018-01-01') & (row_dates < '2018-04-01')
in_future_period = row_dates >= '2018-04-01'

test = final_train[in_test_period].reset_index(drop=True)
train = final_train[in_train_period].reset_index(drop=True)
future = final_train[in_future_period].reset_index(drop=True)
#train = final_train.copy()

In [None]:
def preprocessing(df, train=True):
    """Scale the lagged volume/bag columns and label-encode ``region``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the eight ``one_month_lag_*`` volume/bag columns
        listed in ``num_columns`` and a ``region`` column.
    train : bool, default True
        If True, fit a ``StandardScaler`` and a ``LabelEncoder`` on ``df`` and
        pickle them to ``one_month_lag_scaler.p`` and
        ``region_label_encoding.p`` in the working directory.
        If False, load those two pickles and only apply them.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which the scaled numeric
        columns are moved to the end and ``region`` holds integer codes.

    Raises
    ------
    FileNotFoundError
        With ``train=False`` when the pickled transformers do not exist yet.
    ValueError
        With ``train=False`` when ``region`` holds a label the encoder was not
        fitted on (raised by ``LabelEncoder.transform``).
    """
    num_columns = ['one_month_lag_total_volume','one_month_lag_4046','one_month_lag_4225',
                   'one_month_lag_4770','one_month_lag_total_bags','one_month_lag_small_bags',
                   'one_month_lag_large_bags', 'one_month_lag_xlarge_bags']
    scaler_path = 'one_month_lag_scaler.p'
    encoder_path = 'region_label_encoding.p'

    if train:
        sc = StandardScaler().fit(df[num_columns])
        le = LabelEncoder().fit(df['region'])
        # Context managers make sure the artefacts are flushed and closed.
        with open(scaler_path, 'wb') as scaler_file:
            pickle.dump(sc, scaler_file)
        with open(encoder_path, 'wb') as encoder_file:
            pickle.dump(le, encoder_file)
    else:
        # NOTE: pickle executes arbitrary code on load; only load artefacts
        # written by this notebook.
        with open(scaler_path, 'rb') as scaler_file:
            sc = pickle.load(scaler_file)
        with open(encoder_path, 'rb') as encoder_file:
            le = pickle.load(encoder_file)

    # Carry the caller's index over to the scaled values: joining on a fresh
    # 0..n-1 index would silently produce NaNs for any frame whose index was
    # not reset beforehand.
    scaled_df = pd.DataFrame(sc.transform(df[num_columns]),
                             columns=num_columns, index=df.index)
    df = df.drop(num_columns, axis=1).join(scaled_df)
    df['region'] = le.transform(df['region'])

    return df

In [None]:
# Fit the scaler/encoder on the training frame first, then reuse the saved
# transformers for the held-out and future frames (in that order).
train = preprocessing(train, train=True)
test, future = (preprocessing(frame, train=False) for frame in (test, future))

# Model fitting and Cross Validation

In [None]:
# Target, the date of each row (kept aside for reporting), and the feature matrix
non_feature_columns = ['date','averageprice','type']
X = train.drop(columns=non_feature_columns)
y = train['averageprice']
date = train['date']

In [None]:
# XGBoost hyper-parameters used for cross validation. n_estimators is set
# high because training is cut short by early stopping.
params = dict(
    learning_rate=0.05,
    n_estimators=10000,
    colsample_bytree=0.8,
    gamma=0.3,
    max_depth=7,
    min_child_weight=4,
    subsample=0.6,
)


In [None]:
# Time-series cross validation: each fold trains on an initial stretch of rows
# and validates on the rows that follow it.
folds = TimeSeriesSplit(n_splits=5)
predictions=[]
for i, (train_index,test_index) in enumerate(folds.split(X)):
    # split() yields *positional* indices, so every object is sliced with
    # .iloc. Plain y[train_index] is label-based and only gives the same rows
    # while the index happens to be 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    date_train, date_test = date.iloc[train_index], date.iloc[test_index]
    eval_set =  [(X_test, y_test)]
    model = XGBRegressor(**params)
    # NOTE(review): eval_metric / early_stopping_rounds as fit() arguments were
    # deprecated in XGBoost 1.6 and removed in 2.0; on newer versions pass
    # them to the XGBRegressor constructor instead.
    # NOTE(review): early stopping monitors the same fold that is scored
    # below, so the reported RMSE is likely optimistic.
    model.fit(X_train,y_train,eval_metric="rmse", eval_set=eval_set, early_stopping_rounds=100,verbose=200)
    pred=model.predict(X_test)
    print(f"Validation RMSE for fold {i}: {round(np.sqrt(mean_squared_error(y_test,pred)),3)}\n")
    predictions.append(pd.DataFrame({'fold':'fold_'+str(i),'date': date_test,'region':X_test.region,'actual_price':y_test, 'predicted_price':pred}))

In [None]:
# Combine the per-fold prediction frames into a single frame.
all_preds = pd.concat(predictions)

In [None]:
# Reload the fitted region encoder (closing the file afterwards) and turn the
# integer region codes back into region names for plotting.
with open('region_label_encoding.p', 'rb') as encoder_file:
    le = pickle.load(encoder_file)
all_preds.region = le.inverse_transform(all_preds.region)
train.region = le.inverse_transform(train.region)

# Visualization of Prediction - Holdout

In [None]:
# Region whose predictions are plotted in the cells below
region = "Denver"

In [None]:
# Rows of the selected region: full training history and the last CV fold
region_history = train[train.region == region]
last_fold = all_preds[(all_preds.fold == 'fold_4') & (all_preds.region == region)]

train_dates, train_values = region_history['date'], region_history['averageprice']
test_dates = last_fold['date']
test_values = last_fold['actual_price']
test_predictions = last_fold['predicted_price']

# RMSE of the last fold for this region
rmse = round(np.sqrt(mean_squared_error(test_values, test_predictions)), 3)

# History in blue, predictions in green, actual prices in red
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax.plot(train_dates, train_values, color='blue', label='Training Data')
ax.plot(test_dates, test_predictions, color='green', marker='o', label='Predicted Price')
ax.plot(test_dates, test_values, color='red', label='Actual Price')
ax.set(
    title=f'{region} region -  Avocado Prices Prediction - {avocado_type} \nRMSE: {rmse}',
    xlabel='Dates',
    ylabel='Prices',
)
ax.legend();

# Final Model

In [None]:
# Final model: refit on the whole training set with a fixed number of trees
# (no early stopping), then score the held-out test period.
params = dict(
    learning_rate=0.05,
    n_estimators=200,
    colsample_bytree=0.8,
    gamma=0.3,
    max_depth=7,
    min_child_weight=4,
    subsample=0.6,
)
model = XGBRegressor(**params)
model.fit(X, y)

# Select the test columns in the same order the model was trained on
test_features = test[X.columns]
test_preds = model.predict(test_features)
test['predicted_price'] = test_preds

# Visualization of prediction - Test

In [None]:
# Map the integer region codes back to region names for plotting
for frame in (X, test):
    frame['region'] = le.inverse_transform(frame['region'])

In [None]:
# Rows of the selected region in the training history and in the test period
region_history = train[train.region == region]
region_test = test[test.region == region]

train_dates, train_values = region_history['date'], region_history['averageprice']
test_dates = region_test['date']
test_values = region_test['averageprice']
test_predictions = region_test['predicted_price']

# RMSE on the test period for this region
rmse = round(np.sqrt(mean_squared_error(test_values, test_predictions)), 3)

# History in blue, predictions in green, actual prices in red
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax.plot(train_dates, train_values, color='blue', label='Training Data')
ax.plot(test_dates, test_predictions, color='green', marker='o', label='Predicted Price')
ax.plot(test_dates, test_values, color='red', label='Actual Price')
ax.set(
    title=f'{region} region -  Avocado Prices Prediction - {avocado_type} \nRMSE: {rmse}',
    xlabel='Dates',
    ylabel='Prices',
)
ax.legend();

# Feature Importance

In [None]:
# Gain-based feature importance of the final model, most important first
importance = model.get_booster().get_score(importance_type= 'gain')
importance_df = pd.DataFrame(list(importance.items()), columns = ['feature','importance'])
importance_df = importance_df.sort_values('importance',ascending=False)
plt.figure(figsize=(8,10));
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
sns.barplot(x=importance_df.importance, y=importance_df.feature);

> **The important features turned out to be the lag values and moving averages of price. The volume and bag counts are in the top 10 features, but their effects are rather small. This suggests that classic time series models would be better at predicting the price.**

# Shapley values

In [None]:
import shap
import warnings

# Tree explainer for the final model and its baseline (expected) prediction
explainer = shap.TreeExplainer(model)
expected_value = explainer.expected_value
# If the explainer reports a list of baselines, use the second entry
expected_value = expected_value[1] if isinstance(expected_value, list) else expected_value
print(f"Explainer expected value: {expected_value}")

In [None]:
# The model was trained on integer region codes, so re-encode the region
# names before handing the test features to the explainer.
test['region'] = le.transform(test['region'])
X_features = test[X.columns]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    shap_values = explainer.shap_values(X_features)
    shap_interaction_values = explainer.shap_interaction_values(X_features)

# If a list of interaction arrays comes back, keep the second entry
shap_interaction_values = (
    shap_interaction_values[1]
    if isinstance(shap_interaction_values, list)
    else shap_interaction_values
)

In [None]:
# SHAP decision plot for the test-set rows, starting from the explainer's
# expected value.
shap.decision_plot(expected_value, shap_values, X_features)

> **Shapley value analysis also shows that time-related variables are more important for the model in predicting the prices, especially the lag values.**

# Future Prediction

In [None]:
# Forecast the future rows, selecting columns in the model's training order
future_features = future[X.columns]
future_preds = model.predict(future_features)
future['predicted_price'] = future_preds

In [None]:
# Map the integer region codes back to region names for plotting
for frame in (test, future):
    frame['region'] = le.inverse_transform(frame['region'])

In [None]:
# Price history and forecast for the selected region
train_dates = final_train.loc[(final_train.region==region), 'date']
future_dates = future.loc[(future.region==region), 'date']
train_values = final_train.loc[(final_train.region==region), 'averageprice']
future_values = future.loc[(future.region==region), 'predicted_price']

# No error metric is shown here: the future rows have no actual prices, so
# the previous hard-coded "RMSE: 0" in the title was misleading.
fig, ax = plt.subplots(1, 1, figsize=(15, 8));
ax.plot(train_dates,train_values, color='blue', label='Training Data');
ax.plot(future_dates, future_values, color='green', marker='o',label='Future Price');

ax.set_title(f'{region} region -  Avocado Prices Prediction - {avocado_type}');
ax.set_xlabel('Dates');
ax.set_ylabel('Prices');
ax.legend();

In [None]:
# Forecast table for the selected region
forecast_columns = ['date','region','type','predicted_price']
future.loc[future['region'] == region, forecast_columns]