# APPLE GOLDEN DELICIOUS PREDICTION
***

Our task is to predict the average price per kilogram (`avg_price_per_kg`) of the APPLE GOLDEN DELICIOUS commodity.

# GETTING STARTED

Importing necessary libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy
import pickle
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, RandomizedSearchCV

from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import metrics
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.ensemble import StackingRegressor
from scipy.stats import randint as sp_randint

import warnings
warnings.filterwarnings(action='ignore')


In [None]:
# get data

# Load the raw training and test sets from CSV files in the working directory.
train = pd.read_csv('df - train_set.csv')
test = pd.read_csv('df - test_set.csv')

# Keep only the Golden Delicious rows of the training set; the commodity column
# is constant afterwards, so it is removed.
train = train[(train['Commodities'] == 'APPLE GOLDEN DELICIOUS')]
del train['Commodities']
# NOTE(review): the test set is not filtered by commodity before the column is
# removed - presumably it only contains Golden Delicious rows; confirm.
del test['Commodities']

train.head()

**EXPLORATORY DATA ANALYSIS**

In [None]:
train.shape

In [None]:
test.shape

In [None]:
test.head()

In [None]:
# getting info
train.info()

In [None]:
train['Province'].unique()

In [None]:
train['Container'].unique()

In [None]:
train['Size_Grade'].unique()

In [None]:
train.describe()

**Pairplot**

We use a pairplot to visualise the pairwise relationships between the numeric variables of the training set.

In [None]:
sns.pairplot(train)

**Heatmap**

We used a heatmap to understand correlation between variables.

In [None]:
# Correlate the numeric columns only. The frame still contains non-numeric
# columns at this point (e.g. Province and Container), and DataFrame.corr()
# raises on them in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, cmap='magma')

- According to the correlation matrix above, Low price, High price, Sales total, Total kg sold, Stock on hand and Total Qty sold are highly correlated with one another.
- Highly correlated variables carry overlapping information, which can bias the model towards them.

**Distribution Plot**

Understanding the distribution of our target 


In [None]:
# Histogram of the target with a kernel density estimate overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are the suggested replacements) - confirm the installed version.
sns.distplot(train['avg_price_per_kg'],kde =True)

The data is normally distributed

Preprocessing categorical data

In [None]:
def onehot_encode(df, column):
    """Return a copy of ``df`` with ``column`` replaced by one-hot indicator columns.

    The indicator columns are named ``<column>_<value>`` and are appended after
    the existing columns. The first category is dropped (``drop_first=True``),
    so it is represented by all indicators being zero. ``df`` itself is not
    modified.
    """
    indicators = pd.get_dummies(df[column], prefix=column, drop_first=True)
    widened = pd.concat([df.copy(), indicators], axis=1)
    return widened.drop(columns=column)

In [None]:
def preprocessing_inputs(df, return_df=False):
    """Encode the raw columns numerically for exploratory analysis.

    Steps, applied to a copy of ``df``:

    * ``Province``: spaces, dots and hyphens are replaced by underscores.
    * ``Date``: split into ``year`` (2019 -> 0, 2020 -> 1), ``month`` and
      ``day``; the original column is dropped.
    * ``Province`` and ``Container``: one-hot encoded, first category dropped.
    * ``Size_Grade``: mapped to the hand-picked ordinal codes below.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Province``, ``Container``, ``Size_Grade`` and ``Date``.
    return_df : bool, default False
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The encoded copy; ``df`` itself is not modified.
    """
    df = df.copy()

    # CLEAN PROVINCE COLUMN
    # regex=False: the patterns are literal characters. Interpreted as a
    # regular expression, '.' would match every character.
    for char in (' ', '.', '-'):
        df['Province'] = df['Province'].str.replace(char, '_', regex=False)

    # DATE ENCODING
    # Read year, month and day straight off the parsed datetimes. Formatting
    # the dates as '%d.%m.%Y' strings and re-parsing them (as before) reads
    # ambiguous dates such as 05.03.2020 month-first and swaps day and month.
    dates = pd.to_datetime(df['Date'])
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day

    df = df.drop(['Date'], axis=1)

    # BINARY ENCODING
    df['year'] = df['year'].replace({2020: 1, 2019: 0})

    # ONE-HOT ENCODING
    # Same result as calling onehot_encode on each column in turn: indicator
    # columns are appended in this order and the first category is dropped.
    df = pd.get_dummies(df, columns=['Province', 'Container'], drop_first=True)

    # ORDINAL ENCODING
    df['Size_Grade'] = df['Size_Grade'].replace({
        '1X': 9,
        '1L': 8,
        '1M': 7,
        '1S': 6,
        '1U': 0,
        '2X': 3,
        '2L': 5,
        '2M': 4,
        '2S': 1,
        '2U': 2
    })

    return df

**Let's check for outliers.**

In [None]:
X = preprocessing_inputs(train)

In [None]:
# Work on the predictors only (the target column is excluded).
out_df = X.drop('avg_price_per_kg', axis=1).copy()

# NOTE: despite its name, this list holds the columns with MORE than two
# distinct values, i.e. everything except the binary (0/1) encoded columns.
categorical_columns = [
    column for column in out_df.columns if len(out_df[column].unique()) > 2
]
# This is to make sure we don't draw boxplots for the binary encoded columns
plt.figure(figsize=(20, 20))

# One boxplot per selected column on a 5x5 grid (room for at most 25 columns).
for i, column in enumerate(categorical_columns):
    plt.subplot(5, 5, i + 1)
    sns.boxplot(data=out_df[column].values, color='darkviolet')
    plt.title(column)

plt.suptitle("Boxplots With Outliers", size=30)
plt.show()

- As we can see from the boxplots above, the following features may contain outliers: weight kg, low price, high price, sales total, total qty sold, total kg sold and stock on hand.
- We will use z-scores for our analysis.
- Any observation whose z-score lies outside a chosen threshold will be considered an outlier.
- This assumes that the variable follows a normal distribution.
- The further a z-score is from 0 (i.e. the mean), the more extreme the value is and the more likely it is to be an outlier.

In [None]:
def remove_outliers(df, columns, threshold, asxis=0):
    """Drop every row that is a z-score outlier in at least one of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns : list of str
        Numeric columns to screen for outliers.
    threshold : float
        Two-sided tail probability of the standard normal distribution. A
        value is an outlier when its z-score is below the ``threshold / 2``
        quantile or at/above the ``1 - threshold / 2`` quantile.
    asxis : int, default 0
        Unused (the name looks like a typo of ``axis``); kept so existing
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, with a fresh 0..n-1 index.
    """
    # Lower and upper bounds on the Z distribution for the given threshold.
    lower_bound = scipy.stats.norm.ppf(q=threshold / 2, loc=0, scale=1)
    upper_bound = scipy.stats.norm.ppf(q=1 - threshold / 2, loc=0, scale=1)

    # Z-scores of the screened columns, computed column by column.
    screened = df.loc[:, columns]
    zscores = pd.DataFrame(scipy.stats.zscore(screened.to_numpy(), axis=0),
                           index=screened.index,
                           columns=screened.columns)

    # True for every row with at least one value outside the bounds.
    is_outlier = ((zscores < lower_bound) |
                  (zscores >= upper_bound)).any(axis=1).to_numpy()

    # Filter by position rather than by index label: dropping labels would
    # also remove non-outlier rows that share a label with an outlier.
    return df.loc[~is_outlier].reset_index(drop=True)

In [None]:
# PREPROCESSING
seed = 1


def _move_target_last(df, target='avg_price_per_kg'):
    """Return ``df`` with the ``target`` column moved to the last position."""
    ordered = [col for col in df.columns if col != target] + [target]
    return df.reindex(columns=ordered)


def preprocess_inputs(df, return_df=False):
    """Clean and encode a raw frame and, for training data, split it.

    Steps, applied to a copy of ``df``:

    * ``Province``: spaces, dots and hyphens are replaced by underscores.
    * ``Date``: split into ``year`` (2019 -> 0, 2020 -> 1), ``month`` and
      ``day``; the original column is dropped.
    * ``Province`` and ``Container``: one-hot encoded via ``onehot_encode``.
    * ``Size_Grade``: ordinal-encoded with ``OrdinalEncoder``.
    * If ``avg_price_per_kg`` is present: z-score outliers are removed and the
      target is moved to the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training or test frame.
    return_df : bool, default False
        If true, return the processed frame instead of a train/test split.

    Returns
    -------
    pandas.DataFrame
        When ``return_df`` is true or ``df`` has no ``avg_price_per_kg`` column.
    tuple
        Otherwise ``(X_train, X_test, y_train, y_test)``: an unshuffled
        65 % / 35 % split, with the targets as NumPy arrays.
    """
    df = df.copy()

    # CLEAN PROVINCE COLUMN
    # regex=False: the patterns are literal characters. Interpreted as a
    # regular expression, '.' would match every character.
    for char in (' ', '.', '-'):
        df['Province'] = df['Province'].str.replace(char, '_', regex=False)

    # DATE ENCODING
    # Read year, month and day straight off the parsed datetimes. Formatting
    # the dates as '%d.%m.%Y' strings and re-parsing them (as before) reads
    # ambiguous dates such as 05.03.2020 month-first and swaps day and month.
    dates = pd.to_datetime(df['Date'])
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day

    df = df.drop(['Date'], axis=1)

    # BINARY ENCODING
    df['year'] = df['year'].replace({2020: 1, 2019: 0})

    # ONE-HOT ENCODING
    for column in ['Province', 'Container']:
        df = onehot_encode(df, column)

    # ORDINAL ENCODING
    # NOTE(review): the encoder (like the one-hot encoding above) is fitted on
    # whatever frame is passed in, so train and test frames only receive the
    # same codes/columns if they contain the same categories - confirm.
    enc = OrdinalEncoder()
    df[['Size_Grade']] = enc.fit_transform(df[['Size_Grade']])

    has_target = 'avg_price_per_kg' in df.columns

    if has_target:
        # REMOVE OUTLIERS from Train set (only frames carrying the target)
        df = remove_outliers(df=df,
                             columns=[
                                 'Weight_Kg', 'Low_Price', 'High_Price',
                                 'Sales_Total', 'Total_Qty_Sold',
                                 'Total_Kg_Sold', 'Stock_On_Hand'
                             ],
                             threshold=0.0000000000001)

        # REORDER COLUMNS SO THAT OUR DEPENDENT VARIABLE IS THE LAST COLUMN
        df = _move_target_last(df)

    # Frames without a target (e.g. the test set) are always returned whole.
    if return_df or not has_target:
        return df

    # SPLIT DATA INTO PREDICTORS AND TARGET
    y = np.array(df['avg_price_per_kg'])
    X = df.drop('avg_price_per_kg', axis=1)

    # TRAIN TEST SPLIT
    # shuffle=False keeps the row order: the last 35 % of rows form the test part.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.35,
                                                        shuffle=False,
                                                        random_state=seed)

    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = preprocess_inputs(train)

In [None]:
# TRAIN A FEW MODELS

# Candidate regressors keyed by display name. The names are left-padded with
# spaces so the printed scores line up; later cells look models up by these
# exact strings.
models = {
    "                         Decision Tree": DecisionTreeRegressor(random_state= seed),
    "                         Random Forest": RandomForestRegressor(min_samples_leaf= 1, n_estimators = 500, random_state= seed, max_depth = 13),
    "                     Gradient Boosting": GradientBoostingRegressor(learning_rate=0.18, n_estimators=6000, random_state= seed, max_depth =2),
    "                               XGBoost": XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1,
            objective='reg:squarederror',n_estimators=6000, learning_rate=0.3, random_state= seed),
    "                     CatBoostRegressor": CatBoostRegressor(verbose=0, learning_rate=0.09, depth = 4, iterations= 7000),
    "                         LGBMRegressor": LGBMRegressor()
}

# Fit every candidate on the training part of the split (all features).
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

In [None]:
#EVALUATE MODEL ON R SQUARED - HIGHER IS BETTER

# Score every fitted model on the held-out part of the split.
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))
    

In [None]:
#EVALUATE MODEL ON RMSE - LOWER IS BETTER

for name, model in models.items():
    # Error on the held-out part of the split.
    y_pred = model.predict(X_test)
    print(name + " Test RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_test ,y_pred))))
    
    # Error on the data the model was fitted on, for comparison with the test error.
    y_train_pred = model.predict(X_train)
    print(name + " Train RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train))))
    
    errors = abs(y_pred - y_test)

    # Display the performance metrics
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'Rand.')

    # "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
    # NOTE(review): the division assumes no value in y_test is zero - confirm.
    mape = np.mean(100 * (errors / y_test))
    accuracy = 100 - mape

    print('Accuracy:', round(accuracy, 2), '%.')

# FEATURE SELECTION

In [None]:
# FEATURE IMPORTANCES

# Column names as produced by the preprocessing when no target column is present.
df = train.drop('avg_price_per_kg', axis = 1)
features = preprocess_inputs(df)
feature_list = list(features.columns)


for name, model in models.items():
    # Only the XGBoost model's importances are reported.
    if name != "                               XGBoost":
        continue

    # Numerical feature importances of the fitted model
    importances = list(model.feature_importances_)

    # (variable, importance) pairs, most important first
    feature_importances = sorted(
        ((feature, round(importance, 2))
         for feature, importance in zip(feature_list, importances)),
        key=lambda entry: entry[1],
        reverse=True,
    )

    # Print out the feature and importances
    for pair in feature_importances:
        print('Variable: {:20} Importance: {}'.format(*pair))



In [None]:
#SELECTING IMPORTANT FEATURES

df = preprocess_inputs(train, return_df=True)

y = df['avg_price_per_kg']
X = df.drop('avg_price_per_kg', axis=1)

# GET IMPORTANT COLUMN NAMES
# For every chosen name, collect all processed columns containing it
# (substring match), giving one sub-list per name.
important = [
    [col for col in df.columns if wanted in col]
    for wanted in ('Total_Kg_Sold', 'Container_IA400', 'Container_M4183',
                   "Container_JE090", 'Container_JG110', 'Weight_Kg',
                   'Total_Qty_Sold', 'High_Price', 'Sales_Total',
                   'Stock_On_Hand')
]

# Flatten the sub-lists into a single list of column names.
important_list = [item for sublist in important for item in sublist]

# IMPORTANT DATAFRAME

X_imp = X[important_list]
print(important_list)


In [None]:
# TEST NEW MODEL WITH IMPORTANT FEATURES ONLY

# Same unshuffled 65 % / 35 % split as before, now on the reduced feature set.
X_imp_train, X_imp_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)

# Same XGBoost hyper-parameters as in the model comparison above.
xgb = XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1,
            objective='reg:squarederror',n_estimators=6000, learning_rate=0.3, random_state= seed)
xgb.fit(X_imp_train, y_train)
print("Trained.")

In [None]:
# CHECK PERFORMANCE METRICS

pred = xgb.predict(X_imp_test)

errors = abs(pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rand.')

# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape

print('Accuracy:', round(accuracy, 2), '%.')
print("Test RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_test ,pred))))
# Error on the training part, for comparison with the test error.
y_train_pred = xgb.predict(X_imp_train)
print("Train RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train))))

Let's compare all the models with their previous scores.

In [None]:
# TRAIN MODELS AGAIN BUT WITH THE SUBSET CREATED ABOVE

X_imp_train, X_imp_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)
# TRAIN A FEW MODELS

# Fresh, unfitted copies of the same candidates (same hyper-parameters as the
# first comparison); this rebinds `models`, replacing the earlier fitted ones.
models = {
    "                         Decision Tree": DecisionTreeRegressor(random_state= seed),
    "                         Random Forest": RandomForestRegressor(min_samples_leaf= 1, n_estimators = 500, random_state= seed, max_depth = 13),
    "                     Gradient Boosting": GradientBoostingRegressor(learning_rate=0.18, n_estimators=6000, random_state= seed, max_depth =2),
    "                               XGBoost": XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1,
            objective='reg:squarederror',n_estimators=6000, learning_rate=0.3, random_state= seed),
    "                     CatBoostRegressor": CatBoostRegressor(verbose=0, learning_rate=0.09, depth = 4, iterations= 7000),
    "                         LGBMRegressor": LGBMRegressor()
}

# Fit every candidate on the reduced feature set.
for name, model in models.items():
    model.fit(X_imp_train, y_train)
    print(name + " trained.")

In [None]:
#EVALUATE MODEL ON R SQUARED - HIGHER IS BETTER

# Score every refitted model on the held-out part of the reduced feature set.
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_imp_test, y_test)))
 

In [None]:
#EVALUATE MODEL ON RMSE - LOWER IS BETTER

# Re-create the same unshuffled split used to fit the models above.
X_imp_train, X_imp_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)

for name, model in models.items():
    # Error on the held-out part of the split.
    y_pred = model.predict(X_imp_test)
    print(name + " Test RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_test ,y_pred))))
    
    # Error on the data the model was fitted on, for comparison with the test error.
    y_train_pred = model.predict(X_imp_train)
    print(name + " Train RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error( y_train,y_train_pred))))
    
    errors = abs(y_pred - y_test)

    # Display the performance metrics
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'Rand.')

    # "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
    mape = np.mean(100 * (errors / y_test))
    accuracy = 100 - mape

    print('Accuracy:', round(accuracy, 2), '%.')

It appears that the majority of the best-performing models from our initial training have improved. Let's improve them a bit more.

## Ensemble Stacking

In [None]:
# NOW ENSEMBLE STACKING

# CHOOSE BEST MODELS FROM EARLIER SCORES

# Base learners: gradient boosting and XGBoost, with the hyper-parameters used earlier.
gb = GradientBoostingRegressor(learning_rate=0.18, n_estimators=6000, random_state= seed, max_depth =2)
xgb = XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1,
            objective='reg:squarederror',n_estimators=6000, learning_rate=0.3, random_state= seed)
# Meta-learner: another XGBoost model with the same hyper-parameters.
meta_learner_reg =  XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1, 
                                 objective='reg:squarederror', n_estimators=6000, learning_rate=0.3, random_state= seed)

models_4stacking = [("gb", gb),("xgb", xgb)]

# passthrough=True: the meta-learner receives the original features in
# addition to the base learners' predictions; cv=4 sets the internal
# cross-validation used to produce those predictions.
s_reg = StackingRegressor(estimators=models_4stacking, final_estimator= meta_learner_reg, passthrough = True, cv= 4)


In [None]:
# TRAIN ON THE DATAFRAME RESTRICTED TO THE MOST IMPORTANT COLUMNS (X_imp)

# Same unshuffled 65 % / 35 % split as in the earlier cells.
X_imp_train, X_imp_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)

s_reg.fit(X_imp_train,y_train)
print("Stacked model fitted.")

In [None]:
X_imp_train, X_imp_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)

In [None]:
# METRICS OF STACKING REGRESSOR

y_pred = s_reg.predict(X_imp_test)
rsq = s_reg.score(X_imp_test, y_test)
print("R^2 Score: ", rsq)

print("Test RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_test ,y_pred))))

# Predict with the stacked regressor here as well. `model` is only the leftover
# loop variable of the earlier comparison cells (the last entry of `models`),
# so using it reported another model's train RMSE.
y_train_pred = s_reg.predict(X_imp_train)
print("Train RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train))))
errors = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'Rand.')
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# SAVING STACKED REGRESSOR PREDICTIONS to CSV

# The test frame has no target column, so preprocess_inputs returns the
# encoded frame itself (no outlier removal, no split).
df = preprocess_inputs(test)
# Every column except the row identifier is a candidate predictor.
Xs = list(df.columns)
Xs.remove('Index')

# NOTE: this rebinds X_test, which earlier cells used for the validation split.
X_test = df[Xs]

# NOTE(review): this raises a KeyError if the one-hot encoding of the test
# frame did not produce every column named in important_list - confirm.
x_t = X_test[important_list]

y_pred = s_reg.predict(x_t)
d = pd.DataFrame(y_pred, columns =['avg_price_per_kg'])
# concat aligns on the index: `d` has a default 0..n-1 index, so this assumes
# `df` still has one too.
dff = pd.concat([df['Index'], d], axis=1)
dff = dff.set_index('Index')
dff.to_csv('StackeD.csv')




## KFold Split

In [None]:
# USING KFOLD

# Processed training frame (outliers removed, target as last column).
df = preprocess_inputs(train, return_df=True)

y = df['avg_price_per_kg']
X = df.drop('avg_price_per_kg', axis=1)

# Rebuild the list of important columns: for every chosen name, collect all
# processed columns containing it (substring match).
important = []
for i in [ 'Total_Kg_Sold', 'Container_IA400', 'Container_M4183', "Container_JE090", 'Container_JG110', 
          'Weight_Kg', 'Total_Qty_Sold', 'High_Price', 'Sales_Total', 'Stock_On_Hand']:
    A = [col for col in df.columns if i in col]
    important.append(A)
    
# Flatten the per-name sub-lists into a single list of column names.
important_list = [item for sublist in important for item in sublist]

# IMPORTANT DATAFRAME
X_imp = X[important_list]

def split_data_kf(df,K):
    """Return the K-fold (train, test) index pairs for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame containing the columns named in the module-level
        ``important_list``.
    K : int
        Number of folds.

    Returns
    -------
    list of tuple
        ``K`` pairs ``(train_index, test_index)`` of positional row indices,
        in the order produced by an unshuffled ``KFold``.
    """
    X = df[important_list]  # SUBSET OF IMPORTANT COLUMNS

    kf = KFold(n_splits=K, shuffle = False)
    return list(kf.split(X))


def get_best_kfmodel(df,data_indices):
    """Fit one stacking regressor per fold and return the best one.

    "Best" means the lowest RMSE on the fold's own test rows; on ties the
    earliest fold wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame with ``avg_price_per_kg`` and the columns named in the
        module-level ``important_list``.
    data_indices : list of tuple
        ``(train_index, test_index)`` pairs of positional row indices, e.g.
        from ``split_data_kf``.

    Returns
    -------
    StackingRegressor
        The model fitted on the winning fold's training rows. It is fitted on
        NumPy arrays, so it should also be given arrays when predicting.

    Raises
    ------
    ValueError
        If ``data_indices`` is empty.

    Notes
    -----
    The winning model is kept from the selection loop instead of being fitted
    a second time on the same rows with the same settings.
    NOTE(review): selecting by test-fold RMSE makes that fold's score an
    optimistic estimate of the model's performance.
    """
    y = df['avg_price_per_kg']
    X = df[important_list]

    best_model = None
    best_rmse = None

    for (train_indices,test_indices) in data_indices:
        X_train, y_train = X.iloc[train_indices,:],y.iloc[train_indices]
        X_test, y_test = X.iloc[test_indices,:], y.iloc[test_indices]

        model = StackingRegressor(estimators=models_4stacking, final_estimator= meta_learner_reg, passthrough = True, cv= 4)
        model.fit(X_train.values, y_train.values)

        y_pred = model.predict(X_test.values)
        rmse = np.sqrt(metrics.mean_squared_error(y_test.values ,y_pred))

        # Strict '<' keeps the earliest fold on ties.
        if best_rmse is None or rmse < best_rmse:
            best_model, best_rmse = model, rmse

    if best_model is None:
        raise ValueError("data_indices must contain at least one split")

    return best_model


    
# GET THE KFOLD SPLIT ON WHICH THE MODEL WAS TRAINED
def get_best_split(df,data_indices, model):
    """Recover the K-fold split that ``model`` was trained on.

    The split is identified as the one whose training rows ``model`` fits
    best (lowest train RMSE). Only for that split has the model seen all of
    the training rows; every other split's training part contains rows that
    were held out when the model was fitted.

    Selecting by the lowest test RMSE instead (as this function did before)
    favours splits whose test rows were part of the model's training data, so
    the returned "test" set would not be unseen data.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame with ``avg_price_per_kg`` and the columns named in the
        module-level ``important_list``.
    data_indices : list of tuple
        ``(train_index, test_index)`` pairs of positional row indices, e.g.
        from ``split_data_kf``.
    model : fitted regressor
        Model returned by ``get_best_kfmodel`` (possibly unpickled).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test, y_train, y_test)`` for the identified split.

    Raises
    ------
    ValueError
        If ``data_indices`` is empty.
    """
    y = df['avg_price_per_kg']
    X = df[important_list]

    train_rmse = []

    for (train_indices, _) in data_indices:
        X_train, y_train = X.iloc[train_indices,:],y.iloc[train_indices]
        y_train_pred = model.predict(X_train.values)
        train_rmse.append(
            np.sqrt(metrics.mean_squared_error(y_train.values, y_train_pred)))

    # min() raises ValueError on an empty list.
    best = train_rmse.index(min(train_rmse))
    best_indicies = data_indices[best]

    X_train, y_train = X.iloc[best_indicies[0],:],y.iloc[best_indicies[0]]
    X_test, y_test = X.iloc[best_indicies[1],:], y.iloc[best_indicies[1]]

    return X_train.values, X_test.values, y_train.values, y_test.values
              

In [None]:
# Lets Find our best model

# Processed training frame -> 4 unshuffled folds -> the stacking regressor
# that get_best_kfmodel selects among the per-fold models.
train_df = preprocess_inputs(train, return_df = True)
data_indices = split_data_kf(train_df,4)
# NOTE: this rebinds `model`, which earlier cells used as a loop variable.
model = get_best_kfmodel(train_df,data_indices)

print('Best Stacking Regressor Trained.')

In [None]:
# Lets save our model
#  #Pickling

# NOTE(review): open() does not create missing directories, so
# "assets/trained-models/" must already exist relative to the working directory.
model_save_path = "assets/trained-models/stacked_kfoldarr.pkl"
with open(model_save_path,'wb') as file:
    pickle.dump(model,file)


In [None]:
# Unpickle the model

# Loads the file written by the previous cell. Unpickling can execute
# arbitrary code, so only load pickle files from a trusted source.
model_load_path = "assets/trained-models/stacked_kfoldarr.pkl"
with open(model_load_path,'rb') as file:
    unpickled_model = pickle.load(file)



In [None]:
# GET METRICS OF STACKING REGRESSOR MODEL

# Rebuild the processed frame and the same 4 unshuffled folds as used for training.
df = preprocess_inputs(train, return_df = True)
data_indices = split_data_kf(df,4)

# get_best_split returns NumPy arrays for the split it selects for this model.
X_train, X_test, y_train, y_test = get_best_split(df, data_indices, unpickled_model)
# NOTE: this rebinds s_reg to the unpickled K-fold model.
s_reg = unpickled_model 

y_pred = s_reg.predict(X_test)
rsq = s_reg.score(X_test, y_test)
print("R^2 Score: ", rsq)

print("Test RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_test ,y_pred))))

y_train_pred = s_reg.predict(X_train)
print("Train RMSE: {:.5f}".format(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train))))
errors = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'Rand.')
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Predict with best model
# The test frame has no target column, so preprocess_inputs returns the
# encoded frame itself (no outlier removal, no split).
df = preprocess_inputs(test)
# Every column except the row identifier is a candidate predictor.
Xs = list(df.columns)
Xs.remove('Index')

X_test = df[Xs]
# NOTE(review): this raises a KeyError if the one-hot encoding of the test
# frame did not produce every column named in important_list - confirm.
X_Test_imp = X_test[important_list]

# .values: the K-fold model was fitted on NumPy arrays.
y_pred = unpickled_model.predict(X_Test_imp.values)
d = pd.DataFrame(y_pred, columns =['avg_price_per_kg'])
# concat aligns on the index: `d` has a default 0..n-1 index, so this assumes
# `df` still has one too.
dff = pd.concat([df['Index'], d], axis=1)
dff = dff.set_index('Index')
dff.to_csv('stackedkfoldarr.csv')

In [None]:
# Assess model accuracy with plot of the predicted y values against the actual y values from x test split
# Stacking Regressor of Gradient Boosting and XGBooster Regressor - our best model

df = preprocess_inputs(train, return_df = True)
data_indices = split_data_kf(df,4)

# Evaluate on the K-fold split selected for this model, exactly as the metrics
# cell above does. The model was fitted on a K-fold split, so an unrelated
# chronological 65/35 split (used here before) overlaps its training rows, and
# `data_indices` was computed without ever being used.
X_train, X_test, y_train, y_test = get_best_split(df, data_indices, unpickled_model)
s_reg = unpickled_model 

y_pred = s_reg.predict(X_test)
rsq = round(s_reg.score(X_test, y_test), 5)
test_rmse = round(np.sqrt(metrics.mean_squared_error(y_test ,y_pred)), 5)
y_train_pred = s_reg.predict(X_train)
train_rmse = round(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train)), 5)

errors = abs(y_pred - y_test)
MAE = round(np.mean(errors), 2)
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = round((100 - mape), 2)


# Summary shown underneath the plot title.
stats = f"test RMSE = {test_rmse} \n " + \
          f"train RMSE = {train_rmse} \n" + \
          f"R^2 Score = {rsq} \n" + \
         f"MAE = {MAE} \n" + \
         f"Accuracy = {accuracy}" 


fig, ax = plt.subplots(figsize=(12,12))
# Red diagonal: where perfect predictions would lie.
ax.plot(y_test, y_test, 'r')
g=ax.scatter(y_test, y_pred, s = 9)
g.axes.set_xlabel('True Values')
g.axes.set_ylabel('Predictions')
g.axes.set_title('Predictions vs Actual\n \
                Stacking Regressor of \
                 \n Gradient Boosting and XGBooster Regressor\n \n' + stats)


In [None]:
# lets compare the performance of this stacked model to the individual models performances 

#first lets train them again

df = preprocess_inputs(train, return_df=True)
y = df['avg_price_per_kg']
X = df.drop('avg_price_per_kg', axis=1)
# Explicit list of the selected feature columns (replaces the list built by
# substring matching in the earlier cells).
important_list = ['Total_Kg_Sold', 'Container_IA400', 'Container_M4183', 'Container_JE090', 
 'Container_JG110', 'Weight_Kg', 'Total_Qty_Sold', 'High_Price', 'Sales_Total', 'Stock_On_Hand']
X_imp = X[important_list]

# Unshuffled 65 % / 35 % split: the last 35 % of rows form the test part.
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.35, shuffle=False, random_state=seed)


# The two base learners of the stacked model, fitted on their own for comparison.
gb = GradientBoostingRegressor(learning_rate=0.18, n_estimators=6000, random_state= seed, max_depth =2)
xgb = XGBRegressor(max_depth=2,min_child_weight=13,subsample=1,colsample_bytree=1,
            objective='reg:squarederror',n_estimators=6000, learning_rate=0.3, random_state= seed)

xgb.fit(X_train, y_train)

gb.fit(X_train, y_train)
print('Trained.')

In [None]:
# Assess model accuracy with plot of the predicted y values against the actual y values from x test split
# Gradient Boosting Regressor

# NOTE: this rebinds `model` to the gradient boosting regressor fitted above.
model = gb

y_pred = model.predict(X_test)
rsq = round(model.score(X_test, y_test), 5)
test_rmse = round(np.sqrt(metrics.mean_squared_error(y_test ,y_pred)), 5)
y_train_pred = model.predict(X_train)
train_rmse = round(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train)), 5)

errors = abs(y_pred - y_test)
MAE = round(np.mean(errors), 2)
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = round((100 - mape), 2)

# Summary shown underneath the plot title.
stats = f"test RMSE = {test_rmse} \n " + \
          f"train RMSE = {train_rmse} \n" + \
          f"R^2 Score = {rsq} \n" + \
         f"MAE = {MAE} \n" + \
         f"Accuracy = {accuracy}" 

fig, ax = plt.subplots(figsize=(12,12))
# Red diagonal: where perfect predictions would lie.
ax.plot(y_test, y_test, 'r')
g=ax.scatter(y_test, y_pred, s = 9)
g.axes.set_xlabel('True Values')
g.axes.set_ylabel('Predictions')
g.axes.set_title('Predictions vs Actual \n Gradient Boosting Regressor\n \n' + stats)



In [None]:
# Assess model accuracy with plot of the predicted y values against the actual y values from x test split
# XGBooster Regressor

# NOTE: this rebinds `model` to the XGBoost regressor fitted above.
model = xgb

y_pred = model.predict(X_test)
rsq = round(model.score(X_test, y_test), 5)
test_rmse = round(np.sqrt(metrics.mean_squared_error(y_test ,y_pred)), 5)
y_train_pred = model.predict(X_train)
train_rmse = round(np.sqrt(metrics.mean_squared_error(y_train_pred ,y_train)), 5)

errors = abs(y_pred - y_test)
MAE = round(np.mean(errors), 2)
# "Accuracy" here is 100 minus the mean absolute percentage error (MAPE).
mape = np.mean(100 * (errors / y_test))
accuracy = round((100 - mape), 2)


# Summary shown underneath the plot title.
stats = f"test RMSE = {test_rmse} \n " + \
          f"train RMSE = {train_rmse} \n" + \
          f"R^2 Score = {rsq} \n" + \
         f"MAE = {MAE} \n" + \
         f"Accuracy = {accuracy}" 

fig, ax = plt.subplots(figsize=(12,12))
# Red diagonal: where perfect predictions would lie.
ax.plot(y_test, y_test, 'r')
g=ax.scatter(y_test, y_pred, s = 9)
g.axes.set_xlabel('True Values')
g.axes.set_ylabel('Predictions')
g.axes.set_title('Predictions vs Actual \n XGBooster Regressor\n \n' + stats)