In [195]:
#STEP 1: IMPORT PYTHON LIBRARIES

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
 


import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sn


from sklearn.preprocessing import OneHotEncoder

In [196]:
#STEP 2: LOAD CSV DATA AS DATA FRAME 

# Read the raw training and test tables from the CSV files next to the notebook.
train_set = pd.read_csv('df-train_set.csv')
test_set = pd.read_csv('df-test_set.csv')

# Keep only the "APPLE GOLDEN DELICIOUS" rows of each table.
# The mask is applied to the frames loaded above (train_set / test_set); the
# earlier df_train_set / df_test_set names were never defined and raised NameError.
# .copy() makes the filtered frames independent of the full tables.
train = train_set[train_set.Commodities == "APPLE GOLDEN DELICIOUS"].copy()
test = test_set[test_set.Commodities == "APPLE GOLDEN DELICIOUS"].copy()


In [197]:
# Show the filtered training frame using the notebook's rich DataFrame display.
train

Unnamed: 0,Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg
1,CAPE,M4183,1L,18.3,APPLE GOLDEN DELICIOUS,2020-09-09,150.0,170.0,51710.0,332,6075.6,822,8.51
7,CAPE,JG110,2M,11.0,APPLE GOLDEN DELICIOUS,2020-04-14,50.0,50.0,16000.0,320,3520.0,0,4.55
24,W.CAPE-BERGRIVER ETC,JE090,2S,9.0,APPLE GOLDEN DELICIOUS,2020-04-16,55.0,55.0,990.0,18,162.0,1506,6.11
40,CAPE,M4183,1S,18.3,APPLE GOLDEN DELICIOUS,2020-05-04,80.0,120.0,32020.0,388,7100.4,443,4.51
69,EASTERN CAPE,IA400,1S,400.0,APPLE GOLDEN DELICIOUS,2020-09-28,1800.0,1800.0,1800.0,1,400.0,2,4.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64276,NATAL,EC120,1S,12.0,APPLE GOLDEN DELICIOUS,2020-03-04,96.0,96.0,1920.0,20,240.0,53,8.00
64291,ORANGE FREE STATE,M4183,1X,18.3,APPLE GOLDEN DELICIOUS,2020-02-22,130.0,130.0,7280.0,56,1024.8,74,7.10
64297,CAPE,JE090,2M,9.0,APPLE GOLDEN DELICIOUS,2020-04-22,50.0,60.0,4540.0,89,801.0,793,5.67
64304,CAPE,JG110,2M,11.0,APPLE GOLDEN DELICIOUS,2020-03-04,30.0,40.0,2140.0,70,770.0,0,2.78


# Preprocessing

In [198]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by one-hot indicator columns.

    The indicator columns are produced by ``pd.get_dummies`` and are named
    ``<column>_<value>``; they are appended after the existing columns and the
    source column is then removed. The frame passed in is not modified.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the categorical column to expand.

    Returns:
        A new DataFrame without ``column`` and with its indicator columns added.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=column)
    widened = pd.concat([df.copy(), indicator_cols], axis=1)
    return widened.drop(column, axis=1)

In [237]:
def preprocess_inputs(df):
    """Turn the raw price frame into scaled train/test features and targets.

    Steps, in order:
      1. Drop the columns that are not used as features.
      2. Replace ``Date`` with its month number.
      3. One-hot encode ``Province``, ``Container`` and ``Size_Grade``.
      4. Separate ``avg_price_per_kg`` as the target.
      5. Make an 80/20 shuffled split with ``random_state=1``.
      6. Standardise the features with a ``StandardScaler`` fitted on the
         training portion only.

    Args:
        df: DataFrame holding the columns named above; it is not modified.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The two feature frames keep
        their row index and column names after scaling.
    """
    unused = ['Sales_Total', 'Total_Kg_Sold', 'Low_Price', 'High_Price', 'Commodities']
    features = df.copy().drop(columns=unused)

    # Only the month number of the date is kept.
    features['Date'] = pd.to_datetime(features['Date']).dt.month

    for categorical in ('Province', 'Container', 'Size_Grade'):
        features = onehot_encode(features, categorical)

    target = features['avg_price_per_kg']
    inputs = features.drop('avg_price_per_kg', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        inputs, target, train_size=0.8, shuffle=True, random_state=1
    )

    # Fit on the training rows only, then apply the same transform to both parts.
    scaler = StandardScaler()
    scaler.fit(X_train)

    def _rescale(part):
        # Wrap the scaled array back into a frame with the original labels.
        return pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)

    X_train = _rescale(X_train)
    X_test = _rescale(X_test)

    return X_train, X_test, y_train, y_test

In [238]:
# Run the preprocessing pipeline on the filtered training frame; X_test / y_test
# here are the 20% hold-out carved out of `train` by preprocess_inputs.
X_train, X_test, y_train, y_test = preprocess_inputs(train)


In [236]:
# Print the column names, non-null counts and dtypes of the scaled training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 26321 to 43114
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Weight_Kg                      195 non-null    float64
 1   Date                           195 non-null    float64
 2   Total_Qty_Sold                 195 non-null    float64
 3   Stock_On_Hand                  195 non-null    float64
 4   Province_CAPE                  195 non-null    float64
 5   Province_EASTERN CAPE          195 non-null    float64
 6   Province_NATAL                 195 non-null    float64
 7   Province_ORANGE FREE STATE     195 non-null    float64
 8   Province_TRANSVAAL             195 non-null    float64
 9   Province_W.CAPE-BERGRIVER ETC  195 non-null    float64
 10  Province_WEST COAST            195 non-null    float64
 11  Container_AC030                195 non-null    float64
 12  Container_DT063                195 non-null 

In [260]:
# Preview the scaled training features with the Weight_Kg column left out.
x_see = X_train.drop(columns=['Weight_Kg'])
x_see

Unnamed: 0,Date,Total_Qty_Sold,Stock_On_Hand,Province_CAPE,Province_EASTERN CAPE,Province_NATAL,Province_ORANGE FREE STATE,Province_TRANSVAAL,Province_W.CAPE-BERGRIVER ETC,Province_WEST COAST,...,Size_Grade_1L,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
59983,-0.218219,-0.439932,0.212325,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,2.236928,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,-0.080296,-0.116775
24658,-0.589690,-0.467247,2.167230,-1.072445,-0.333215,-0.254563,-0.067116,-0.088017,1.846511,-0.264407,...,-0.359892,-0.447042,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,2.312040,-0.080296,-0.116775
57961,0.524725,0.819967,-0.485667,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,-0.447042,-0.608687,-0.035817,-0.307686,3.842874,-0.368916,-0.432518,-0.080296,-0.116775
34730,-1.332634,0.021007,-0.553214,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,-0.447042,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,12.453915,-0.116775
33743,0.896196,-0.460418,2.678472,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,-0.447042,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,2.312040,-0.080296,-0.116775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60943,-1.704106,0.311228,-0.535996,-1.072445,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,3.782053,...,2.778609,-0.447042,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,-0.080296,-0.116775
43975,-1.704106,0.031250,-0.542619,-1.072445,-0.333215,-0.254563,-0.067116,-0.088017,1.846511,-0.264407,...,2.778609,-0.447042,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,-0.080296,-0.116775
63993,1.267668,0.215625,0.705024,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,2.236928,-0.608687,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,-0.080296,-0.116775
11022,-1.332634,0.830211,-0.102897,0.932449,-0.333215,-0.254563,-0.067116,-0.088017,-0.541562,-0.264407,...,-0.359892,-0.447042,1.642879,-0.035817,-0.307686,-0.260222,-0.368916,-0.432518,-0.080296,-0.116775


# Training 

In [281]:
# Seed passed to every estimator that has a stochastic component, so that a
# Restart & Run All reproduces the same fitted models and scores. The value
# matches the random_state used for the train/test split in preprocess_inputs.
RANDOM_STATE = 1

# Candidate regressors keyed by the label printed in the training/results cells.
# Estimators without a random_state argument here are used with their defaults.
models = {
    "Linear Regression": LinearRegression(),
    "Linear Regression(L2 Regularization)": Ridge(),
    "Linear Regression(L1 Regularization)": Lasso(),
    "K-NearestNeighbors": KNeighborsRegressor(),
    "Neural Network": MLPRegressor(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=RANDOM_STATE),
    "Support Vector Machine(RBF Kernel)": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# The models are fitted without the 'Date' (month) feature; build that frame
# once instead of re-dropping the column on every loop iteration.
train_features = X_train.drop(['Date'], axis=1)

for name, model in models.items():
    model.fit(train_features, y_train)
    print(name + ' trained.')

Linear Regression trained.
Linear Regression(L2 Regularization) trained.
Linear Regression(L1 Regularization) trained.
K-NearestNeighbors trained.




Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
Support Vector Machine(RBF Kernel) trained.
Decision Tree trained.
Random Forest trained.
Gradient Boosting trained.


# Results

In [283]:
# Report each fitted model's R^2 on the hold-out split. 'Date' is dropped here
# because the models were fitted without it.
holdout_features = X_test.drop(columns=['Date'])

for name, model in models.items():
    r2 = model.score(holdout_features, y_test)
    print(f'{name} R^2 Score:{r2:.5f}')

Linear Regression R^2 Score:0.57475
Linear Regression(L2 Regularization) R^2 Score:0.57478
Linear Regression(L1 Regularization) R^2 Score:-0.00156
K-NearestNeighbors R^2 Score:0.54018
Neural Network R^2 Score:0.61009
Support Vector Machine (Linear Kernel) R^2 Score:0.58094
Support Vector Machine(RBF Kernel) R^2 Score:0.63275
Decision Tree R^2 Score:0.27176
Random Forest R^2 Score:0.63112
Gradient Boosting R^2 Score:0.63949


In [None]:
#SPLIT THE DATASET TO GET INPUT DATA AND OUTPUT DATA

# NOTE(review): this feature set drops 'Weight_Kg' and keeps 'Date', whereas the
# model comparison loop dropped 'Date' and kept 'Weight_Kg' — confirm which
# feature set is intended for the final model.
X = X_train.drop(['Weight_Kg'], axis = 1)
y = y_train

# NOTE(review): test_preprocess_inputs is not defined in the visible part of this
# notebook — confirm it exists, and that it applies the scaler fitted on the
# training data and produces the same columns as X_train.
x_test = test_preprocess_inputs(test)

#CREATING A MODEL AND TRAINING IT

# random_state pins the forest's bootstrap/feature sampling so the predictions
# are reproducible across kernel restarts (same seed as the train/test split).
model = RandomForestRegressor(random_state = 1)
model.fit(X,y)


#CREATING A DATAFRAME OF PREDICTIONS
# One row per test record: the test set's 'Index' value and the predicted price.
predictions2 = pd.DataFrame({ 'Index': test['Index'], 'avg_price_per_kg': model.predict(x_test.drop(['Weight_Kg'], axis = 1))})
predictions2
