<a href="https://colab.research.google.com/github/YousefBarty/ML-Projects/blob/main/MLPACK_avocado_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install mlpack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlpack
  Downloading mlpack-3.4.2-cp37-cp37m-manylinux1_x86_64.whl (93.0 MB)
[K     |████████████████████████████████| 93.0 MB 72 kB/s 
Installing collected packages: mlpack
Successfully installed mlpack-3.4.2


In [None]:
import mlpack
from mlpack import linear_regression
import pandas as pd
import numpy as np

In [None]:
# Load the avocado table from the working directory; the first CSV column is used as the row index.
avocadoData = pd.read_csv("avocado.csv", index_col=0)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
avocadoData.head()


Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [None]:
# Parse the Date column into timestamps, then order the rows oldest-first (in place).
avocadoData["Date"] = pd.to_datetime(avocadoData["Date"])
avocadoData.sort_values("Date", ascending=True, inplace=True)

In [None]:
# Per-date mean of every numeric column. numeric_only=True leaves out the text
# columns ("type", "region") explicitly; on pandas >= 2.0 the bare .mean() raises
# a TypeError when it reaches them.
meanDates = avocadoData.groupby("Date").mean(numeric_only=True)


In [None]:
# Columns used as model inputs. "Total Volume" and "Total Bags" are left out.
features = [
    "4046",
    "4225",
    "4770",
    "Small Bags",
    "Large Bags",
    "XLarge Bags",
    "type",
    "year",
    "region",
]


In [None]:
# Separate the model inputs (X) from the regression target (y).

X = avocadoData[features]
# The target is log(1 + AveragePrice), so the model is fitted in log space.
y = np.log1p(avocadoData["AveragePrice"])


In [None]:
# Utility functions for onehot encoding.

def one_hot_encode(data, dimensions, drop=False):
    """Append 0/1 indicator columns for the string-valued columns of ``data``.

    Args:
        data: DataFrame to encode. It is copied; the caller's frame is not modified.
        dimensions: Positional (integer) indices of the candidate columns.
        drop: If True, remove every column listed in ``dimensions`` from the
            result, including candidates that were not encoded.

    Returns:
        A new DataFrame with one integer ``"<column>_<value>"`` column appended
        for each distinct value of each encoded column.
    """
    encoded = data.copy()
    for dim in dimensions:
        column = encoded.iloc[:, dim]
        present = column.dropna()
        # Probe the first non-missing entry: an empty column has nothing to
        # encode, and a leading missing value must not hide a text column.
        if present.empty or not isinstance(present.iloc[0], str):
            continue
        name = encoded.columns[dim]
        for val in column.unique():
            # Vectorised comparison instead of a Python-level call per row.
            # A missing value never compares equal to anything, so it is
            # matched with isna() to get a usable "missing" indicator.
            hits = column.isna() if pd.isna(val) else column == val
            encoded[f"{name}_{val}"] = hits.astype(int)
    if drop:
        encoded = encoded.drop(columns=encoded.columns[dimensions])
    return encoded

In [None]:
# One-hot encode the text columns, looking their positions up by name.
# Encoding from avocadoData[features] rather than from X keeps this cell safe to
# re-run: on an already-encoded X, positions 6 and 8 point at different columns,
# which would then be dropped.
categorical = ["type", "region"]
X = one_hot_encode(avocadoData[features], [features.index(name) for name in categorical], True)


In [None]:
# Hold out a random 20% of the rows for testing.
# The frame was sorted by Date earlier, so slicing X and y directly would put only
# the earliest dates in the test set; the shuffled positions must actually be
# used to index the rows.
train_len = len(X)
rng = np.random.default_rng(42)  # fixed seed so the split is reproducible
train_idxs = rng.permutation(train_len).tolist()
split = int(np.floor(0.2 * train_len))
test_rows, train_rows = train_idxs[:split], train_idxs[split:]
Xtest = X.iloc[test_rows, :].values
Xtrain = X.iloc[train_rows, :].values
ytest = y.iloc[test_rows].values
ytrain = y.iloc[train_rows].values

In [None]:
# Fit mlpack's linear regression on the training split.
# lambda_ is presumably the ridge (Tikhonov) regularisation strength - confirm
# against the mlpack binding documentation.
output = mlpack.linear_regression(
    training=Xtrain,
    training_responses=ytrain,
    lambda_=0.5,
    verbose=True,
)

In [None]:
# Keep the entry stored under "output_model" so it can be passed back in for prediction.
model = output["output_model"]


In [None]:
# Call the binding again, this time with the trained model and the held-out features.
predictions = mlpack.linear_regression(input_model=model, test=Xtest)


In [None]:
# Flatten the predictions to 1-D. reshape(-1) keeps a single prediction as shape
# (1,), whereas reshape(-1, 1).squeeze() would collapse it to a 0-d array.
yPreds = predictions["output_predictions"].reshape(-1)


In [None]:

def mae(y_true, y_preds):
    """Return the mean absolute error between targets and predictions.

    Args:
        y_true: Observed values (array-like, e.g. list, Series or ndarray).
        y_preds: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of ``|y_preds - y_true|`` as a float.
    """
    # Coerce to float arrays so plain lists work and unsigned integers cannot
    # wrap around when subtracted.
    y_true = np.asarray(y_true, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    return np.mean(np.abs(y_preds - y_true))

def mse(y_true, y_preds):
    """Return the mean squared error between targets and predictions.

    Args:
        y_true: Observed values (array-like, e.g. list, Series or ndarray).
        y_preds: Predicted values, broadcastable against ``y_true``.

    Returns:
        The mean of ``(y_preds - y_true) ** 2`` as a float.
    """
    # Coerce to float arrays so plain lists work and unsigned integers cannot
    # wrap around when subtracted.
    y_true = np.asarray(y_true, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    return np.mean(np.power(y_preds - y_true, 2))

In [None]:
# Report the test-set errors. y was log1p-transformed before fitting, so these
# figures are in log(1 + price) units rather than in price units.
test_mse = mse(ytest, yPreds)  # computed once and reused for the RMSE line
print("---- Evaluation Metrics ----")
print(f"Mean Absolute Error: {mae(ytest, yPreds):.2f}")
print(f"Mean Squared Error: {test_mse:.2f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.2f}")

---- Evaluation Metrics ----
Mean Absoulte Error: 0.06
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
