In [4]:
import numpy as np
import pandas as pd
# Data preparation and Evaluation
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Models
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
# Model evaluation
from sklearn.model_selection import cross_val_score
# Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Dimensionality reduction
from sklearn.decomposition import PCA
# Data visualization
from tabulate import tabulate
import time

# Print NumPy arrays with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

# Seed NumPy's global random generator once, up front.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [5]:
# Directory that holds the CSV inputs (relative path).
input_path = "../data/slices_imputer/"

# Alternative inputs / filter kept for reference (all disabled):
#   all_volume_components.csv
#   total_volume_only.csv
#   all_volume_components_year.csv
#   all_volume_components_year_week.csv
#   row filter: df = df[df["type"] == "organic"]
df = pd.read_csv(f"{input_path}avocado.csv")

# Print five randomly sampled rows as a quick look at the loaded frame.
print("Sample from CSV read")
print(tabulate(df.sample(5), headers='keys', tablefmt='psql'))

# Target is the price column; every remaining column is a feature.
y = df['average_price']
X = df.drop(columns='average_price')

Sample from CSV read
+-------+-----------------+---------+---------+--------+--------------+--------------+---------------+---------+----------------------+--------+--------+------------------+----------------+----------------+
|       |   average_price |    4046 |    4225 |   4770 |   small_bags |   large_bags |   xlarge_bags | type    | geography            |   year |   week | location_level   |   location_lat |   location_lon |
|-------+-----------------+---------+---------+--------+--------------+--------------+---------------+---------+----------------------+--------+--------+------------------+----------------+----------------|
| 27482 |            1.11 |   86    |  900    |   0    |     22281    |      3141    |             0 | organic | Nashville            |   2019 |     45 | Level 3          |        36.1627 |       -86.7816 |
| 10004 |            1.49 | 1073.85 |  114.5  |   0    |      4522.54 |       383.95 |             0 | organic | Pittsburgh           |   2016 |     36

In [6]:
# --- Preprocessing for the decision-tree models ---------------------------

# Columns routed through the numeric branch; here year and week are treated
# as numbers.
numeric_features = [
    "4046", "4225", "4770",
    "small_bags", "large_bags", "xlarge_bags",
    "year", "week",
]

# PCA followed by a StandardScaler with both centering and scaling switched
# off, i.e. the scaler step is deliberately configured not to scale.
numeric_transformer_no_scaling = Pipeline([
    ("dim reduction", PCA()),
    ("scaler", StandardScaler(with_mean=False, with_std=False)),
])

# Columns routed through one-hot encoding.
categorical_features = ["type", "geography"]
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder()),
])

# Applies each branch to its own column list.
decision_tree_preprocessor = ColumnTransformer([
    ("num", numeric_transformer_no_scaling, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


def _make_tree_pipeline(**tree_kwargs):
    """Return a Pipeline of the tree preprocessor plus a DecisionTreeRegressor.

    Keyword arguments are forwarded to ``tree.DecisionTreeRegressor``. Every
    pipeline built here references the same ``decision_tree_preprocessor``
    object rather than a copy.
    """
    return Pipeline([
        ("preprocessor", decision_tree_preprocessor),
        ("decision_tree", tree.DecisionTreeRegressor(**tree_kwargs)),
    ])


# Tree with no depth limit passed, and a variant limited to depth 20.
decision_tree_pipeline = _make_tree_pipeline()
decision_tree_pipeline_depth_20 = _make_tree_pipeline(max_depth=20)

# Last expression of the cell: display the pipeline.
decision_tree_pipeline

In [7]:
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

# --- Preprocessing for the linear models -----------------------------------

# Only the volume/bag columns are numeric here; year and week are treated as
# categories below.
numeric_features = ["4046", "4225", "4770", "small_bags", "large_bags", "xlarge_bags"]
numeric_transformer = Pipeline(
    steps=[
        ("dim reduction", PCA()),
        ("scaler", StandardScaler()),
        # Disabled experiments: ("polynomial", PolynomialFeatures(2)) followed
        # by a second StandardScaler ("scaler 2").
    ]
)

categorical_features = ["type", "geography", "year", "week"]
categorical_transformer = Pipeline(
    steps=[
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted (for example a year outside the training rows)
        # is encoded as all zeros instead of raising at transform time.
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Applies each branch to its own column list. The same object is referenced by
# all three pipelines below.
linear_regression_processor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

linear_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", linear_regression_processor),
        ("linear regression", LinearRegression())
    ]
)

# Candidate regularisation strengths searched by both CV-tuned estimators.
regularization_alphas = [0.0001, 0.001, 0.01, 0.1, 1, 3, 10]

ridge_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", linear_regression_processor),
        ("ridge regression", RidgeCV(alphas=regularization_alphas))
    ]
)

lasso_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", linear_regression_processor),
        # Step label fixed: this step holds LassoCV and was previously
        # labelled "ridge regression".
        ("lasso regression", LassoCV(alphas=regularization_alphas))
    ]
)

# Last expression of the cell: display the pipeline.
linear_regression_pipeline

In [8]:
#print("Features shape: ", X.shape)
#print("Target shape: ", y.shape)

#X_org_train, X_org_test, y_org_train, y_org_test = train_test_split(
#    X, y,
#    test_size=0.33
#)
#
#print("Training features shape: ", X_train.shape)
#print("Test features shape: ", X_test.shape)

In [9]:
# Imputed values split

#X_imp_train, X_imp_test, y_imp_train, y_imp_test = train_test_split(
#    X, y,
#    test_size=0.33
#)

In [10]:
# Imputed values included
print("Features shape: ", X.shape)
print("Target shape: ", y.shape)

# Hold out 33% of the rows for testing. random_state is pinned (same value as
# the global NumPy seed set in the first cell) so the split is identical every
# time this cell runs, regardless of how many random draws earlier cells made.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)

Features shape:  (33477, 13)
Target shape:  (33477,)
Training features shape:  (22429, 13)
Test features shape:  (11048, 13)


In [11]:
# Every model under comparison, keyed by a human-readable label.
pipelines = {
    "Decision Tree Max Depth": decision_tree_pipeline,
    "Decision Tree Depth=20": decision_tree_pipeline_depth_20,
    "Linear regression": linear_regression_pipeline,
    "Lasso regression": lasso_regression_pipeline,
    "Ridge regression": ridge_regression_pipeline,
}

# Fit each pipeline, in insertion order, on the training split.
for pipeline in pipelines.values():
    pipeline.fit(X_train, y_train)

In [12]:
from sklearn.metrics import root_mean_squared_error

# NOTE: both errors below are measured on the same rows the models are fitted
# on (X_train / y_train), so they describe fit to the training data only.

# Linear regression: fit on the training split, then predict those same rows.
y_train_predictions = linear_regression_pipeline.fit(X_train, y_train).predict(X_train)
linear_regression_rmse = root_mean_squared_error(y_train, y_train_predictions)

# The mean price is taken over the full target column and printed for scale.
print("Average unit avocado price:", round(y.mean(), 2),"$")
print("Average deviation between real and estimated (RMSE):", round(linear_regression_rmse, 2),"$")

# Decision tree built without a max_depth argument: same procedure.
y_train_predictions = decision_tree_pipeline.fit(X_train, y_train).predict(X_train)
decision_tree_rmse = root_mean_squared_error(y_train, y_train_predictions)

print("Average unit avocado price:", round(y.mean(), 2),"$")
print("Average deviation between real and estimated (Using RMSE):", round(decision_tree_rmse, 2),"$")

Average unit avocado price: 1.38 $
Average deviation between real and estimated (RMSE): 0.24 $
Average unit avocado price: 1.38 $
Average deviation between real and estimated (Using RMSE): 0.0 $


In [13]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of every registered pipeline on the training split.
for pipeline_name, pipeline in pipelines.items():
    # The scorer yields negated RMSE values, so flip the sign to get
    # positive RMSE per fold.
    fold_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    pipeline_rmse_scores = -fold_scores

    # The header shows the repr of the pipeline's second step (the estimator),
    # not the dictionary label.
    final_estimator = pipeline.steps[1][1]
    print(f"RMSE Scores for {final_estimator}")
    print(pipeline_rmse_scores)

RMSE Scores for DecisionTreeRegressor()
[0.19 0.19 0.19 0.19 0.19]
RMSE Scores for DecisionTreeRegressor(max_depth=20)
[0.19 0.19 0.19 0.19 0.19]
RMSE Scores for LinearRegression()
[0.24 0.24 0.24 0.24 0.24]
RMSE Scores for LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 3, 10])
[0.24 0.24 0.24 0.24 0.24]
RMSE Scores for RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 3, 10])
[0.24 0.24 0.24 0.24 0.24]


In [14]:
# print(tabulate(result_scores, headers='keys', tablefmt='psql'))