In [1]:
import numpy as np
import pandas as pd
# Data preparation and Evaluation
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Models
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
# Model evaluation
from sklearn.model_selection import cross_val_score
# Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Dimensionality reduction
from sklearn.decomposition import PCA
# Data visualization
from tabulate import tabulate

np.set_printoptions(precision=2, suppress=True)
np.random.seed(42)

pd.options.display.float_format = '{:.2f}'.format

In [2]:
input_path = "../data/slices/"


# df = pd.read_csv(input_path + "all_volume_components.csv")
# df = pd.read_csv(input_path + "total_volume_only.csv")
# df = pd.read_csv(input_path + "all_volume_components_year.csv")
df = pd.read_csv(input_path + "all_volume_components_year_week.csv")

df = df[df["type"] == "conventional"]

print("Sample from CSV read")
print(tabulate(df.sample(5), headers='keys', tablefmt='psql'))

df.drop(columns=["type"], inplace=True)

# Splitting features and target
X, y = df.drop('average_price', axis=1), df['average_price']

Sample from CSV read
+-------+-----------------+----------+----------+---------+--------------+--------------+---------------+--------------+-------------+--------+--------+
|       |   average_price |     4046 |     4225 |    4770 |   small_bags |   large_bags |   xlarge_bags | type         | geography   |   year |   week |
|-------+-----------------+----------+----------+---------+--------------+--------------+---------------+--------------+-------------+--------+--------|
| 19823 |            0.94 | 422469   |  37237.6 |  134.46 |      92617.3 |    121676    |        183.33 | conventional | Orlando     |   2018 |     26 |
| 32783 |            0.8  | 219225   |  15110.3 |   64.29 |     252423   |     76083.1  |       1580    | conventional | Orlando     |   2020 |     46 |
| 12563 |            0.83 | 663134   | 182629   | 5896.58 |     203296   |     19565.2  |          0    | conventional | Houston     |   2017 |     12 |
| 27839 |            1.09 |  62129.2 |  90102.9 |  816.5  |  

In [3]:
print("Features shape: ", X.shape)
print("Target shape: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33
)

print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)

Features shape:  (16524, 9)
Target shape:  (16524,)
Training features shape:  (11071, 9)
Test features shape:  (5453, 9)


#### Transformer pipelines

In [4]:
numeric_features_with_year_week = ["4046","4225","4770","small_bags","large_bags","xlarge_bags", "year", "week"]
numeric_features_without_year_week = ["4046","4225","4770","small_bags","large_bags","xlarge_bags"]

categorical_features_without_year_week = ["geography"]
categorical_features_with_year_week = ["geography", "year", "week"]


numeric_transformer_no_scaling = Pipeline(
    steps=[
        ("dim reduction", PCA()),
        ("scaler", StandardScaler(with_mean=False, with_std=False)) # Not scaling
    ]
)

numeric_transformer = Pipeline(
    steps=[
        ("dim reduction", PCA()),
        ("scaler", StandardScaler())
        #("polynomial", PolynomialFeatures(2)),
        # ("scaler 2", StandardScaler()) 
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)


In [5]:
processor_year_week_as_numeric = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_no_scaling, numeric_features_with_year_week),
        ("cat", categorical_transformer, categorical_features_without_year_week),
    ]
)

In [6]:
processor_year_week_as_category = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_without_year_week),
        ("cat", categorical_transformer, categorical_features_with_year_week),
    ]
)

#### Decision Tree

In [7]:
decision_tree_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_numeric), 
        ("decision_tree", tree.DecisionTreeRegressor())
    ]
)

decision_tree_pipeline

#### Linear Regression

In [8]:
least_squared_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_category), 
        ("linear regression", LinearRegression())
    ]
)

ridge_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_category), 
        ("ridge regression", RidgeCV(alphas = [0.0001, 0.001,0.01, 0.1, 1, 3, 10]))
    ]
)

lasso_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_category), 
        ("ridge regression", LassoCV(alphas = [0.0001, 0.001,0.01, 0.1, 1, 3, 10]))
    ]
)

least_squared_regression_pipeline

#### SVM pipeline

In [9]:
from sklearn.svm import SVR

svm_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_category),
        ("SVM", SVR(C=5.0, epsilon=0.01)) # Changed based on grid search
    ]
)

svm_regression_pipeline

#### KNN pipeline

In [10]:
from sklearn.neighbors import KNeighborsRegressor

knn_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_category), 
        ("KNN Regressor", KNeighborsRegressor(4, weights="distance"))
    ]
)

knn_regression_pipeline

#### Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor

random_forest_pipeline = Pipeline(
    steps=[
        ("preprocessor", processor_year_week_as_numeric), 
        ("Random Forest Regressor", RandomForestRegressor(bootstrap=False, max_features=15)) # Changed based on grid search
    ]
)

random_forest_pipeline

#### Running pipelines

In [12]:
pipelines = {
    "Decision Tree": decision_tree_pipeline,
    "Least Squared": least_squared_regression_pipeline,
    # "Lasso": lasso_regression_pipeline,
    # "Ridge": ridge_regression_pipeline,
    "KNN": knn_regression_pipeline,
    "SVM": svm_regression_pipeline, # Very slow, takes 4min to run
    "Random Forest": random_forest_pipeline # Very slow, takes 9min to run
}

for pipeline_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)

In [13]:
cv_rmse_scores = {}

for pipeline_name, pipeline in pipelines.items():
    
    pipeline_rmse_scores = -cross_val_score(
        pipeline, 
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error", 
        cv=6
    )   
    
    pipeline_rmse_scores = map(lambda x: f"$ {x:.2f}", pipeline_rmse_scores)
    cv_rmse_scores.update({pipeline.steps[1][1]: pipeline_rmse_scores})

cv_scores_df = pd.DataFrame.from_dict(cv_rmse_scores, orient="index", columns=["Fold 1", "Fold 2", "Fold 3", "Fold 4", "Fold 5", "Fold 6"])
cv_scores_df

found 0 physical cores < 1
  File "c:\Users\alber\anaconda3\envs\experimental\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Fold 6
DecisionTreeRegressor(),$ 0.14,$ 0.15,$ 0.15,$ 0.14,$ 0.15,$ 0.15
LinearRegression(),$ 0.16,$ 0.17,$ 0.16,$ 0.16,$ 0.16,$ 0.16
"KNeighborsRegressor(n_neighbors=4, weights='distance')",$ 0.13,$ 0.13,$ 0.13,$ 0.13,$ 0.12,$ 0.13
"SVR(C=5.0, epsilon=0.01)",$ 0.09,$ 0.10,$ 0.09,$ 0.09,$ 0.10,$ 0.10
"(DecisionTreeRegressor(max_features=15, random_state=38791312), DecisionTreeRegressor(max_features=15, random_state=460887550), DecisionTreeRegressor(max_features=15, random_state=727694878), DecisionTreeRegressor(max_features=15, random_state=496563108), DecisionTreeRegressor(max_features=15, random_state=1308396028), DecisionTreeRegressor(max_features=15, random_state=1242188682), DecisionTreeRegressor(max_features=15, random_state=1952062683), DecisionTreeRegressor(max_features=15, random_state=1928813188), DecisionTreeRegressor(max_features=15, random_state=403859007), DecisionTreeRegressor(max_features=15, random_state=1617983563), DecisionTreeRegressor(max_features=15, random_state=1211606658), DecisionTreeRegressor(max_features=15, random_state=331215671), DecisionTreeRegressor(max_features=15, random_state=569881918), DecisionTreeRegressor(max_features=15, random_state=370540392), DecisionTreeRegressor(max_features=15, random_state=937944101), DecisionTreeRegressor(max_features=15, random_state=1541050933), DecisionTreeRegressor(max_features=15, random_state=633665153), DecisionTreeRegressor(max_features=15, random_state=1769737945), DecisionTreeRegressor(max_features=15, random_state=1816909313), DecisionTreeRegressor(max_features=15, random_state=1439833333), DecisionTreeRegressor(max_features=15, random_state=21218443), DecisionTreeRegressor(max_features=15, random_state=57519911), DecisionTreeRegressor(max_features=15, random_state=1227375873), DecisionTreeRegressor(max_features=15, random_state=992422504), DecisionTreeRegressor(max_features=15, random_state=606959996), DecisionTreeRegressor(max_features=15, random_state=81636175), DecisionTreeRegressor(max_features=15, random_state=200286203), DecisionTreeRegressor(max_features=15, random_state=1994980829), DecisionTreeRegressor(max_features=15, random_state=1246258360), DecisionTreeRegressor(max_features=15, random_state=1723646254), DecisionTreeRegressor(max_features=15, random_state=964310509), DecisionTreeRegressor(max_features=15, random_state=963420915), DecisionTreeRegressor(max_features=15, random_state=1141690531), DecisionTreeRegressor(max_features=15, random_state=135386755), DecisionTreeRegressor(max_features=15, random_state=401646794), DecisionTreeRegressor(max_features=15, random_state=1861476340), DecisionTreeRegressor(max_features=15, random_state=147660183), DecisionTreeRegressor(max_features=15, random_state=878027985), DecisionTreeRegressor(max_features=15, random_state=2139854682), DecisionTreeRegressor(max_features=15, random_state=1951905892), DecisionTreeRegressor(max_features=15, random_state=1491103165), DecisionTreeRegressor(max_features=15, random_state=1975591809), DecisionTreeRegressor(max_features=15, random_state=724792078), DecisionTreeRegressor(max_features=15, random_state=729307082), DecisionTreeRegressor(max_features=15, random_state=1451499482), DecisionTreeRegressor(max_features=15, random_state=1092591607), DecisionTreeRegressor(max_features=15, random_state=1616533040), DecisionTreeRegressor(max_features=15, random_state=1081858415), DecisionTreeRegressor(max_features=15, random_state=174288110), DecisionTreeRegressor(max_features=15, random_state=657617622), DecisionTreeRegressor(max_features=15, random_state=1048939810), DecisionTreeRegressor(max_features=15, random_state=1699735923), DecisionTreeRegressor(max_features=15, random_state=412789597), DecisionTreeRegressor(max_features=15, random_state=670049710), DecisionTreeRegressor(max_features=15, random_state=1582885143), DecisionTreeRegressor(max_features=15, random_state=1410334162), DecisionTreeRegressor(max_features=15, random_state=412543291), DecisionTreeRegressor(max_features=15, random_state=1252116444), DecisionTreeRegressor(max_features=15, random_state=56607820), DecisionTreeRegressor(max_features=15, random_state=952132399), DecisionTreeRegressor(max_features=15, random_state=2016047254), DecisionTreeRegressor(max_features=15, random_state=1542307211), DecisionTreeRegressor(max_features=15, random_state=993301488), DecisionTreeRegressor(max_features=15, random_state=525293039), DecisionTreeRegressor(max_features=15, random_state=1965106472), DecisionTreeRegressor(max_features=15, random_state=931295900), DecisionTreeRegressor(max_features=15, random_state=220806853), DecisionTreeRegressor(max_features=15, random_state=1566949298), DecisionTreeRegressor(max_features=15, random_state=816168831), DecisionTreeRegressor(max_features=15, random_state=1534726550), DecisionTreeRegressor(max_features=15, random_state=611313950), DecisionTreeRegressor(max_features=15, random_state=1939439084), DecisionTreeRegressor(max_features=15, random_state=779664276), DecisionTreeRegressor(max_features=15, random_state=326368791), DecisionTreeRegressor(max_features=15, random_state=907107783), DecisionTreeRegressor(max_features=15, random_state=158093803), DecisionTreeRegressor(max_features=15, random_state=1270014984), DecisionTreeRegressor(max_features=15, random_state=638588821), DecisionTreeRegressor(max_features=15, random_state=2058011779), DecisionTreeRegressor(max_features=15, random_state=904766540), DecisionTreeRegressor(max_features=15, random_state=156134113), DecisionTreeRegressor(max_features=15, random_state=617887874), DecisionTreeRegressor(max_features=15, random_state=196838372), DecisionTreeRegressor(max_features=15, random_state=1159690103), DecisionTreeRegressor(max_features=15, random_state=1231824283), DecisionTreeRegressor(max_features=15, random_state=1543918681), DecisionTreeRegressor(max_features=15, random_state=1074317809), DecisionTreeRegressor(max_features=15, random_state=770657908), DecisionTreeRegressor(max_features=15, random_state=20470381), DecisionTreeRegressor(max_features=15, random_state=878678904), DecisionTreeRegressor(max_features=15, random_state=1703428449), DecisionTreeRegressor(max_features=15, random_state=51448078), DecisionTreeRegressor(max_features=15, random_state=2083501447), DecisionTreeRegressor(max_features=15, random_state=1504620975), DecisionTreeRegressor(max_features=15, random_state=1042834146), DecisionTreeRegressor(max_features=15, random_state=394037719), DecisionTreeRegressor(max_features=15, random_state=286160627), DecisionTreeRegressor(max_features=15, random_state=1380783411), DecisionTreeRegressor(max_features=15, random_state=1821020667), DecisionTreeRegressor(max_features=15, random_state=550658102))",$ 0.10,$ 0.10,$ 0.09,$ 0.09,$ 0.09,$ 0.09


#### Grid search SVM

In [14]:
from sklearn.model_selection import GridSearchCV

# param_grid_svm = {
#     'kernel': ['sigmoid', 'linear', 'poly', 'rbf'],
#     'C': [1,5,10], 
#     'degree': [3,8],
#     'epsilon': [0.01, 0.1, 1, 10]
# }

param_grid_svm = {
    'kernel': ['rbf'],
    'C': [5], 
    'degree': [3],
    'epsilon': [0.01]
}

svm_grid_search = GridSearchCV(
                SVR(), 
                param_grid_svm, 
                cv=3,
                scoring='neg_root_mean_squared_error',
                n_jobs = -1
            )


svm_grid_search.fit(processor_year_week_as_category.fit_transform(X_train), y_train)

In [15]:
svm_grid_search.best_params_

{'C': 5, 'degree': 3, 'epsilon': 0.01, 'kernel': 'rbf'}

In [16]:
best_svm = svm_grid_search.best_estimator_
best_svm

#### Grid search Random Forest

In [17]:
# param_grid_random_forest = { 
#     'max_features': [15, 12, 1],
#     'max_depth': [15, 20, None],
#     'n_estimators': [3, 30, 100], 
#     'bootstrap': [False, True]
# }

# param_grid_random_forest = { 
#     'max_features': [1],
#     'max_depth': [None],
#     'n_estimators': [100], 
#     'bootstrap': [False]
# }

param_grid_random_forest = { 
    'max_features': [15],
    'max_depth': [None],
    'n_estimators': [100], 
    'bootstrap': [False]
}

random_forest_grid_search = GridSearchCV(
                RandomForestRegressor(n_jobs = -1), 
                param_grid_random_forest, 
                cv=3,
                scoring='neg_root_mean_squared_error',
                n_jobs = -1
            )

random_forest_grid_search.fit(processor_year_week_as_numeric.fit_transform(X_train), y_train)

In [18]:
random_forest_grid_search.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 15,
 'n_estimators': 100}

In [19]:
best_random_forest = random_forest_grid_search.best_estimator_
best_random_forest

#### Errors

In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

models_errors = {}

print("Errors on training data \n")

for pipeline_name, pipeline in pipelines.items():
        pipeline.fit(X_train, y_train)
        y_predicted = pipeline.predict(X_test)

        mae = mean_absolute_error(y_test, y_predicted)
        mae_percentage = mean_absolute_percentage_error(y_test, y_predicted)
        mse = mean_squared_error(y_test, y_predicted)
        rmse = root_mean_squared_error(y_test, y_predicted)
        r2 = r2_score(y_test, y_predicted)

        errors = {pipeline.steps[1][1]: [
                f"${mae:.2f}", 
                f"{mae_percentage * 100:.2f}%", 
                mse, 
                f"${rmse:.2f}", 
                r2
        ]}
        models_errors.update(errors)
        
models_errors_df = pd.DataFrame.from_dict(models_errors, orient="index", columns=["MAE", "MAE %", "MSE", "RMSE", "R2"])
models_errors_df

Errors on training data 



Unnamed: 0,MAE,MAE %,MSE,RMSE,R2
DecisionTreeRegressor(),$0.10,8.78%,0.02,$0.14,0.67
LinearRegression(),$0.13,11.25%,0.03,$0.16,0.58
"KNeighborsRegressor(n_neighbors=4, weights='distance')",$0.09,8.09%,0.02,$0.12,0.76
"SVR(C=5.0, epsilon=0.01)",$0.07,6.11%,0.01,$0.09,0.86
"(DecisionTreeRegressor(max_features=15, random_state=1891862850), DecisionTreeRegressor(max_features=15, random_state=1734976772), DecisionTreeRegressor(max_features=15, random_state=1532937827), DecisionTreeRegressor(max_features=15, random_state=708028091), DecisionTreeRegressor(max_features=15, random_state=1015097742), DecisionTreeRegressor(max_features=15, random_state=912085328), DecisionTreeRegressor(max_features=15, random_state=608820523), DecisionTreeRegressor(max_features=15, random_state=995905654), DecisionTreeRegressor(max_features=15, random_state=1112057389), DecisionTreeRegressor(max_features=15, random_state=1886267804), DecisionTreeRegressor(max_features=15, random_state=1374569027), DecisionTreeRegressor(max_features=15, random_state=342474111), DecisionTreeRegressor(max_features=15, random_state=1816690143), DecisionTreeRegressor(max_features=15, random_state=525891888), DecisionTreeRegressor(max_features=15, random_state=1850859178), DecisionTreeRegressor(max_features=15, random_state=222826341), DecisionTreeRegressor(max_features=15, random_state=2081759427), DecisionTreeRegressor(max_features=15, random_state=1642406220), DecisionTreeRegressor(max_features=15, random_state=926256147), DecisionTreeRegressor(max_features=15, random_state=1690800750), DecisionTreeRegressor(max_features=15, random_state=920691345), DecisionTreeRegressor(max_features=15, random_state=2142855871), DecisionTreeRegressor(max_features=15, random_state=146068912), DecisionTreeRegressor(max_features=15, random_state=1287639922), DecisionTreeRegressor(max_features=15, random_state=1361077673), DecisionTreeRegressor(max_features=15, random_state=1376339172), DecisionTreeRegressor(max_features=15, random_state=2047713803), DecisionTreeRegressor(max_features=15, random_state=228370648), DecisionTreeRegressor(max_features=15, random_state=1219507165), DecisionTreeRegressor(max_features=15, random_state=127843918), DecisionTreeRegressor(max_features=15, random_state=1847560566), DecisionTreeRegressor(max_features=15, random_state=157589539), DecisionTreeRegressor(max_features=15, random_state=997696263), DecisionTreeRegressor(max_features=15, random_state=552976411), DecisionTreeRegressor(max_features=15, random_state=1185815923), DecisionTreeRegressor(max_features=15, random_state=1331462229), DecisionTreeRegressor(max_features=15, random_state=288516283), DecisionTreeRegressor(max_features=15, random_state=1124952368), DecisionTreeRegressor(max_features=15, random_state=880286461), DecisionTreeRegressor(max_features=15, random_state=1121021349), DecisionTreeRegressor(max_features=15, random_state=2037119055), DecisionTreeRegressor(max_features=15, random_state=1856688313), DecisionTreeRegressor(max_features=15, random_state=829731844), DecisionTreeRegressor(max_features=15, random_state=976982573), DecisionTreeRegressor(max_features=15, random_state=263921505), DecisionTreeRegressor(max_features=15, random_state=2110184944), DecisionTreeRegressor(max_features=15, random_state=1576151548), DecisionTreeRegressor(max_features=15, random_state=287553125), DecisionTreeRegressor(max_features=15, random_state=190649290), DecisionTreeRegressor(max_features=15, random_state=1675817985), DecisionTreeRegressor(max_features=15, random_state=1585426648), DecisionTreeRegressor(max_features=15, random_state=1675738648), DecisionTreeRegressor(max_features=15, random_state=2142814464), DecisionTreeRegressor(max_features=15, random_state=1386753758), DecisionTreeRegressor(max_features=15, random_state=908211652), DecisionTreeRegressor(max_features=15, random_state=1777698215), DecisionTreeRegressor(max_features=15, random_state=743616835), DecisionTreeRegressor(max_features=15, random_state=1108033010), DecisionTreeRegressor(max_features=15, random_state=36148804), DecisionTreeRegressor(max_features=15, random_state=1721428164), DecisionTreeRegressor(max_features=15, random_state=368955895), DecisionTreeRegressor(max_features=15, random_state=1423805711), DecisionTreeRegressor(max_features=15, random_state=1030439605), DecisionTreeRegressor(max_features=15, random_state=2040722366), DecisionTreeRegressor(max_features=15, random_state=375334929), DecisionTreeRegressor(max_features=15, random_state=913225834), DecisionTreeRegressor(max_features=15, random_state=1719447056), DecisionTreeRegressor(max_features=15, random_state=1626976380), DecisionTreeRegressor(max_features=15, random_state=2032263650), DecisionTreeRegressor(max_features=15, random_state=609816145), DecisionTreeRegressor(max_features=15, random_state=1534979075), DecisionTreeRegressor(max_features=15, random_state=307202674), DecisionTreeRegressor(max_features=15, random_state=1835116894), DecisionTreeRegressor(max_features=15, random_state=105398622), DecisionTreeRegressor(max_features=15, random_state=396829320), DecisionTreeRegressor(max_features=15, random_state=621489950), DecisionTreeRegressor(max_features=15, random_state=411507588), DecisionTreeRegressor(max_features=15, random_state=161603811), DecisionTreeRegressor(max_features=15, random_state=2044594191), DecisionTreeRegressor(max_features=15, random_state=664819497), DecisionTreeRegressor(max_features=15, random_state=378397096), DecisionTreeRegressor(max_features=15, random_state=1750493647), DecisionTreeRegressor(max_features=15, random_state=1829155539), DecisionTreeRegressor(max_features=15, random_state=2118556917), DecisionTreeRegressor(max_features=15, random_state=1402095389), DecisionTreeRegressor(max_features=15, random_state=898880281), DecisionTreeRegressor(max_features=15, random_state=605565104), DecisionTreeRegressor(max_features=15, random_state=1837476382), DecisionTreeRegressor(max_features=15, random_state=8940028), DecisionTreeRegressor(max_features=15, random_state=1539360305), DecisionTreeRegressor(max_features=15, random_state=2010359438), DecisionTreeRegressor(max_features=15, random_state=1267175987), DecisionTreeRegressor(max_features=15, random_state=996772072), DecisionTreeRegressor(max_features=15, random_state=1859325605), DecisionTreeRegressor(max_features=15, random_state=1791425573), DecisionTreeRegressor(max_features=15, random_state=304122346), DecisionTreeRegressor(max_features=15, random_state=468214776), DecisionTreeRegressor(max_features=15, random_state=1417003451), DecisionTreeRegressor(max_features=15, random_state=1022576659), DecisionTreeRegressor(max_features=15, random_state=601058914))",$0.06,5.92%,0.01,$0.09,0.87


#### Results 

In [21]:
print("CV scores (Train data, RMSE)\n")
print("Avocado average_price on train data:", "$", round(y_train.mean(), 3), "\n")
print(tabulate(cv_scores_df, headers='keys', tablefmt='github', floatfmt=".2f"), "\n")

print("Errors (Test split 0.33)\n")
print("Avocado average_price on test data:", "$", round(y_test.mean(), 3), "\n")
print(tabulate(models_errors_df, headers='keys', tablefmt='github', floatfmt=".2f"), "\n")

CV scores (Train data, RMSE)

Avocado average_price on train data: $ 1.146 

|                                                         | Fold 1   | Fold 2   | Fold 3   | Fold 4   | Fold 5   | Fold 6   |
|---------------------------------------------------------|----------|----------|----------|----------|----------|----------|
| DecisionTreeRegressor()                                 | $ 0.14   | $ 0.15   | $ 0.15   | $ 0.14   | $ 0.15   | $ 0.15   |
| LinearRegression()                                      | $ 0.16   | $ 0.17   | $ 0.16   | $ 0.16   | $ 0.16   | $ 0.16   |
| KNeighborsRegressor(n_neighbors=4, weights='distance')  | $ 0.13   | $ 0.13   | $ 0.13   | $ 0.13   | $ 0.12   | $ 0.13   |
| SVR(C=5.0, epsilon=0.01)                                | $ 0.09   | $ 0.10   | $ 0.09   | $ 0.09   | $ 0.10   | $ 0.10   |
| RandomForestRegressor(bootstrap=False, max_features=15) | $ 0.10   | $ 0.10   | $ 0.09   | $ 0.09   | $ 0.09   | $ 0.09   | 

Errors (Test split 0.33)

Avocado avera

In [22]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

df = pd.read_csv("../data/slices/plot.csv")

mask1 = df['location_level'] == "Level 1"
mask2 = df['type'] == 'conventional'

df_plot = df[mask1 & mask2].copy()

fig = make_subplots(rows=1, cols=1, subplot_titles=("Original","Predicted"))

fig.append_trace(go.Scatter(
    x=df_plot['date'],
    y=df_plot['average_price'],
    name="Average Price Real"
), row=1, col=1)


for pipeline_name, pipeline in pipelines.items():
    
    y_predict = pipeline.predict(X)
    df_plot2 = df[mask2].copy()
    df_plot2['average_price_predicted'] = y_predict
    
    df_plot2 = df_plot2[mask1]

    fig.append_trace(go.Scatter(
        x=df_plot2['date'],
        y=df_plot2['average_price_predicted'],
        name=f"{pipeline.steps[1][1]}"
    ), row=1, col=1)

fig.update_layout(height=600, title_text="Stacked Subplots")
fig.show()


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



### Avocado type statistics

In [23]:
df = pd.read_csv("../data/slices/plot.csv")

df_type_stats = df.copy()

df_type_stats['volume (millions)'] = (df["4046"] + df["4225"] + df["4770"] + df["small_bags"] + df["large_bags"] + df["xlarge_bags"])/1000000

mask1 = df['location_level'] == "Level 2"
pivot_df = df_type_stats[mask1].pivot_table(index=["geography"], columns=['type'], values=['average_price', 'volume (millions)'], aggfunc='mean')

df_type_stats.drop(columns=[
    "year", "week", "geography", "4046", "4225", "4770", "small_bags", "large_bags", "xlarge_bags",
    "location_lat", "location_lon"], inplace=True)



conventional_stats = df_type_stats[df_type_stats["type"] == "conventional"].describe(percentiles=[.5, .75, .95, .99])
conventional_stats.rename(columns={'average_price': '$ conventional'}, inplace=True)
conventional_stats.rename(columns={'volume (millions)': 'conventional vol. (millions)'}, inplace=True)

organic_stats = df_type_stats[df_type_stats["type"] == "organic"].describe(percentiles=[.5, .75, .95, .99])
organic_stats.rename(columns={'average_price': '$ organic'}, inplace=True)
organic_stats.rename(columns={'volume (millions)': 'organic vol. (millions)'}, inplace=True)

In [24]:
df_type_stats_describe = pd.concat([conventional_stats, organic_stats], axis=1).T
new_order = ['$ conventional', '$ organic', 'conventional vol. (millions)', 'organic vol. (millions)']
df_type_stats_describe = df_type_stats_describe.reindex(new_order)

print("Avocado distribution per type for regions (Level 2)")
df_type_stats_describe

Avocado distribution per type for regions (Level 2)


Unnamed: 0,count,mean,std,min,50%,75%,95%,99%,max
$ conventional,16524.0,1.14,0.25,0.46,1.12,1.3,1.59,1.8,2.22
$ organic,16521.0,1.62,0.34,0.44,1.58,1.82,2.21,2.59,3.25
conventional vol. (millions),16524.0,1.87,5.41,0.03,0.48,1.14,6.48,36.78,63.72
organic vol. (millions),16521.0,0.06,0.19,0.0,0.02,0.04,0.23,1.21,2.39


In [25]:
pivot_df

Unnamed: 0_level_0,average_price,average_price,volume (millions),volume (millions)
type,conventional,organic,conventional,organic
geography,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
California,1.15,1.74,6.31,0.2
Great Lakes,1.15,1.47,3.81,0.17
Midsouth,1.17,1.57,3.43,0.15
Northeast,1.31,1.77,4.83,0.21
Plains,1.13,1.57,2.02,0.06
South Central,0.86,1.35,6.65,0.13
Southeast,1.12,1.54,4.42,0.09
West,1.02,1.62,6.76,0.26


In [26]:
# print(tabulate(df_type_stats_describe, headers='keys', tablefmt='github', floatfmt=".2f"), "\n")
# print(tabulate(pivot_df, headers='keys', tablefmt='github', floatfmt=".2f"))