In [40]:
import numpy as np
import pandas as pd
# Data preparation and Evaluation
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Models
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
# Model evaluation
from sklearn.model_selection import cross_val_score
# Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Data visualization
from tabulate import tabulate
import time

np.set_printoptions(precision=2, suppress=True)
np.random.seed(42)

In [41]:
input_path = "../data/slices/"


# df = pd.read_csv(input_path + "all_volume_components.csv")
# df = pd.read_csv(input_path + "total_volume_only.csv")
# df = df[df["type"] == "organic"]
# df = pd.read_csv(input_path + "all_volume_components_year.csv")
df = pd.read_csv(input_path + "all_volume_components_year_week.csv")

print("Sample from CSV read")
print(tabulate(df.sample(5), headers='keys', tablefmt='psql'))

# Splitting features and target
X, y = df.drop('average_price', axis=1), df['average_price']

Sample from CSV read
+-------+-----------------+-----------+----------+---------+--------------+--------------+---------------+--------------+----------------------+--------+--------+
|       |   average_price |      4046 |     4225 |    4770 |   small_bags |   large_bags |   xlarge_bags | type         | geography            |   year |   week |
|-------+-----------------+-----------+----------+---------+--------------+--------------+---------------+--------------+----------------------+--------+--------|
| 15718 |            1.64 |      0    |   202.17 |    0    |     23598.8  |         0    |             0 | organic      | Northern New England |   2017 |     41 |
| 28607 |            1.17 |   3238.72 | 41880.7  |   14.17 |     24371.6  |      8594.3  |          1810 | conventional | Syracuse             |   2020 |      7 |
| 27406 |            1.85 |   7857    |  7205    |    0    |     39803    |        47    |             0 | organic      | San Francisco        |   2019 |     48 |
|

In [42]:
numeric_features = ["4046","4225","4770","small_bags","large_bags","xlarge_bags"]
# numeric_features = ["total_volume"]
numeric_transformer_no_scaling = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False, with_std=False)) # Not scaling
    ]
)
numeric_transformer = Pipeline(
    steps=[
        # ("polynomial", PolynomialFeatures(2)),
        ("scaler", StandardScaler()) 
    ]
)

categorical_features = ["type", "geography", "year", "week"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)

In [43]:
decision_tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_no_scaling, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

decision_tree_pipeline = Pipeline(
    steps=[
        ("preprocessor", decision_tree_preprocessor), 
        ("decision_tree", tree.DecisionTreeRegressor())
    ]
)

decision_tree_pipeline

In [44]:
linear_regression_processor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

linear_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", linear_regression_processor), 
        ("linear regression", LinearRegression())
    ]
)

linear_regression_pipeline

In [45]:
print("Features shape: ", X.shape)
print("Target shape: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33
)

print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)

Features shape:  (33045, 10)
Target shape:  (33045,)
Training features shape:  (22140, 10)
Test features shape:  (10905, 10)


In [46]:
columns_scores = [
    "Training score",
    "Test score"
]
result_scores = pd.DataFrame(columns=columns_scores)

In [47]:
pipelines = {
    "Decision Tree": decision_tree_pipeline, 
    "Linear regression": linear_regression_pipeline
}

for pipeline_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    result_scores.loc[pipeline_name] = [
            f"{pipeline.score(X_train, y_train) * 100:.2f}%",
            f"{pipeline.score(X_test, y_test) * 100:.2f}%"
        ]

In [48]:
print(tabulate(result_scores, headers='keys', tablefmt='psql'))

+-------------------+------------------+--------------+
|                   | Training score   | Test score   |
|-------------------+------------------+--------------|
| Decision Tree     | 100.00%          | 68.41%       |
| Linear regression | 62.11%           | 62.40%       |
+-------------------+------------------+--------------+
