In [184]:
import numpy as np
import pandas as pd
# Data preparation and Evaluation
import os
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Models
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer
# Model evaluation
from sklearn.model_selection import cross_val_score
# Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Data visualization
from tabulate import tabulate
import time

np.set_printoptions(precision=2, suppress=True)
np.random.seed(42)

In [185]:
input_path = "../data/output/"

print("Reading CSV input")
df = pd.read_csv(input_path + "avocados_2.csv")

print("Sample from CSV read")
print(tabulate(df.sample(5), headers='keys', tablefmt='psql'))

# Shuffle the dataframe because it was originally ordered by date
# In this model, we removed the date from the features, so we need to reshuffle
# df = df.sample(frac=1)

# Splitting features and target
X, y = df.drop('average_price', axis=1), df['average_price']

print("\nX sample is \n", tabulate(X.sample(5), headers='keys', tablefmt='psql'))

Reading CSV input
Sample from CSV read
+-------+-----------------+-----------+----------+---------+--------------+--------------+---------------+--------------+----------------------+
|       |   average_price |      4046 |     4225 |    4770 |   small_bags |   large_bags |   xlarge_bags | type         | geography            |
|-------+-----------------+-----------+----------+---------+--------------+--------------+---------------+--------------+----------------------|
| 15718 |            1.64 |      0    |   202.17 |    0    |     23598.8  |         0    |             0 | organic      | Northern New England |
| 28607 |            1.17 |   3238.72 | 41880.7  |   14.17 |     24371.6  |      8594.3  |          1810 | conventional | Syracuse             |
| 27406 |            1.85 |   7857    |  7205    |    0    |     39803    |        47    |             0 | organic      | San Francisco        |
| 19910 |            1.31 |     71.23 |   621.3  |    0    |      3176.17 |         0    | 

In [186]:
numeric_features = ["4046","4225","4770","small_bags","large_bags","xlarge_bags"]
numeric_transformer_no_scaling = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False, with_std=False)) # Not scaling
    ]
)
numeric_transformer = Pipeline(
    steps=[
        ("polynomial", PolynomialFeatures(3)),
        ("scaler", StandardScaler()) 
    ]
)

categorical_features = ["type", "geography"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder())
    ]
)

In [187]:
decision_tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_no_scaling, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

decision_tree_pipeline = Pipeline(
    steps=[
        ("preprocessor", decision_tree_preprocessor), 
        ("decision_tree", tree.DecisionTreeRegressor())
    ]
)

decision_tree_pipeline

In [188]:
linear_regression_processor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

linear_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", linear_regression_processor), 
        ("linear regression", LinearRegression())
    ]
)

# Scale data, Polinomial
# model = make_pipeline(PolynomialFeatures(2), LinearRegression())
# model.fit(X_train, y_train)
# linear_regression_model = 
# linear_regression_model.fit(X_train, y_train)
# result_scores.loc["Linear Regression"] = [
#         linear_regression_model.score(X_train, y_train),
#         linear_regression_model.score(X_test, y_test),
#         "Linear regression without scaling and poly"
#     ]

linear_regression_pipeline

In [189]:
print("Features shape: ", X.shape)
print("Target shape: ", y.shape)

print("\nFeatures sample: \n", X.sample(2), "\n")


print("Splitting training and test data")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33
)

print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)

print("\nTraining X sample: \n", X_train.sample(2), "\n")
print("\nTraining y sample: \n", y_train.sample(2), "\n")

Features shape:  (33045, 8)
Target shape:  (33045,)

Features sample: 
            4046       4225      4770  small_bags  large_bags  xlarge_bags  \
27298   6105.00    8285.00      0.00    37884.00       31.00          0.0   
10059  52200.02  324240.53  75841.97    61522.13     1047.85       8830.0   

               type      geography  
27298       organic  San Francisco  
10059  conventional        Chicago   

Splitting training and test data
Training features shape:  (22140, 8)
Test features shape:  (10905, 8)

Training X sample: 
            4046       4225     4770  small_bags  large_bags  xlarge_bags  \
3088   15791.25  237182.53  1472.69   103254.89    12272.63         0.00   
5377  114813.13  156347.74  9650.50   173537.66       35.61        43.39   

              type     geography  
3088  conventional  Philadelphia  
5377  conventional       Seattle   


Training y sample: 
 32674    1.53
25764    1.13
Name: average_price, dtype: float64 



In [190]:
columns_scores = [
    "Training score",
    "Test score"
]
result_scores = pd.DataFrame(columns=columns_scores)

In [191]:
pipelines = {
    "Decision Tree": decision_tree_pipeline, 
    "Linear regression": linear_regression_pipeline
}

for pipeline_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    result_scores.loc[pipeline_name] = [
            f"{pipeline.score(X_train, y_train) * 100:.2f}%",
            f"{pipeline.score(X_test, y_test) * 100:.2f}%"
        ]

# TODO Cross-validation
# TODO Optimize Decision Tree + Random forest
# TODO Optimize Linear Regression
# TODO Move to expanded dataset

# TODO PCA on components of volume

# Cross validation
# scores = cross_val_score(
#     decision_tree_model,
#     X,
#     y,
#     cv=5,
#     scoring='mean_squared_error'
# )

In [192]:
print(tabulate(result_scores, headers='keys', tablefmt='psql'))
result_scores.to_csv(
    os.path.normpath(
        input_path +
        "model_1_results_" +
        time.strftime("%Y%m%d-%H%M%S") +
        ".csv"
    )
)

+-------------------+------------------+--------------+
|                   | Training score   | Test score   |
|-------------------+------------------+--------------|
| Decision Tree     | 100.00%          | 62.64%       |
| Linear regression | 54.92%           | 51.82%       |
+-------------------+------------------+--------------+
