### This notebook fits a RandomForest regression model

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble._forest import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import select
from sqlalchemy.orm import sessionmaker

from shipping_cost_analysis.etl.db_tools import get_engine
from shipping_cost_analysis.models.schema import Customer, Product, Region, Transactions

In [13]:
# Database engine plus a session factory bound to it.
engine = get_engine()
Session = sessionmaker(bind=engine)

# Build the statement first; only executing it requires an open connection.
# Transaction columns are combined with product, customer and region
# attributes through joins (SQLAlchemy's .join() is an inner join by default).
query = (
    select(
        Transactions.transaction_date,
        Transactions.stock_code,
        Transactions.quantity,
        Transactions.unit_price,
        Transactions.sales,
        Customer.order_postal,
        Customer.order_state,
        Product.landed_cost,
        Product.shipping_cost_1000_r,
        Product.weight,
        Product.category,
        Region.region,
    )
    .join(Transactions, Transactions.stock_code == Product.stock_code)
    .join(Customer, Transactions.customer_id == Customer.customer_id)
    .join(Region, Region.order_state == Customer.order_state)
)

# Run the query and load the result set into a DataFrame; the connection is
# closed automatically when the block exits.
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

In [14]:
# Display each column's dtype to see which features are numeric and which are object-typed.
df.dtypes

transaction_date        datetime64[ns]
stock_code                      object
quantity                       float64
unit_price                     float64
sales                          float64
order_postal                    object
order_state                     object
landed_cost                    float64
shipping_cost_1000_r           float64
weight                         float64
category                        object
region                          object
dtype: object

**Preprocessing**

In [15]:
# Keep only the numeric columns and show their names.
metric_features = df.select_dtypes(include="number")
metric_features.columns.tolist()

['quantity',
 'unit_price',
 'sales',
 'landed_cost',
 'shipping_cost_1000_r',
 'weight']

In [16]:
# Cast the string-valued feature columns to pandas' categorical dtype for modelling.
str_columns = ["category", "region", "order_state", "order_postal"]
df = df.astype(dict.fromkeys(str_columns, "category"))

In [17]:
# Replace the raw timestamp with a numeric feature: whole days elapsed since
# the earliest transaction in the data set.
# NOTE: this cell is not re-runnable on its own, because transaction_date is
# dropped once the feature has been derived.
df = df.assign(
    days_since_first_tx=lambda frame: (
        frame["transaction_date"] - frame["transaction_date"].min()
    ).dt.days
).drop(columns="transaction_date")

In [None]:
# One-hot encode the low-cardinality categorical columns, then discard rows
# that still contain missing values.
# NOTE: get_dummies ignores NaN by default (dummy_na=False), so a row missing
# one of the encoded values gets all-zero indicators and is not removed by dropna().
df = pd.get_dummies(df, columns=["order_state", "region", "category"]).dropna()

**Split into train and test data frames**

In [None]:
# Separate the target (unit_price) from the feature matrix.
# NOTE(review): X still contains stock_code (object) and order_postal (category),
# which are not one-hot encoded in the preceding cells — confirm they are encoded
# or dropped before fitting the model.
# NOTE(review): sales and quantity remain as features; if sales is derived from
# unit_price this leaks the target — TODO confirm.
y = df["unit_price"]
X = df.drop(columns="unit_price")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)