In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import mlflow

In [3]:
# Enable MLflow autologging up front so the model fits further down are captured automatically.
mlflow.autolog()

2023/10/06 19:09:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [4]:
# Location of the refined dataset, relative to the notebook's working directory.
DATASET_PATH = 'data/refined/dataset.parquet'

# Load the dataset into a DataFrame for modelling.
df = pd.read_parquet(DATASET_PATH)

In [5]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,asin,brand,price,sales,bsr,fba_fees,active_sellers,ratings,review_count,images,review_velocity,category,top_selling_brand
0,B00NW479QO,Lindt,3.5,13466.0,3.0,2.62,30.0,5.0,44069.0,12.0,868.0,Food Cupboard,True
1,B004OCO20E,Andrex,18.28,13338.0,2.0,8.38,30.0,5.0,40397.0,5.0,830.0,Grocery,True
2,B0049NYI7K,BAILEYS,9.99,11194.0,4.0,3.93,22.0,5.0,3827.0,6.0,235.0,"Beer, Wine & Spirits",True
3,B016DEGFWI,Tassimo,14.87,13492.0,2.0,5.19,11.0,5.0,28800.0,5.0,466.0,Fresh & Chilled,True
4,B07JVF2FCV,Tassimo,45.0,13377.0,6.0,11.03,1.0,5.0,27494.0,9.0,223.0,Drinks,True


In [6]:
# One indicator column per distinct value of `category`.
category_dummies = pd.get_dummies(df['category'])

# Modelling frame: identifier-like columns removed, category indicators appended on the right.
# NOTE(review): `bsr` may be derived from sales - confirm it is not target leakage.
df_model = pd.concat(
    [df.drop(columns=['asin', 'brand']), category_dummies],
    axis=1,
)

# Target is `sales`; features are everything else except the raw `category` text column,
# which is already represented by the indicator columns.
y = df_model['sales']
X = df_model.drop(columns=['sales', 'category'])

In [7]:
# Seed for the stochastic estimators; same value as the train/test split so a
# Restart & Run All reproduces the logged metrics.
RANDOM_STATE = 123

# Candidate regressors. LinearRegression is the baseline; the other three are
# tuned later via grid search.
lr = LinearRegression()
en = ElasticNet()
# Tree-based models break ties / bootstrap randomly, so they are seeded explicitly.
dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
rf = RandomForestRegressor(random_state=RANDOM_STATE)

models = [lr, en, dt, rf]

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [9]:
# Hyperparameter grids, one per tunable estimator. Keys are the estimator's
# constructor argument names; values are the candidates to try.

# ElasticNet: regularisation strength and L1/L2 mixing ratio.
en_params = dict(
    alpha=[0.1, 1],
    l1_ratio=[0, 0.5, 1],
)

# Decision tree: depth limit and minimum sample counts for splits and leaves.
dt_params = dict(
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 3, 5],
)

# Random forest: ensemble size, depth limit and minimum samples per split.
rf_params = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
)


# Baseline Model - Linear Regression

In [18]:
# Baseline: fit plain linear regression and record its hold-out metrics in a
# dedicated, named MLflow run.
with mlflow.start_run(run_name='LinearRegression') as run:

    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which
    # newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Test-set metrics, logged explicitly under fixed keys.
    mlflow.log_metrics({'RMSE': rmse,
                        'MAE': mae,
                        'R2': r2})

    print('Test RMSE', rmse)
    print('Test MAE', mae)
    print('Test R2', r2)



Test RMSE 1229.6854779511893
Test MAE 540.7653672163756
Test R2 0.4933493760092217


# Other Models and Hyperparameter Optimization

In [19]:
# Pair each tunable estimator with its hyperparameter grid as (estimator, grid) tuples.
model_list = list(zip(
    [en, dt, rf],
    [en_params, dt_params, rf_params],
))

In [20]:
# Tune each estimator with a grid search and record its hold-out metrics.
for model, pg in model_list:
    # One MLflow run per estimator, named after its class so the runs are easy
    # to tell apart in the tracking UI.
    with mlflow.start_run(run_name=type(model).__name__) as run:

        # Exhaustive search over the grid using all cores; fold count and
        # scoring are left at GridSearchCV's defaults.
        cv = GridSearchCV(model,
                          param_grid=pg,
                          n_jobs=-1,
                          verbose=1)

        cv.fit(X_train, y_train)

        y_pred = cv.predict(X_test)

        # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which
        # newer scikit-learn releases deprecate and then remove.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Test-set metrics, logged under the same keys as the baseline run.
        mlflow.log_metrics({'RMSE': rmse,
                            'MAE': mae,
                            'R2': r2})



Fitting 5 folds for each of 6 candidates, totalling 30 fits


2023/10/06 21:30:25 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


Fitting 5 folds for each of 27 candidates, totalling 135 fits


2023/10/06 21:30:34 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


Fitting 5 folds for each of 27 candidates, totalling 135 fits


2023/10/06 21:32:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.
