# Project 3: Munchies

In [1]:
import os
import numpy as np 
import pandas as pd 
import copy
import matplotlib.pyplot as plt

In [2]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

### Loading in the data

In [3]:
BrandAverageRetailPrice_df = pd.read_csv("./data/BrandAverageRetailPrice.csv")
BrandDetails_df = pd.read_csv("./data/BrandDetails.csv")
BrandTotalSales_df = pd.read_csv("./data/BrandTotalSales.csv")
BrandTotalUnits_df = pd.read_csv("./data/BrandTotalUnits.csv")
Top50Products_df = pd.read_csv("./data/Top50ProductsbyTotalSales-Timeseries.csv")

### 1. Merge Datasets and Effectively Link information and 2. Develop basic Time Series Feature Extraction Plan and 3. Create additional data feature engineering plan and implement it (no need to pipeline this)

In [4]:
BrandTotalSales_df.columns = ["Months", "Brands", "Total Sales ($)"]

# Tidying up brand units data
BrandTotalUnits_df['Months'] = pd.to_datetime(BrandTotalUnits_df['Months'])
BrandTotalUnits_df['Total Units'] = BrandTotalUnits_df['Total Units'].str[:8]
BrandTotalUnits_df['Total Units'] = BrandTotalUnits_df['Total Units'].str.replace(',', '').astype(float)
BrandTotalUnits_df['Total Units'] = pd.to_numeric(BrandTotalUnits_df['Total Units'])

BrandTotalUnits_df.info()

# Tidying up brand total sales data
BrandTotalSales_df['Months'] = pd.to_datetime(BrandTotalSales_df['Months'])
BrandTotalSales_df['Total Sales ($)'] = BrandTotalSales_df['Total Sales ($)'].str[:8]
BrandTotalSales_df['Total Sales ($)'] = BrandTotalSales_df['Total Sales ($)'].str.replace(',', '')
BrandTotalSales_df['Total Sales ($)'] = pd.to_numeric(BrandTotalSales_df['Total Sales ($)'])

BrandTotalSales_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27686 entries, 0 to 27685
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Brands            27686 non-null  object        
 1   Months            27686 non-null  datetime64[ns]
 2   Total Units       25712 non-null  float64       
 3   vs. Prior Period  24935 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 865.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25279 entries, 0 to 25278
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Months           25279 non-null  datetime64[ns]
 1   Brands           25279 non-null  object        
 2   Total Sales ($)  25279 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 592.6+ KB


In [257]:
# master_df = BrandTotalUnits_df.merge(BrandTotalSales_df)
# master_df

In [5]:
brands = BrandTotalUnits_df["Brands"].unique()
# Building our main df that we will base our analysis on
main_df = BrandTotalUnits_df[BrandTotalUnits_df.Brands==brands[0]]
main_df.loc[:, 'Previous Month Total Units'] = main_df.loc[:, 'Total Units'].shift(1)
main_df.loc[:,'Rolling Average'] = (main_df.loc[:,'Total Units'].shift(1) + main_df.loc[:,'Total Units'].shift(2) + main_df.loc[:,'Total Units'].shift(3))/3

for i in range(1, len(brands)):
    temp_df = BrandTotalUnits_df[BrandTotalUnits_df.Brands==brands[i]]
    temp_df.loc[:, 'Previous Month Total Units'] = temp_df.loc[:, 'Total Units'].shift(1)
    temp_df.loc[:,'Rolling Average'] = (temp_df.loc[:,'Total Units'].shift(1) + temp_df.loc[:,'Total Units'].shift(2) + temp_df.loc[:,'Total Units'].shift(3))/3
    main_df = main_df.append(temp_df)
    
main_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,Brands,Months,Total Units,vs. Prior Period,Previous Month Total Units,Rolling Average
0,#BlackSeries,2020-08-01,1616.3300,,,
1,#BlackSeries,2020-09-01,,-1.000000,1616.3300,
2,#BlackSeries,2021-01-01,715.5328,,,
3,#BlackSeries,2021-02-01,766.6691,0.071466,715.5328,
4,#BlackSeries,2021-03-01,,-1.000000,766.6691,
...,...,...,...,...,...,...
27681,Zuma Topicals,2019-08-01,312.5153,,,
27682,Zuma Topicals,2019-09-01,464.3063,0.485707,312.5153,
27683,Zuma Topicals,2019-10-01,348.0579,-0.250370,464.3063,
27684,Zuma Topicals,2019-11-01,135.9220,-0.609484,348.0579,374.959833


### Merging the datasets

In [6]:
main_df = main_df.merge(BrandTotalSales_df)
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25279 entries, 0 to 25278
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Brands                      25279 non-null  object        
 1   Months                      25279 non-null  datetime64[ns]
 2   Total Units                 25279 non-null  float64       
 3   vs. Prior Period            22961 non-null  float64       
 4   Previous Month Total Units  22961 non-null  float64       
 5   Rolling Average             19100 non-null  float64       
 6   Total Sales ($)             25279 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 1.5+ MB


Before we move on, I have a feeling that having year might be important. So, using months, we can get a new column:

In [7]:
def get_year(x):
    return x.year
main_df["Year"] = list(map(get_year, main_df["Months"]))
main_df.head()

Unnamed: 0,Brands,Months,Total Units,vs. Prior Period,Previous Month Total Units,Rolling Average,Total Sales ($),Year
0,#BlackSeries,2020-08-01,1616.33,,,,25352.1,2020
1,#BlackSeries,2021-01-01,715.5328,,,,9739.42,2021
2,#BlackSeries,2021-02-01,766.6691,0.071466,715.5328,,9102.8,2021
3,101 Cannabis Co.,2019-11-01,131.0677,,,,4465.04,2019
4,101 Cannabis Co.,2020-01-01,345.4134,,,,11790.6,2020


## 3. Create additional data feature engineering plan and implement it 

I want to add these new categories: percentage of products that a business has that is flower based, binary categorical feature that tells us if business provides inhalables or edible, binary categorical feature that tells us if business provides vape option, count of products in their profuct portfolio, percentage of products in their portfolio that is concentrate based

In [8]:
BrandDetails_df.columns = [c.replace(' ', '_') for c in BrandDetails_df.columns]
BrandDetails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144977 entries, 0 to 144976
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   State                144977 non-null  object 
 1   Channel              144977 non-null  object 
 2   Category_L1          144977 non-null  object 
 3   Category_L2          144977 non-null  object 
 4   Category_L3          144245 non-null  object 
 5   Category_L4          102618 non-null  object 
 6   Category_L5          50135 non-null   object 
 7   Brand                144977 non-null  object 
 8   Product_Description  144977 non-null  object 
 9   Total_Sales_($)      144977 non-null  object 
 10  Total_Units          144977 non-null  object 
 11  ARP                  144977 non-null  float64
 12  Flavor               7807 non-null    object 
 13  Items_Per_Pack       144977 non-null  int64  
 14  Item_Weight          64454 non-null   object 
 15  Total_THC        

In [9]:
# count of products in their product portfolio
# percentage of products that a business has that is flower based
# percentage of products in their portfolio that is concentrate based
for brand in brands:
    brand_product_count = BrandDetails_df[BrandDetails_df.Brand == brand].shape[0]
    flower_count = BrandDetails_df[(BrandDetails_df.Brand == brand) & (BrandDetails_df["Category_L2"] == "Flower")].shape[0]
    concentrates_count = BrandDetails_df[(BrandDetails_df.Brand == brand) & (BrandDetails_df["Category_L2"] == "Concentrates")].shape[0]
    if (brand_product_count == 0):
        brand_product_count = None
        main_df.loc[main_df.Brands==brand,"Product Count"] = brand_product_count
        main_df.loc[main_df.Brands==brand,"Flower Ratio"] = None
        main_df.loc[main_df.Brands==brand,"Concentrates Ratio"] = None
    else:
        main_df.loc[main_df.Brands==brand,"Product Count"] = brand_product_count
        main_df.loc[main_df.Brands==brand,"Flower Ratio"] = flower_count/brand_product_count
        main_df.loc[main_df.Brands==brand,"Concentrates Ratio"] = concentrates_count/brand_product_count
        


In [21]:
# binary variables
# binary categorical feature that tells us if business provides inhalables or edible
# binary categorical feature that tells us if business provides vape option

for brand in brands:
    value_inhale = 0
    value_edible = 0
    value_vape = 0
    brand_df = BrandDetails_df[BrandDetails_df.Brand == brand]
    if 'Inhaleables' in brand_df['Category_L1'].values:
        value_inhale = 1
    if 'Edibles' in brand_df['Category_L2'].values:
        value_edible = 1
    if 'Vape' in brand_df['Category_L3'].values:
        value_vape = 1
    
    main_df.loc[main_df.Brands==brand,"Inhaleables"] = value_inhale
    main_df.loc[main_df.Brands==brand,"Edibles"] = value_edible
    main_df.loc[main_df.Brands==brand,"Vape"] = value_vape

In [22]:
main_df
print(np.corrcoef(main_df.dropna()["Category_L1"], main_df.dropna()["Total Sales ($)"]))

KeyError: 'Category_L1'

Now, we are left with the task of: dropping fields, deploy an imputation strategy and augment at least one feature. The features I want to drop now is the Brands and Months, since we have collected and matched all the brand-wise data and have extracted the year from the month which I think its the most important. We also have extracted time series features, so Months will no longer be needed

In [265]:
main_df = main_df.drop("Brands",axis=1)
main_df = main_df.drop("Months",axis=1)

main_df.info()
main_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25279 entries, 0 to 25278
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Total Units                 25279 non-null  float64
 1   vs. Prior Period            22961 non-null  float64
 2   Previous Month Total Units  22961 non-null  float64
 3   Rolling Average             19100 non-null  float64
 4   Total Sales ($)             25279 non-null  float64
 5   Year                        25279 non-null  int64  
 6   Product Count               21472 non-null  float64
 7   Flower Ratio                21472 non-null  float64
 8   Concentrates Ratio          21472 non-null  float64
 9   Inhaleables                 25279 non-null  float64
 10  Edibles                     25279 non-null  float64
 11  Vape                        25279 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 2.5 MB


Unnamed: 0,Total Units,vs. Prior Period,Previous Month Total Units,Rolling Average,Total Sales ($),Year,Product Count,Flower Ratio,Concentrates Ratio,Inhaleables,Edibles,Vape
0,1616.3300,,,,25352.10,2020,4.0,1.0,0.000000,1.0,0.0,0.0
1,715.5328,,,,9739.42,2021,4.0,1.0,0.000000,1.0,0.0,0.0
2,766.6691,0.071466,715.5328,,9102.80,2021,4.0,1.0,0.000000,1.0,0.0,0.0
3,131.0677,,,,4465.04,2019,77.0,0.0,0.935065,1.0,0.0,0.0
4,345.4134,,,,11790.60,2020,77.0,0.0,0.935065,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
25274,604.2388,-0.636775,1663.5300,,5571.29,2020,,,,0.0,0.0,0.0
25275,312.5153,,,,9874.92,2019,,,,0.0,0.0,0.0
25276,464.3063,0.485707,312.5153,,17579.00,2019,,,,0.0,0.0,0.0
25277,348.0579,-0.250370,464.3063,,12024.00,2019,,,,0.0,0.0,0.0


To augment, I would want to creat a field that captures the absolute number of products that are flower based.

In [266]:
main_df["Num Flowers"] = main_df["Flower Ratio"] * main_df["Product Count"]

Finally, lets drop all rows with na values

In [267]:
main_df = main_df.dropna()

In [268]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16951 entries, 7 to 25261
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Total Units                 16951 non-null  float64
 1   vs. Prior Period            16951 non-null  float64
 2   Previous Month Total Units  16951 non-null  float64
 3   Rolling Average             16951 non-null  float64
 4   Total Sales ($)             16951 non-null  float64
 5   Year                        16951 non-null  int64  
 6   Product Count               16951 non-null  float64
 7   Flower Ratio                16951 non-null  float64
 8   Concentrates Ratio          16951 non-null  float64
 9   Inhaleables                 16951 non-null  float64
 10  Edibles                     16951 non-null  float64
 11  Vape                        16951 non-null  float64
 12  Num Flowers                 16951 non-null  float64
dtypes: float64(12), int64(1)
memory

## 4. Implement a basic Linear Regression predictive model for statistical hypothesis testing

Target: Total Sales, Label: Everything else left

In [269]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Splitting target and label
X = main_df.drop("Total Sales ($)", axis=1)
y = main_df["Total Sales ($)"]

# 70/30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

# Scaling
categorical_features = ["Inhaleables", "Edibles", "Vape"]
numerical_features = [x for x in X.columns if x not in categorical_features]
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, numerical_features),
        ("cat", 'passthrough',categorical_features)
    ])

X_train_transformed = full_pipeline.fit_transform(X_train)

In [270]:
# Training a linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train_transformed, y_train)

# test data predictions
X_test_transformed = full_pipeline.transform(X_test)
predictions = lin_reg.predict(X_test_transformed)
regression_results(list(y_test), predictions)

explained_variance:  0.1878
mean_squared_log_error:  2.8389
r2:  0.1875
MAE:  137164.3198
MSE:  36528367982.9239
RMSE:  191123.9597


In [271]:
import statsmodels.api as sm
est = sm.OLS(y_train, X_train_transformed)
est2 = est.fit()
print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:        Total Sales ($)   R-squared (uncentered):                   0.531
Model:                            OLS   Adj. R-squared (uncentered):              0.530
Method:                 Least Squares   F-statistic:                              1117.
Date:                Wed, 16 Mar 2022   Prob (F-statistic):                        0.00
Time:                        04:17:11   Log-Likelihood:                     -1.6102e+05
No. Observations:               11865   AIC:                                  3.221e+05
Df Residuals:                   11853   BIC:                                  3.222e+05
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## 6. Employ an ensemble method to your predictive model exercise 

In [272]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor


In [273]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)
print(y_train.shape)
print(y_test.shape)

(11865, 12)
(5086, 12)
(11865,)
(5086,)


In [274]:
rfr = RandomForestRegressor(n_estimators=100, max_depth=5, max_features=2)
rfr.fit(X_train_transformed, y_train)
rfr_preds = rfr.predict(X_test)
regression_results(y_test, rfr_preds)

explained_variance:  0.5814
mean_squared_log_error:  1.1559
r2:  0.5814
MAE:  84465.3857
MSE:  18820007681.2188
RMSE:  137186.0331


## 7. Cross-Validate your training results 

Here, we will perform a 7-Fold cros validation for both the single regression and ensemble method (random forest)

In [278]:
# Single
from sklearn.model_selection import train_test_split, KFold

kf = KFold(n_splits=7, shuffle=True)
lin_reg = LinearRegression()
sum_explained_variance=0
sum_mean_absolute_error=0
sum_mse=0
sum_mean_squared_log_error=0
sum_median_absolute_error=0
sum_r2=0

df_transformed = pd.DataFrame(full_pipeline.transform(main_df))

for train_index, test_index in kf.split(df_transformed):
    #seperating data
    train_x = df_transformed.iloc[train_index, :]
    test_x = df_transformed.iloc[test_index, :]
    
    train_y = y.iloc[train_index,]
    test_y = y.iloc[test_index,]
    
    # training data
    lin_reg.fit(train_x, train_y)
    predictions = lin_reg.predict(test_x)
    
    test_y = list(test_y)

    # summing
    sum_explained_variance+=metrics.explained_variance_score(test_y, predictions)
    sum_mean_absolute_error+=metrics.mean_absolute_error(test_y, predictions) 
    sum_mse+=metrics.mean_squared_error(test_y, predictions) 
    sum_mean_squared_log_error+=metrics.mean_squared_log_error(test_y, predictions)
    sum_median_absolute_error+=metrics.median_absolute_error(test_y, predictions)
    sum_r2+=metrics.r2_score(test_y, predictions)
    
print("Average explained_variance: " + str(sum_explained_variance/7))
print("Average mean_absolute_error: " + str(sum_mean_absolute_error/7))
print("Average mse: " + str(sum_mse/7))
print("Average mean_squared_log_error: " + str(sum_mean_squared_log_error/7))
print("Average median_absolute_error: " + str(sum_median_absolute_error/7))
print("Average r2: " + str(sum_r2/7))

Average explained_variance: 0.18976235959269203
Average mean_absolute_error: 136151.6546924289
Average mse: 35519990636.49994
Average mean_squared_log_error: 2.8062073226861948
Average median_absolute_error: 99695.340314106
Average r2: 0.18946115358727558




In [279]:
# Ensemble

kf = KFold(n_splits=7, shuffle=True)
rfr = RandomForestRegressor(n_estimators=100, max_depth=5, max_features=2)
sum_explained_variance=0
sum_mean_absolute_error=0
sum_mse=0
sum_mean_squared_log_error=0
sum_median_absolute_error=0
sum_r2=0

for train_index, test_index in kf.split(df_transformed):
    #seperating data
    train_x = df_transformed.iloc[train_index, :]
    test_x = df_transformed.iloc[test_index, :]
    
    train_y = y.iloc[train_index,]
    test_y = y.iloc[test_index,]
    
    # training data
    rfr.fit(train_x, train_y)
    predictions = rfr.predict(test_x)
    
    test_y = list(test_y)

    # summing
    sum_explained_variance+=metrics.explained_variance_score(test_y, predictions)
    sum_mean_absolute_error+=metrics.mean_absolute_error(test_y, predictions) 
    sum_mse+=metrics.mean_squared_error(test_y, predictions) 
    sum_mean_squared_log_error+=metrics.mean_squared_log_error(test_y, predictions)
    sum_median_absolute_error+=metrics.median_absolute_error(test_y, predictions)
    sum_r2+=metrics.r2_score(test_y, predictions)
print("Average explained_variance: " + str(sum_explained_variance/7))
print("Average mean_absolute_error: " + str(sum_mean_absolute_error/7))
print("Average mse: " + str(sum_mse/7))
print("Average mean_squared_log_error: " + str(sum_mean_squared_log_error/7))
print("Average median_absolute_error: " + str(sum_median_absolute_error/7))
print("Average r2: " + str(sum_r2/7))

Average explained_variance: 0.5861490333827855
Average mean_absolute_error: 82917.7511938993
Average mse: 18167760217.9628
Average mean_squared_log_error: 1.1160411719968477
Average median_absolute_error: 37573.764265879254
Average r2: 0.5858325789422573


## 8. Employ a GridSearch method to optimize your parameters

In [280]:
param_grid = param_grid = [
  {'n_estimators': [1, 10, 100], 'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': [1,2,3,4,5,6,7,8,9,10,"auto", "sqrt", "log2"]},  
]
rfr_CV = GridSearchCV(estimator=rfr, param_grid=param_grid, scoring='r2',verbose=3, cv=2)
rfr_CV.fit(X_train, y_train)
print(rfr_CV.best_params_)


# {'max_depth': 30, 'max_features': 'auto', 'n_estimators': 100}


Fitting 2 folds for each of 429 candidates, totalling 858 fits
[CV] max_depth=5, max_features=1, n_estimators=1 .....................
[CV]  max_depth=5, max_features=1, n_estimators=1, score=0.383, total=   0.0s
[CV] max_depth=5, max_features=1, n_estimators=1 .....................
[CV]  max_depth=5, max_features=1, n_estimators=1, score=0.243, total=   0.0s
[CV] max_depth=5, max_features=1, n_estimators=10 ....................
[CV]  max_depth=5, max_features=1, n_estimators=10, score=0.526, total=   0.0s
[CV] max_depth=5, max_features=1, n_estimators=10 ....................
[CV]  max_depth=5, max_features=1, n_estimators=10, score=0.553, total=   0.0s
[CV] max_depth=5, max_features=1, n_estimators=100 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=5, max_features=1, n_estimators=100, score=0.549, total=   0.2s
[CV] max_depth=5, max_features=1, n_estimators=100 ...................
[CV]  max_depth=5, max_features=1, n_estimators=100, score=0.548, total=   0.3s
[CV] max_depth=5, max_features=2, n_estimators=1 .....................
[CV]  max_depth=5, max_features=2, n_estimators=1, score=0.499, total=   0.0s
[CV] max_depth=5, max_features=2, n_estimators=1 .....................
[CV]  max_depth=5, max_features=2, n_estimators=1, score=0.481, total=   0.0s
[CV] max_depth=5, max_features=2, n_estimators=10 ....................
[CV]  max_depth=5, max_features=2, n_estimators=10, score=0.579, total=   0.0s
[CV] max_depth=5, max_features=2, n_estimators=10 ....................
[CV]  max_depth=5, max_features=2, n_estimators=10, score=0.586, total=   0.0s
[CV] max_depth=5, max_features=2, n_estimators=100 ...................
[CV]  max_depth=5, max_features=2, n_estimators=100, score=0.584, total=   0.3s
[CV] max_depth=5, ma

[CV]  max_depth=5, max_features=10, n_estimators=100, score=0.645, total=   0.9s
[CV] max_depth=5, max_features=auto, n_estimators=1 ..................
[CV]  max_depth=5, max_features=auto, n_estimators=1, score=0.603, total=   0.0s
[CV] max_depth=5, max_features=auto, n_estimators=1 ..................
[CV]  max_depth=5, max_features=auto, n_estimators=1, score=0.578, total=   0.0s
[CV] max_depth=5, max_features=auto, n_estimators=10 .................
[CV]  max_depth=5, max_features=auto, n_estimators=10, score=0.628, total=   0.1s
[CV] max_depth=5, max_features=auto, n_estimators=10 .................
[CV]  max_depth=5, max_features=auto, n_estimators=10, score=0.643, total=   0.1s
[CV] max_depth=5, max_features=auto, n_estimators=100 ................
[CV]  max_depth=5, max_features=auto, n_estimators=100, score=0.633, total=   0.9s
[CV] max_depth=5, max_features=auto, n_estimators=100 ................
[CV]  max_depth=5, max_features=auto, n_estimators=100, score=0.651, total=   0.9s
[

[CV]  max_depth=10, max_features=7, n_estimators=10, score=0.742, total=   0.1s
[CV] max_depth=10, max_features=7, n_estimators=100 ..................
[CV]  max_depth=10, max_features=7, n_estimators=100, score=0.731, total=   1.0s
[CV] max_depth=10, max_features=7, n_estimators=100 ..................
[CV]  max_depth=10, max_features=7, n_estimators=100, score=0.753, total=   1.1s
[CV] max_depth=10, max_features=8, n_estimators=1 ....................
[CV]  max_depth=10, max_features=8, n_estimators=1, score=0.548, total=   0.0s
[CV] max_depth=10, max_features=8, n_estimators=1 ....................
[CV]  max_depth=10, max_features=8, n_estimators=1, score=0.609, total=   0.0s
[CV] max_depth=10, max_features=8, n_estimators=10 ...................
[CV]  max_depth=10, max_features=8, n_estimators=10, score=0.713, total=   0.1s
[CV] max_depth=10, max_features=8, n_estimators=10 ...................
[CV]  max_depth=10, max_features=8, n_estimators=10, score=0.743, total=   0.1s
[CV] max_depth

[CV]  max_depth=20, max_features=3, n_estimators=10, score=0.747, total=   0.1s
[CV] max_depth=20, max_features=3, n_estimators=100 ..................
[CV]  max_depth=20, max_features=3, n_estimators=100, score=0.760, total=   1.0s
[CV] max_depth=20, max_features=3, n_estimators=100 ..................
[CV]  max_depth=20, max_features=3, n_estimators=100, score=0.769, total=   0.9s
[CV] max_depth=20, max_features=4, n_estimators=1 ....................
[CV]  max_depth=20, max_features=4, n_estimators=1, score=0.453, total=   0.0s
[CV] max_depth=20, max_features=4, n_estimators=1 ....................
[CV]  max_depth=20, max_features=4, n_estimators=1, score=0.513, total=   0.0s
[CV] max_depth=20, max_features=4, n_estimators=10 ...................
[CV]  max_depth=20, max_features=4, n_estimators=10, score=0.730, total=   0.1s
[CV] max_depth=20, max_features=4, n_estimators=10 ...................
[CV]  max_depth=20, max_features=4, n_estimators=10, score=0.756, total=   0.1s
[CV] max_depth

[CV]  max_depth=20, max_features=sqrt, n_estimators=100, score=0.764, total=   0.9s
[CV] max_depth=20, max_features=sqrt, n_estimators=100 ...............
[CV]  max_depth=20, max_features=sqrt, n_estimators=100, score=0.773, total=   0.9s
[CV] max_depth=20, max_features=log2, n_estimators=1 .................
[CV]  max_depth=20, max_features=log2, n_estimators=1, score=0.501, total=   0.0s
[CV] max_depth=20, max_features=log2, n_estimators=1 .................
[CV]  max_depth=20, max_features=log2, n_estimators=1, score=0.416, total=   0.0s
[CV] max_depth=20, max_features=log2, n_estimators=10 ................
[CV]  max_depth=20, max_features=log2, n_estimators=10, score=0.740, total=   0.1s
[CV] max_depth=20, max_features=log2, n_estimators=10 ................
[CV]  max_depth=20, max_features=log2, n_estimators=10, score=0.740, total=   0.1s
[CV] max_depth=20, max_features=log2, n_estimators=100 ...............
[CV]  max_depth=20, max_features=log2, n_estimators=100, score=0.761, total=

[CV]  max_depth=30, max_features=8, n_estimators=100, score=0.781, total=   1.7s
[CV] max_depth=30, max_features=9, n_estimators=1 ....................
[CV]  max_depth=30, max_features=9, n_estimators=1, score=0.485, total=   0.0s
[CV] max_depth=30, max_features=9, n_estimators=1 ....................
[CV]  max_depth=30, max_features=9, n_estimators=1, score=0.466, total=   0.0s
[CV] max_depth=30, max_features=9, n_estimators=10 ...................
[CV]  max_depth=30, max_features=9, n_estimators=10, score=0.734, total=   0.2s
[CV] max_depth=30, max_features=9, n_estimators=10 ...................
[CV]  max_depth=30, max_features=9, n_estimators=10, score=0.762, total=   0.2s
[CV] max_depth=30, max_features=9, n_estimators=100 ..................
[CV]  max_depth=30, max_features=9, n_estimators=100, score=0.768, total=   1.9s
[CV] max_depth=30, max_features=9, n_estimators=100 ..................
[CV]  max_depth=30, max_features=9, n_estimators=100, score=0.781, total=   1.8s
[CV] max_dept

[CV]  max_depth=40, max_features=4, n_estimators=100, score=0.774, total=   1.0s
[CV] max_depth=40, max_features=5, n_estimators=1 ....................
[CV]  max_depth=40, max_features=5, n_estimators=1, score=0.455, total=   0.0s
[CV] max_depth=40, max_features=5, n_estimators=1 ....................
[CV]  max_depth=40, max_features=5, n_estimators=1, score=0.492, total=   0.0s
[CV] max_depth=40, max_features=5, n_estimators=10 ...................
[CV]  max_depth=40, max_features=5, n_estimators=10, score=0.730, total=   0.1s
[CV] max_depth=40, max_features=5, n_estimators=10 ...................
[CV]  max_depth=40, max_features=5, n_estimators=10, score=0.752, total=   0.1s
[CV] max_depth=40, max_features=5, n_estimators=100 ..................
[CV]  max_depth=40, max_features=5, n_estimators=100, score=0.762, total=   1.2s
[CV] max_depth=40, max_features=5, n_estimators=100 ..................
[CV]  max_depth=40, max_features=5, n_estimators=100, score=0.779, total=   1.2s
[CV] max_dept

[CV]  max_depth=40, max_features=log2, n_estimators=100, score=0.774, total=   0.9s
[CV] max_depth=50, max_features=1, n_estimators=1 ....................
[CV]  max_depth=50, max_features=1, n_estimators=1, score=0.343, total=   0.0s
[CV] max_depth=50, max_features=1, n_estimators=1 ....................
[CV]  max_depth=50, max_features=1, n_estimators=1, score=0.367, total=   0.0s
[CV] max_depth=50, max_features=1, n_estimators=10 ...................
[CV]  max_depth=50, max_features=1, n_estimators=10, score=0.714, total=   0.1s
[CV] max_depth=50, max_features=1, n_estimators=10 ...................
[CV]  max_depth=50, max_features=1, n_estimators=10, score=0.729, total=   0.1s
[CV] max_depth=50, max_features=1, n_estimators=100 ..................
[CV]  max_depth=50, max_features=1, n_estimators=100, score=0.752, total=   0.6s
[CV] max_depth=50, max_features=1, n_estimators=100 ..................
[CV]  max_depth=50, max_features=1, n_estimators=100, score=0.760, total=   0.6s
[CV] max_d

[CV]  max_depth=50, max_features=10, n_estimators=10, score=0.746, total=   0.2s
[CV] max_depth=50, max_features=10, n_estimators=10 ..................
[CV]  max_depth=50, max_features=10, n_estimators=10, score=0.758, total=   0.2s
[CV] max_depth=50, max_features=10, n_estimators=100 .................
[CV]  max_depth=50, max_features=10, n_estimators=100, score=0.767, total=   2.0s
[CV] max_depth=50, max_features=10, n_estimators=100 .................
[CV]  max_depth=50, max_features=10, n_estimators=100, score=0.782, total=   2.0s
[CV] max_depth=50, max_features=auto, n_estimators=1 .................
[CV]  max_depth=50, max_features=auto, n_estimators=1, score=0.525, total=   0.0s
[CV] max_depth=50, max_features=auto, n_estimators=1 .................
[CV]  max_depth=50, max_features=auto, n_estimators=1, score=0.543, total=   0.0s
[CV] max_depth=50, max_features=auto, n_estimators=10 ................
[CV]  max_depth=50, max_features=auto, n_estimators=10, score=0.747, total=   0.2s
[

[CV]  max_depth=60, max_features=6, n_estimators=10, score=0.753, total=   0.2s
[CV] max_depth=60, max_features=6, n_estimators=100 ..................
[CV]  max_depth=60, max_features=6, n_estimators=100, score=0.767, total=   1.4s
[CV] max_depth=60, max_features=6, n_estimators=100 ..................
[CV]  max_depth=60, max_features=6, n_estimators=100, score=0.774, total=   1.4s
[CV] max_depth=60, max_features=7, n_estimators=1 ....................
[CV]  max_depth=60, max_features=7, n_estimators=1, score=0.487, total=   0.0s
[CV] max_depth=60, max_features=7, n_estimators=1 ....................
[CV]  max_depth=60, max_features=7, n_estimators=1, score=0.477, total=   0.0s
[CV] max_depth=60, max_features=7, n_estimators=10 ...................
[CV]  max_depth=60, max_features=7, n_estimators=10, score=0.741, total=   0.2s
[CV] max_depth=60, max_features=7, n_estimators=10 ...................
[CV]  max_depth=60, max_features=7, n_estimators=10, score=0.760, total=   0.2s
[CV] max_depth

[CV]  max_depth=70, max_features=2, n_estimators=100, score=0.759, total=   0.7s
[CV] max_depth=70, max_features=2, n_estimators=100 ..................
[CV]  max_depth=70, max_features=2, n_estimators=100, score=0.765, total=   0.7s
[CV] max_depth=70, max_features=3, n_estimators=1 ....................
[CV]  max_depth=70, max_features=3, n_estimators=1, score=0.452, total=   0.0s
[CV] max_depth=70, max_features=3, n_estimators=1 ....................
[CV]  max_depth=70, max_features=3, n_estimators=1, score=0.464, total=   0.0s
[CV] max_depth=70, max_features=3, n_estimators=10 ...................
[CV]  max_depth=70, max_features=3, n_estimators=10, score=0.739, total=   0.1s
[CV] max_depth=70, max_features=3, n_estimators=10 ...................
[CV]  max_depth=70, max_features=3, n_estimators=10, score=0.754, total=   0.1s
[CV] max_depth=70, max_features=3, n_estimators=100 ..................
[CV]  max_depth=70, max_features=3, n_estimators=100, score=0.761, total=   0.9s
[CV] max_dept

[CV]  max_depth=70, max_features=auto, n_estimators=100, score=0.783, total=   2.3s
[CV] max_depth=70, max_features=sqrt, n_estimators=1 .................
[CV]  max_depth=70, max_features=sqrt, n_estimators=1, score=0.474, total=   0.0s
[CV] max_depth=70, max_features=sqrt, n_estimators=1 .................
[CV]  max_depth=70, max_features=sqrt, n_estimators=1, score=0.501, total=   0.0s
[CV] max_depth=70, max_features=sqrt, n_estimators=10 ................
[CV]  max_depth=70, max_features=sqrt, n_estimators=10, score=0.730, total=   0.1s
[CV] max_depth=70, max_features=sqrt, n_estimators=10 ................
[CV]  max_depth=70, max_features=sqrt, n_estimators=10, score=0.739, total=   0.1s
[CV] max_depth=70, max_features=sqrt, n_estimators=100 ...............
[CV]  max_depth=70, max_features=sqrt, n_estimators=100, score=0.761, total=   0.9s
[CV] max_depth=70, max_features=sqrt, n_estimators=100 ...............
[CV]  max_depth=70, max_features=sqrt, n_estimators=100, score=0.772, total=

[CV]  max_depth=80, max_features=7, n_estimators=100, score=0.780, total=   1.6s
[CV] max_depth=80, max_features=8, n_estimators=1 ....................
[CV]  max_depth=80, max_features=8, n_estimators=1, score=0.505, total=   0.0s
[CV] max_depth=80, max_features=8, n_estimators=1 ....................
[CV]  max_depth=80, max_features=8, n_estimators=1, score=0.469, total=   0.0s
[CV] max_depth=80, max_features=8, n_estimators=10 ...................
[CV]  max_depth=80, max_features=8, n_estimators=10, score=0.742, total=   0.2s
[CV] max_depth=80, max_features=8, n_estimators=10 ...................
[CV]  max_depth=80, max_features=8, n_estimators=10, score=0.757, total=   0.2s
[CV] max_depth=80, max_features=8, n_estimators=100 ..................
[CV]  max_depth=80, max_features=8, n_estimators=100, score=0.766, total=   1.7s
[CV] max_depth=80, max_features=8, n_estimators=100 ..................
[CV]  max_depth=80, max_features=8, n_estimators=100, score=0.780, total=   1.7s
[CV] max_dept

[CV]  max_depth=90, max_features=3, n_estimators=100, score=0.771, total=   0.9s
[CV] max_depth=90, max_features=4, n_estimators=1 ....................
[CV]  max_depth=90, max_features=4, n_estimators=1, score=0.455, total=   0.0s
[CV] max_depth=90, max_features=4, n_estimators=1 ....................
[CV]  max_depth=90, max_features=4, n_estimators=1, score=0.477, total=   0.0s
[CV] max_depth=90, max_features=4, n_estimators=10 ...................
[CV]  max_depth=90, max_features=4, n_estimators=10, score=0.740, total=   0.1s
[CV] max_depth=90, max_features=4, n_estimators=10 ...................
[CV]  max_depth=90, max_features=4, n_estimators=10, score=0.765, total=   0.1s
[CV] max_depth=90, max_features=4, n_estimators=100 ..................
[CV]  max_depth=90, max_features=4, n_estimators=100, score=0.759, total=   1.1s
[CV] max_depth=90, max_features=4, n_estimators=100 ..................
[CV]  max_depth=90, max_features=4, n_estimators=100, score=0.773, total=   1.1s
[CV] max_dept

[CV]  max_depth=90, max_features=sqrt, n_estimators=100, score=0.771, total=   0.9s
[CV] max_depth=90, max_features=log2, n_estimators=1 .................
[CV]  max_depth=90, max_features=log2, n_estimators=1, score=0.478, total=   0.0s
[CV] max_depth=90, max_features=log2, n_estimators=1 .................
[CV]  max_depth=90, max_features=log2, n_estimators=1, score=0.452, total=   0.0s
[CV] max_depth=90, max_features=log2, n_estimators=10 ................
[CV]  max_depth=90, max_features=log2, n_estimators=10, score=0.731, total=   0.1s
[CV] max_depth=90, max_features=log2, n_estimators=10 ................
[CV]  max_depth=90, max_features=log2, n_estimators=10, score=0.751, total=   0.1s
[CV] max_depth=90, max_features=log2, n_estimators=100 ...............
[CV]  max_depth=90, max_features=log2, n_estimators=100, score=0.761, total=   0.9s
[CV] max_depth=90, max_features=log2, n_estimators=100 ...............
[CV]  max_depth=90, max_features=log2, n_estimators=100, score=0.768, total=

[CV]  max_depth=100, max_features=8, n_estimators=100, score=0.781, total=   1.7s
[CV] max_depth=100, max_features=9, n_estimators=1 ...................
[CV]  max_depth=100, max_features=9, n_estimators=1, score=0.486, total=   0.0s
[CV] max_depth=100, max_features=9, n_estimators=1 ...................
[CV]  max_depth=100, max_features=9, n_estimators=1, score=0.522, total=   0.0s
[CV] max_depth=100, max_features=9, n_estimators=10 ..................
[CV]  max_depth=100, max_features=9, n_estimators=10, score=0.742, total=   0.2s
[CV] max_depth=100, max_features=9, n_estimators=10 ..................
[CV]  max_depth=100, max_features=9, n_estimators=10, score=0.761, total=   0.2s
[CV] max_depth=100, max_features=9, n_estimators=100 .................
[CV]  max_depth=100, max_features=9, n_estimators=100, score=0.765, total=   1.9s
[CV] max_depth=100, max_features=9, n_estimators=100 .................
[CV]  max_depth=100, max_features=9, n_estimators=100, score=0.783, total=   1.8s
[CV] m

[Parallel(n_jobs=1)]: Done 858 out of 858 | elapsed:  6.5min finished


{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 100}


In [282]:
rfr_optimized = RandomForestRegressor(n_estimators=100, max_depth=30, max_features='auto')
rfr_optimized.fit(X_train_transformed, y_train)
rfr_optimized_preds = rfr_optimized.predict(X_test)
regression_results(y_test, rfr_optimized_preds)

explained_variance:  0.7962
mean_squared_log_error:  0.3545
r2:  0.7961
MAE:  44787.2424
MSE:  9165170695.9605
RMSE:  95734.898


## 9. Experiment with your own custom models and report out your highest performing model 

To further experiment with more models, I will be exploring Ridge regression and perform a grid search on its alpha parameter to see which gives us the best among ridge regerssions, and if the best ridge regression is better than the best what we have right now. (grid search on rfr). But first, lets choose an arbitratry alpha (=0.5)

In [287]:
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train_transformed, y_train)
reg_preds = reg.predict(X_test)
regression_results(y_test, reg_preds)

explained_variance:  0.1878
mean_squared_log_error:  2.8438
r2:  0.1877
MAE:  137325.6109
MSE:  36522442603.8896
RMSE:  191108.4577


Here, we see that by performing  ridge, we dont get that big of a boost comapred to standard linear regression, and let alone the grid search we did on rfr. Lets try grid search on this

In [289]:
param_grid = param_grid = [
  {'alpha': [0.1,1, 10, 100,1000,10000]},  
]

reg_CV = GridSearchCV(estimator=reg, param_grid=param_grid, scoring='r2',verbose=3, cv=2)
reg_CV.fit(X_train_transformed, y_train)
print(reg_CV.best_params_)

# {'alpha': 10}


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.190, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.192, total=   0.0s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.190, total=   0.0s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.192, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.190, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.192, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.1s finished


In [290]:
reg_CV = linear_model.Ridge(alpha=10)
reg_CV.fit(X_train_transformed, y_train)
reg_CV_preds = reg_CV.predict(X_test)
regression_results(y_test, reg_CV_preds)

explained_variance:  0.1878
mean_squared_log_error:  2.8466
r2:  0.1877
MAE:  137341.2638
MSE:  36522711337.6038
RMSE:  191109.1608


Hmmm, no improvement here. Lets try LASSO, however it might be resulting in the same result as linear regression might not be the best for this. Lets see!

In [291]:
lasso = linear_model.Lasso(alpha=.5)
lasso.fit(X_train_transformed, y_train)
lasso_preds = lasso.predict(X_test)
regression_results(y_test, lasso_preds)

explained_variance:  0.1878
mean_squared_log_error:  2.8437
r2:  0.1877
MAE:  137324.9787
MSE:  36522438969.066
RMSE:  191108.4482


No improvement again, lets try grid search one last time

In [293]:
lasso_CV = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2',verbose=3, cv=2)
lasso_CV.fit(X_train_transformed, y_train)
print(lasso_CV.best_params_)
# {'alpha': 100}

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.190, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.192, total=   0.0s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.190, total=   0.0s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.192, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.190, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.192, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.2s finished


In [294]:
lasso_CV = linear_model.Lasso(alpha=100)
lasso_CV.fit(X_train_transformed, y_train)
lasso_CV_preds = lasso_CV.predict(X_test)
regression_results(y_test, lasso_CV_preds)

explained_variance:  0.1878
mean_squared_log_error:  2.8492
r2:  0.1876
MAE:  137365.2571
MSE:  36524430242.9245
RMSE:  191113.6579
