<a href="https://colab.research.google.com/github/carlolopez03/Prediction-of-Product-Sales/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Prediction of Sales**
##Carlo Lopez

##**Load Data**

In [18]:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  #Metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:

    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  #Predictions for training data
  y_train_pred = reg.predict(X_train)

  # Calling helper function to obtain regression metrics
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  #Predictions for test data
  y_test_pred = reg.predict(X_test)
  # Calling helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    results_df = results_df.set_index('Label')
    results_df.index.name=None
    # Returning dataframe
    return results_df.round(3)

In [3]:
#Loading data
file = '/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales_predictions_2023.csv'
df = pd.read_csv(file)
df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


##**Data Cleaning**

In [4]:
#Inspecting data
df.info()
print("\nMissing Values:", df.isna().sum().sum())
print("\nDuplicated Rows:", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB

Missing Values: 3873

Duplicated Rows: 0


In [5]:
df['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [6]:
df['Item_Fat_Content'].replace({'LF':'Low Fat', 'low fat':'Low Fat', 'reg':'Regular'}, inplace = True)
df['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

##**Machine Learning**

In [7]:
#Defining the features and target
target = 'Item_Outlet_Sales'
y = df[target]
X = df.drop(columns=[target, 'Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Item_Weight'])

#Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,0.029565,Household,256.4646,Medium,Tier 3,Supermarket Type2
7510,Regular,0.0,Snack Foods,179.766,Medium,Tier 3,Supermarket Type2
5828,Regular,0.158716,Meat,157.2946,Medium,Tier 1,Supermarket Type1
5327,Low Fat,0.014628,Baking Goods,82.325,Small,Tier 2,Supermarket Type1
4810,Low Fat,0.016645,Frozen Foods,120.9098,,Tier 2,Supermarket Type1


In [8]:
X_test.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
7503,Low Fat,0.0263,Frozen Foods,79.4302,High,Tier 3,Supermarket Type1
2957,Low Fat,0.071136,Health and Hygiene,42.7086,Small,Tier 1,Supermarket Type1
7031,Regular,0.041313,Canned,42.0454,Medium,Tier 1,Supermarket Type1
1084,Regular,0.044767,Soft Drinks,173.7054,Medium,Tier 3,Supermarket Type3
856,Regular,0.012456,Meat,197.511,Small,Tier 2,Supermarket Type1


In [9]:
#Defining numeric features
num_cols = X_train.select_dtypes('number').columns
num_cols

Index(['Item_Visibility', 'Item_MRP'], dtype='object')

In [10]:
#Processors
scaler = StandardScaler()

#Pipeline for numeric feature
num_pipe = make_pipeline(scaler)
num_pipe

In [11]:
#Defining ordinal features
ordinal_cols = ['Outlet_Location_Type', 'Outlet_Size']
loc_type_list = ['Tier 1', 'Tier 2', 'Tier 3']
size_list = ['Small', 'Medium', 'High']

#Processors
ord = OrdinalEncoder(categories=[loc_type_list, size_list])
freq_imputer = SimpleImputer(strategy='most_frequent', fill_value='Missing')

#Pipeline for ordinal features
ord_pipeline = make_pipeline(freq_imputer, ord)
ord_pipeline

In [12]:
#Defining nominal features
nominal_cols = X_train.select_dtypes('object').drop(columns=ordinal_cols).columns

#Processor
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

#Pipeline for nominal features
nom_pipeline = make_pipeline(ohe)
nom_pipeline

In [13]:
#Defining tuples
numeric_tuple = ('numeric', num_pipe, num_cols)
ohe_tuple = ('categorical', nom_pipeline, nominal_cols)
ord_tuple = ('ordinal', ord_pipeline, ordinal_cols)

#Making column transformer
col_transformer = ColumnTransformer([numeric_tuple,ord_tuple, ohe_tuple], verbose_feature_names_out=False)

#Fitting transformer
col_transformer.fit(X_train)

In [14]:
X_train_proc = col_transformer.transform(X_train)
X_train_proc.head()

Unnamed: 0,Item_Visibility,Item_MRP,Outlet_Location_Type,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,-0.712775,1.828109,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7510,-1.291052,0.603369,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5828,1.813319,0.244541,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5327,-1.004931,-0.952591,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4810,-0.965484,-0.33646,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
X_test_proc = col_transformer.transform(X_test)
X_test_proc.head()

Unnamed: 0,Item_Visibility,Item_MRP,Outlet_Location_Type,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
7503,-0.776646,-0.998816,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2957,0.100317,-1.585194,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7031,-0.482994,-1.595784,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1084,-0.41544,0.506592,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
856,-1.047426,0.886725,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [22]:
lin_reg = LinearRegression()

lin_reg.fit(X_train_proc, y_train)

In [23]:
evaluate_regression(lin_reg, X_train_proc, y_train, X_test_proc, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.428
- MSE = 1,300,527.987
- RMSE = 1,140.407
- R^2 = 0.561

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 805.272
- MSE = 1,197,408.485
- RMSE = 1,094.262
- R^2 = 0.566


This model is underfitting because there is a lot of errors.

In [24]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_proc, y_train)

In [27]:
evaluate_regression(rf, X_train_proc, y_train, X_test_proc, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 299.032
- MSE = 186,263.584
- RMSE = 431.583
- R^2 = 0.937

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 775.737
- MSE = 1,243,557.559
- RMSE = 1,115.149
- R^2 = 0.549


 underfit yadadadadadad hhfheu yaw mean

In [29]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [31]:
params = {'max_depth': [None,10,15,20],
          'n_estimators':[10,100,150,200],
          'min_samples_leaf':[2,3,4],
          'max_features':['sqrt','log2',None],
          'oob_score':[True,False],}
gridsearch = GridSearchCV(rf, params, n_jobs=-1, cv=2,verbose=1)
gridsearch.fit(X_train_proc, y_train)

Fitting 2 folds for each of 288 candidates, totalling 576 fits


In [32]:
gridsearch.best_params_

{'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 4,
 'n_estimators': 200,
 'oob_score': True}

In [33]:
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train_proc, y_train, X_test_proc, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 667.363
- MSE = 908,029.936
- RMSE = 952.906
- R^2 = 0.693

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 735.221
- MSE = 1,117,197.089
- RMSE = 1,056.975
- R^2 = 0.595


This model is overfitting because the training is performing more accurate than the testing data

I believe the random forests model has better test scores

In [None]:
model = DecisionTreeRegressor(random_state = 42)
model.fit(X_train_tf, y_train)
evaluate_regression(model, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 0.000
- MSE = 0.000
- RMSE = 0.000
- R^2 = 1.000

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 1,061.021
- MSE = 2,373,918.346
- RMSE = 1,540.753
- R^2 = 0.140


In [None]:
model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [None]:
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None], 'min_samples_split': [2, 3, 4]}

In [None]:
grid_search = GridSearchCV(model, param_grid, n_jobs = -1, verbose= 1)
grid_search.fit(X_train_tf, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


In [None]:
grid_search.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [None]:
best_model = grid_search.best_estimator_
evaluate_regression(best_model, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 762.610
- MSE = 1,172,122.773
- RMSE = 1,082.646
- R^2 = 0.604

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 738.317
- MSE = 1,118,185.973
- RMSE = 1,057.443
- R^2 = 0.595


I recommend the random forests model because it gave us the best results for the testing data(.595).

R2 in our model shows us that it is able to predict 59% of the data

The MAE in both models are more close to each other than the previous models. this tells us the predictions are more accurate

This model is overlift because it seems to recognize a pattern in the data