In [6]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data into a pandas dataframe

Train_df = pd.read_csv('train.csv')
stores_df = pd.read_csv('stores.csv')
Transaction_df = pd.read_csv('transactions.csv')
oil_df = pd.read_csv('oil.csv')
submission_df = pd.read_csv('submission.csv')
test_df = pd.read_csv('test.csv')
Holiday_df = pd.read_csv('holidays_events.csv')


merged_df = pd.merge(Train_df, stores_df, on='store_nbr', how='left')
merged_df = pd.merge(merged_df,Transaction_df, on=['store_nbr','date'], how='left')
merged_df = pd.merge(merged_df,oil_df, on=['date'], how='left')
data = pd.merge(merged_df,Holiday_df, on=['date'], how='left')

#rename column
data.rename(columns = {'type_x':'Store_type'}, inplace = True)
data.rename(columns = {'type_y':'Holiday_type'}, inplace = True)

# Drop the id column since it's not relevant for modeling
data = data.drop('id', axis=1)

# Convert the date column to a datetime object
data['date'] = pd.to_datetime(data['date'])

# Set the date column as the index
data = data.set_index('date')

#Creating the Day, Month and Year column from the Date Column
data['year'] = data.index.year
data['month'] =data.index.month
data['day']=data.index.day
#data['dayofweek']=data.index.weekday
#data['end_month']=data.index.is_month_end

#Implement the new super grouping of product family on the actual family attribute. 

data['family'] = data['family'].replace({
'AUTOMOTIVE': 'Others',
'BABY CARE': 'Personal Care',
'BEAUTY': 'Personal Care',
'BEVERAGES': 'Beverages',
'BOOKS': 'Others',
'BREAD/BAKERY': 'Food',
'CELEBRATION': 'Food',
'CLEANING': 'Others',
'DAIRY': 'Food',
'DELI': 'Food',
'EGGS': 'Food',
'FROZEN FOODS': 'Food',
'GROCERY I': 'Food',
'GROCERY II': 'Food',
'HARDWARE': 'Others',
'HOME AND KITCHEN I': 'Home and Kitchen',
'HOME AND KITCHEN II': 'Home and Kitchen',
'HOME APPLIANCES': 'Home and Kitchen',
'HOME CARE': 'Home and Kitchen',
'LADIESWEAR': 'Clothing',
'LAWN AND GARDEN': 'Others',
'LINGERIE': 'Clothing',
'LIQUOR,WINE,BEER': 'Beverages',
'MAGAZINES': 'Others',
'MEATS': 'Food',
'PERSONAL CARE': 'Personal Care',
'PET SUPPLIES': 'Others',
'PLAYERS AND ELECTRONICS': 'Others',
'POULTRY': 'Food',
'PREPARED FOODS': 'Food',
'PRODUCE': 'Food',
'SCHOOL AND OFFICE SUPPLIES': 'Others',
'SEAFOOD': 'Food'
})

data['Holiday_type'] = np.where(data['Holiday_type'].isin(['Holiday', 
                                                                     'Additional', 'Event', 'Transfer', 'Bridge']), 
                                                                                                                'Holiday', 'Workday')
data = data.drop(['locale', 'locale_name', 'description', 'state', 'transferred'], axis=1)


# Identify numeric and non-numeric columns
num_cols = data.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = data.select_dtypes(exclude=[np.number]).columns.tolist()

# Creating imputer variables
numerical_imputer = SimpleImputer(strategy = "mean")
categorical_imputer = SimpleImputer(strategy = "most_frequent")


# Define the column transformer
categorical_features = cat_cols
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto', sparse=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

from sklearn.model_selection import train_test_split

import numpy as np


# resample numeric columns by mean and categorical columns by mode
resampled = data.resample('D').agg({**{col: 'mean' for col in num_cols}, **{col: (lambda x: x.mode()[0] if not x.mode().empty else np.nan) for col in cat_cols}}).reset_index()

resampled = resampled.drop('date', axis=1)


# Filling missing values in numerical features of training set
resampled[num_cols] = numerical_imputer.fit_transform(resampled[num_cols])

resampled[cat_cols] = categorical_imputer.fit_transform(resampled[cat_cols])

# Filling missing values in numerical features of evaluation set
#X_eval_df[num_cols] = numerical_imputer.transform(X_eval_df[num_cols])


# Split the data into training and testing sets
# Calculate the number of rows in the data

n_rows = resampled.shape[0]

# Calculate the split point
split_point = int(n_rows * 0.90)

# Select the first 85% of the rows as the training data
X_train = resampled.iloc[:split_point]
y_train = X_train['sales']
X_train = X_train.drop('sales', axis=1)

# Select the remaining 15% of the rows as the validation data
X_eval = resampled.iloc[split_point:]
y_eval = X_eval['sales']
X_eval = X_eval.drop('sales', axis=1)


'''creating copy of the categorical features and numerical features
before imputing null value to avoid modifying the orginal dataset'''

num_cols.remove('sales')  # remove 'sales' from num_cols

X_train_cat = X_train[cat_cols].copy()
X_train_num = X_train[num_cols].copy()


X_eval_cat = X_eval[cat_cols].copy()
X_eval_num = X_eval[num_cols].copy()



# Fitting the Imputer
X_train_cat_imputed = categorical_imputer.fit_transform(X_train_cat)
X_train_num_imputed = numerical_imputer.fit_transform(X_train_num)

X_eval_cat_imputed = categorical_imputer.fit_transform(X_eval_cat)
X_eval_num_imputed = numerical_imputer.fit_transform(X_eval_num)


encoder=OneHotEncoder(handle_unknown='ignore')

# encoding the xtrain categories and converting to a dataframe
X_train_cat_encoded = encoder.fit(X_train_cat_imputed)
X_train_cat_encoded = pd.DataFrame(encoder.transform(X_train_cat_imputed).toarray(),
                                   columns=encoder.get_feature_names_out(cat_cols))

# encoding the xeval categories and converting to a dataframe
X_eval_cat_encoded = encoder.fit(X_eval_cat_imputed)
X_eval_cat_encoded = pd.DataFrame(encoder.transform(X_eval_cat_imputed).toarray(),
                                   columns=encoder.get_feature_names_out(cat_cols))


scaler= StandardScaler()

X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)
X_train_num_sc = pd.DataFrame(X_train_num_scaled, columns = num_cols)

X_eval_num_scaled = scaler.fit_transform(X_eval_num_imputed)
X_eval_num_sc = pd.DataFrame(X_eval_num_scaled, columns = num_cols)

X_train_df = pd.concat([X_train_num_sc,X_train_cat_encoded], axis =1)
X_eval_df = pd.concat([X_eval_num_sc,X_eval_cat_encoded], axis =1)



from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score

# create a dictionary of models to fit
models = {
    'Random Forest Regressor': RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor()
}

# iterate over the models and fit each one to the training data
for name, model in models.items():
    model.fit(X_train_df, y_train)
    
# evaluate each model using cross-validation
rmsle_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_df, y_train, cv=50, scoring='neg_mean_squared_log_error')
    rmsle_scores[name] = np.sqrt(-scores.mean())
    
# print the RMSLE scores for each model
for name, score in rmsle_scores.items():
    print(f'{name}: {score}')

# choose the model with the lowest RMSLE score
best_model_name = min(rmsle_scores, key=rmsle_scores.get)
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')


Traceback (most recent call last):
  File "C:\Users\Asiak\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Asiak\AppData\Roaming\Python\Python38\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\Asiak\AppData\Roaming\Python\Python38\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\Asiak\AppData\Roaming\Python\Python38\site-packages\sklearn\metrics\_regression.py", line 521, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



Random Forest Regressor: 0.2340414516143391
Decision Tree Regressor: 0.3627989425325001
Gradient Boosting Regressor: nan
Best model: Random Forest Regressor


In [10]:
from joblib import dump
import os

# set the destination path to the "export" directory
destination = "."

# create a dictionary to store the objects and their filenames
models = {"numerical_imputer": numerical_imputer,
          "categorical_imputer": categorical_imputer,
          "encoder": encoder,
          "scaler": scaler,
          "Final_model": best_model}

# loop through the models and save them using joblib.dump()
for name, model in models.items():
    dump(model, os.path.join(destination, f"{name}.joblib"))


In [None]:
#!pip freeze > requirements.txt

In [11]:
pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.4.11-py2.py3-none-any.whl (32 kB)
Collecting yarg
  Downloading yarg-0.1.9-py2.py3-none-any.whl (19 kB)
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py): started
  Building wheel for docopt (setup.py): finished with status 'done'
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=2c0a74905833ab0ed30f37b295e1c7899d2044ee4c448781f41545c64172c56b
  Stored in directory: c:\users\asiak\appdata\local\pip\cache\wheels\56\ea\58\ead137b087d9e326852a851351d1debf4ada529b6ac0ec4e8c
Successfully built docopt
Installing collected packages: yarg, docopt, pipreqs
Successfully installed docopt-0.6.2 pipreqs-0.4.11 yarg-0.1.9
Note: you may need to restart the kernel to use updated packages.


In [12]:
!pipreqs . --force

INFO: Successfully saved requirements file in .\requirements.txt


In [15]:
resampled.columns.tolist()

['store_nbr',
 'sales',
 'onpromotion',
 'cluster',
 'transactions',
 'dcoilwtico',
 'year',
 'month',
 'day',
 'family',
 'city',
 'Store_type',
 'Holiday_type']

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1519 entries, 0 to 1518
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   store_nbr     1519 non-null   float64
 1   onpromotion   1519 non-null   float64
 2   cluster       1519 non-null   float64
 3   transactions  1519 non-null   float64
 4   dcoilwtico    1519 non-null   float64
 5   year          1519 non-null   float64
 6   month         1519 non-null   float64
 7   day           1519 non-null   float64
 8   family        1519 non-null   object 
 9   city          1519 non-null   object 
 10  Store_type    1519 non-null   object 
 11  Holiday_type  1519 non-null   object 
dtypes: float64(8), object(4)
memory usage: 142.5+ KB


In [17]:
resampled.Holiday_type.unique()

array(['Holiday', 'Workday'], dtype=object)

In [18]:
data.to_csv('resampledCmplete.csv', index=False)

In [19]:
import sklearn
print(sklearn.__version__)


1.0.2


In [20]:
!pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-win_amd64.whl (8.3 MB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.0.1
    Uninstalling joblib-1.0.1:


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\joblib-1.0.1.dist-info\\direct_url.json'
Consider using the `--user` option or check the permissions.

