In [1]:
#Data loading and imports
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Importing the datasets using the paths provided
sales_daily = pd.read_csv('/Users/arka_bagchi/Desktop/Springboard/pharma_sales_data/salesdaily.csv')

In [2]:
# Convert 'datum' column to datetime format
sales_daily['datum'] = pd.to_datetime(sales_daily['datum'])

In [3]:
sales_daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2106 entries, 0 to 2105
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datum         2106 non-null   datetime64[ns]
 1   M01AB         2106 non-null   float64       
 2   M01AE         2106 non-null   float64       
 3   N02BA         2106 non-null   float64       
 4   N02BE         2106 non-null   float64       
 5   N05B          2106 non-null   float64       
 6   N05C          2106 non-null   float64       
 7   R03           2106 non-null   float64       
 8   R06           2106 non-null   float64       
 9   Year          2106 non-null   int64         
 10  Month         2106 non-null   int64         
 11  Hour          2106 non-null   int64         
 12  Weekday Name  2106 non-null   object        
dtypes: datetime64[ns](1), float64(8), int64(3), object(1)
memory usage: 214.0+ KB


In [4]:
# Creating dummy variables for the 'Weekday Name' column
weekday_dummies = pd.get_dummies(sales_daily['Weekday Name'], prefix='Weekday')

# Concatenate the original DataFrame and the dummy DataFrame
sales_daily = pd.concat([sales_daily, weekday_dummies], axis=1)

# Dropping the original 'Weekday Name' column as it's now redundant
sales_daily.drop('Weekday Name', axis=1, inplace=True)

# Displaying the first few rows of the modified dataset
sales_daily.head()


Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,2014-01-02,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,0,0,0,0,1,0,0
1,2014-01-03,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,1,0,0,0,0,0,0
2,2014-01-04,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,0,0,1,0,0,0,0
3,2014-01-05,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,0,0,0,1,0,0,0
4,2014-01-06,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,0,1,0,0,0,0,0


In [5]:
# Selecting columns to be standardized
cols_to_scale = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06', 'Year', 'Month', 'Hour']

# Initialize the scaler
scaler = StandardScaler()

# Standardize the selected columns
sales_daily[cols_to_scale] = scaler.fit_transform(sales_daily[cols_to_scale])

# Displaying the first few rows of the standardized dataset
sales_daily.head()


Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,2014-01-02,-1.839172,-0.105883,-0.201574,0.159291,-0.330752,-0.543156,-0.857645,-0.372716,-1.442473,-1.578269,-14.185153,0,0,0,0,1,0,0
1,2014-01-03,1.083812,0.048841,0.217987,1.326911,1.275165,-0.543156,2.254126,0.455359,-1.442473,-1.578269,0.027477,1,0,0,0,0,0,0
2,2014-01-04,-1.108426,-1.357741,1.099065,2.048654,0.204553,-0.543156,0.542652,-0.786753,-1.442473,-1.578269,0.027477,0,0,1,0,0,0,0
3,2014-01-05,-0.37768,-0.42002,1.308845,0.717439,-0.152317,-0.543156,-0.390879,-1.20079,-1.442473,-1.578269,0.027477,0,0,0,1,0,0,0
4,2014-01-06,-0.012307,-1.357741,0.259943,-0.527167,1.275165,1.287124,0.075886,-0.372716,-1.442473,-1.578269,0.027477,0,1,0,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split

# Dropping the 'datum' column as it won't be used in model training
X = sales_daily.drop(['datum', 'M01AE'], axis=1)  # Also drop 'M01AE' from features to avoid data leakage

# Using 'M01AE' as the target variable
y = sales_daily['M01AE']

# Splitting the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1684, 17), (422, 17), (1684,), (422,))

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Training the Linear Regression model on the updated dataset with 'M01AE' as the target variable
lr_m01ae = LinearRegression()
lr_m01ae.fit(X_train, y_train)

# Predicting on the test data
y_pred_m01ae = lr_m01ae.predict(X_test)

# Calculating the evaluation metrics
mae_m01ae = mean_absolute_error(y_test, y_pred_m01ae)
rmse_m01ae = np.sqrt(mean_squared_error(y_test, y_pred_m01ae))

mae_m01ae, rmse_m01ae


(0.7367184837810551, 0.9256635780902757)

In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Initialize the Decision Tree Regressor
dt_reg = DecisionTreeRegressor()

# Hyperparameters to be tuned
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Using GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=dt_reg, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Best parameters from the grid search
best_params = grid_search.best_params_

# Predict on the test data using the best model
y_pred_dt = grid_search.best_estimator_.predict(X_test)

# Calculate and display the evaluation metrics for Decision Tree Regressor
mae_dt = mean_absolute_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))

best_params, mae_dt, rmse_dt


Fitting 5 folds for each of 45 candidates, totalling 225 fits


({'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5},
 0.757716202513481,
 0.9723872178303138)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor
rf_reg = RandomForestRegressor()

# Hyperparameters to be tuned for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Using GridSearchCV for hyperparameter tuning of Random Forest
grid_search_rf = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, 
                              scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Fit the model to the training data
grid_search_rf.fit(X_train, y_train)

# Best parameters from the grid search for Random Forest
best_params_rf = grid_search_rf.best_params_

# Predict on the test data using the best Random Forest model
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)

# Calculate and display the evaluation metrics for Random Forest Regressor
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

best_params_rf, mae_rf, rmse_rf


Fitting 5 folds for each of 108 candidates, totalling 540 fits


({'max_depth': 5,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.7300367838779914,
 0.9260756776005638)