<a href="https://colab.research.google.com/github/Zaman-SE/Machine-Learning/blob/main/Approaching_Machine_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

install and import required libraries

In [None]:
!pip install numpy pandas matplotlib plotly seaborn --quiet

In [None]:
!pip install jovian opendatasets scikit-learn --upgrade --quiet

In [None]:
import os
import jovian
import matplotlib
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'


# Downloading Data

In [None]:
od.download('https://www.kaggle.com/c/rossmann-store-sales')

In [None]:
os.listdir('rossmann-store-sales')

In [None]:
# Relative path (same directory that os.listdir inspects above) so the notebook
# also runs outside Colab's /content working directory.
ross_df = pd.read_csv('rossmann-store-sales/train.csv')

In [None]:
ross_df

In [None]:
# Relative path so the notebook also runs outside Colab's /content working directory.
store_df = pd.read_csv('rossmann-store-sales/store.csv')

In [None]:
store_df

merge the two dataframes

In [None]:
# validate='many_to_one' makes pandas raise if store_df ever contains a duplicated
# Store key, which would otherwise silently multiply the training rows.
merged_df = ross_df.merge(store_df, how='left', on='Store', validate='many_to_one')

In [None]:
merged_df

In [None]:
merged_df.shape

In [None]:
# Relative path so the notebook also runs outside Colab's /content working directory.
test_df = pd.read_csv('rossmann-store-sales/test.csv')

In [None]:
test_df

In [None]:
# validate='many_to_one' makes pandas raise if store_df ever contains a duplicated
# Store key, which would otherwise silently multiply the test rows and break the
# one-prediction-per-row alignment the submission relies on.
merged_test_df = test_df.merge(store_df, how='left', on='Store', validate='many_to_one')

In [None]:
merged_test_df

# Cleaning Data

In [None]:
merged_df.info()

In [None]:
merged_df.describe()

In [None]:
round(merged_df.describe().T, 2)

In [None]:
int(merged_df.duplicated().sum())

parse the date column

In [None]:
merged_df['Date'] = pd.to_datetime(merged_df.Date)

In [None]:
merged_df['Date']

In [None]:
merged_test_df['Date'] = pd.to_datetime(merged_test_df.Date)

In [None]:
merged_test_df['Date']

In [None]:
merged_df.Date.min(), merged_df.Date.max()

In [None]:
merged_test_df.Date.min(), merged_test_df.Date.max()

# Exploratory Data Analysis

study the distribution of the target 'Sales' column

In [None]:
sns.histplot(data=merged_df, x='Sales')

Why are there 0 sales on so many dates?

In [None]:
merged_df.Open.value_counts()

In [None]:
# Count rows with zero sales directly: unlike value_counts()[0], this returns 0
# instead of raising KeyError when no such rows exist.
int((merged_df.Sales == 0).sum())

exclude the dates where store was closed

In [None]:
merged_df = merged_df[merged_df.Open==1].copy()

In [None]:
sns.histplot(data=merged_df, x='Sales')

In [None]:
plt.figure(figsize=(18, 8))
# Plot a subsample to keep the scatter readable; the fixed seed makes the figure
# reproducible, and min() keeps the cell working on frames with < 40000 rows.
temp_df = merged_df.sample(min(40000, len(merged_df)), random_state=42)
sns.scatterplot(x=temp_df.Sales, y=temp_df.Customers, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title('Sales vs Customers')
plt.show()

In [None]:
plt.figure(figsize=(18, 8))
# Plot a subsample to keep the scatter readable; the fixed seed makes the figure
# reproducible, and min() keeps the cell working on frames with < 40000 rows.
temp_df = merged_df.sample(min(40000, len(merged_df)), random_state=42)
sns.scatterplot(x=temp_df.Store, y=temp_df.Sales, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title('Store vs Sales')
plt.show()

In [None]:
sns.barplot(data=merged_df, x='DayOfWeek', y='Sales')

In [None]:
sns.barplot(data=merged_df, x='Promo', y='Sales')

In [None]:
corr_sales = merged_df.select_dtypes(include='number').corr()['Sales']
corr_sales.sort_values(ascending=False)

# Feature Engineering

In [None]:
merged_df['Day'] = merged_df.Date.dt.day
merged_df['Month'] = merged_df.Date.dt.month
merged_df['Year'] = merged_df.Date.dt.year

In [None]:
merged_test_df['Day'] = merged_test_df.Date.dt.day
merged_test_df['Month'] = merged_test_df.Date.dt.month
merged_test_df['Year'] = merged_test_df.Date.dt.year

In [None]:
jovian.commit()

Train/Test/Validation Split

In [None]:
len(merged_df)

In [None]:
train_size = int(.75*len(merged_df))
train_size

In [None]:
sorted_df = merged_df.sort_values('Date')

In [None]:
train_df, val_df = sorted_df[:train_size], sorted_df[train_size:]

In [None]:
len(train_df), len(val_df)

In [None]:
train_df

In [None]:
train_df.Date.min(), train_df.Date.max()

In [None]:
val_df.Date.min(), val_df.Date.max()

In [None]:
merged_test_df.Date.min(), merged_test_df.Date.max()

In [None]:
train_df

In [None]:
train_df.columns

Input and target columns

In [None]:
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'StoreType', 'Assortment', 'Day', 'Month', 'Year']

In [None]:
target_col = ['Sales']

separate numerical and categorical columns

In [None]:
merged_df[input_cols].nunique()

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
train_inputs

In [None]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [None]:
test_inputs = merged_test_df[input_cols].copy()

In [None]:
numeric_cols = ['Store', 'Day', 'Month', 'Year']
categorical_cols = ['DayOfWeek', 'Promo', 'StateHoliday', 'StoreType', 'Assortment']

# Imputation, Scaling, Encoding

impute missing data from numeric columns

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer(strategy='mean').fit(train_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

scale the values to the (0, 1) range

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

encode categorical columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
train_inputs[categorical_cols] = train_inputs[categorical_cols].astype(str)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
encoded_cols

In [None]:
for df in [train_inputs, val_inputs, test_inputs]:
    df[categorical_cols] = df[categorical_cols].astype(str)

train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

extract out the numeric data

In [None]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]


# Create quick and easy baseline models to benchmark future models

## Fixed/Random Guess

define a model that always returns the mean value of Sales as the prediction

In [None]:
def return_mean(inputs, mean_sales=None):
  """Baseline "model" that predicts the same constant for every row.

  Args:
    inputs: Any sized collection of rows; only ``len(inputs)`` is used.
    mean_sales: Constant to predict. When omitted, the mean of ``Sales`` over
      the training split (``train_df``) is used, so that validation rows do
      not leak into the baseline.

  Returns:
    1-D NumPy array of length ``len(inputs)`` filled with the constant.
  """
  if mean_sales is None:
    mean_sales = train_df.Sales.mean()
  return np.full(len(inputs), mean_sales)

In [None]:
train_preds = return_mean(X_train)

In [None]:
train_preds

evaluate this baseline using the RMSE score

In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(train_targets, train_preds)

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(val_targets, return_mean(X_val))

define a model that makes a random guess between the lowest and highest sales

In [None]:
def guess_random(inputs, lo=None, hi=None, seed=42):
  """Baseline "model" that predicts a uniform random value in ``[lo, hi)`` per row.

  Args:
    inputs: Any sized collection of rows; only ``len(inputs)`` is used.
    lo: Lower bound of the guesses. Defaults to the minimum ``Sales`` in the
      training split (``train_df``) so validation rows do not leak in.
    hi: Upper bound of the guesses. Defaults to the maximum ``Sales`` in the
      training split (``train_df``).
    seed: Seed for the random generator, so the baseline score is reproducible
      across runs. Pass ``None`` for non-deterministic guesses.

  Returns:
    1-D NumPy array of length ``len(inputs)``.
  """
  if lo is None:
    lo = train_df.Sales.min()
  if hi is None:
    hi = train_df.Sales.max()
  rng = np.random.default_rng(seed)
  return rng.random(len(inputs)) * (hi - lo) + lo

In [None]:
train_preds = guess_random(X_train)
train_preds

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(train_targets, train_preds)

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(val_targets, guess_random(X_val))

# Baseline ML Model

train a simple LinearRegression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, train_targets)

In [None]:
linreg.coef_

In [None]:
train_preds = linreg.predict(X_train)
train_preds

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(train_targets, train_preds)

In [None]:
val_preds = linreg.predict(X_val)
val_preds

In [None]:
# scikit-learn metrics take (y_true, y_pred), the same order try_model uses.
root_mean_squared_error(val_targets, val_preds)

define a function try_model which takes a model, trains it, and evaluates it on the training and validation sets

In [None]:
def try_model(model):
  """Fit ``model`` on the training split and score it on train and validation.

  Uses the notebook-level ``X_train``/``train_targets`` and
  ``X_val``/``val_targets``.

  Args:
    model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
      It is fitted in place.

  Returns:
    Tuple ``(train_rmse, val_rmse)``.
  """
  # The targets are one-column DataFrames; flatten them to 1-D. Several
  # estimators (e.g. SGDRegressor, RandomForestRegressor) otherwise emit a
  # DataConversionWarning, and linear models would return 2-D predictions.
  y_train = np.ravel(train_targets)
  y_val = np.ravel(val_targets)

  # Fit the model
  model.fit(X_train, y_train)

  # Generate predictions
  train_preds = model.predict(X_train)
  val_preds = model.predict(X_val)

  # Compute RMSE
  train_rmse = root_mean_squared_error(y_train, train_preds)
  val_rmse = root_mean_squared_error(y_val, val_preds)
  return train_rmse, val_rmse


# Linear Models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor

In [None]:
try_model(LinearRegression())

In [None]:
try_model(Ridge())

In [None]:
try_model(Lasso())

In [None]:
try_model(ElasticNet())

In [None]:
try_model(SGDRegressor())

# Tree Based Models

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [None]:
tree = DecisionTreeRegressor(random_state=42)

In [None]:
try_model(tree)

In [None]:
plt.figure(figsize=(40, 20))
plot_tree(tree, max_depth=3, filled=True, feature_names=numeric_cols+encoded_cols)

# Try a random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
%%time
rf=RandomForestRegressor(random_state=42, n_jobs=-1)
try_model(rf)

# Feature Importance

In [None]:
X_train.columns

In [None]:
rf.feature_importances_

In [None]:
# Pair every model input column with the importance the random forest gave it,
# then rank the features from most to least important.
importance_df = pd.DataFrame(
    {'feature': numeric_cols + encoded_cols, 'importance': rf.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df.head(10)

In [None]:
sns.barplot(data=importance_df.head(10), x='importance', y='feature')

# Looking at individual predictions

In [None]:
def predict_input(model, single_input):
    """Predict sales for a single store-day given as a mapping of raw values.

    The input goes through the same steps as the training data: date parts are
    derived from ``Date``, then the notebook-level fitted ``imputer``,
    ``scaler`` and ``encoder`` are applied before calling ``model.predict``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        single_input: Mapping (e.g. dict) holding ``Date`` plus the raw columns
            listed in ``numeric_cols`` and ``categorical_cols`` that are not
            derived from the date. ``Open`` is optional.

    Returns:
        ``0.`` when ``Open`` is 0 (store closed); otherwise the model's scalar
        prediction for the row.
    """
    # A missing 'Open' flag is treated as "open" instead of raising KeyError.
    if single_input.get('Open', 1) == 0:
        return 0.
    input_df = pd.DataFrame([single_input])
    input_df['Date'] = pd.to_datetime(input_df.Date)
    input_df['Day'] = input_df.Date.dt.day
    input_df['Month'] = input_df.Date.dt.month
    input_df['Year'] = input_df.Date.dt.year
    # Numeric preprocessing
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    # Categorical preprocessing
    input_df[categorical_cols] = input_df[categorical_cols].fillna('None')
    input_df[categorical_cols] = input_df[categorical_cols].astype(str)
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])
    X_input = input_df[numeric_cols + encoded_cols]
    # Flatten before indexing so a model fitted on a 2-D (n, 1) target, whose
    # predict() returns shape (1, 1), still yields a scalar rather than an array.
    pred = np.ravel(model.predict(X_input))[0]
    return pred

In [None]:
sample_input = {'Id': 1,
 'Store': 1,
 'DayOfWeek': 4,
 'Date': '2015-09-17 00:00:00',
 'Open': 1.0,
 'Promo': 1,
 'StateHoliday': '0',
 'SchoolHoliday': 0,
 'StoreType': 'c',
 'Assortment': 'a',
 'CompetitionDistance': 1270.0,
 'CompetitionOpenSinceMonth': 9.0,
 'CompetitionOpenSinceYear': 2008.0,
 'Promo2': 0,
 'Promo2SinceWeek': np.nan,
 'Promo2SinceYear': np.nan,
 'PromoInterval': np.nan}

sample_input

In [None]:
predict_input(rf, sample_input)

# Making a Kaggle Submission

In [None]:
test_preds = rf.predict(X_test)
test_preds

In [None]:
submission_df = pd.read_csv('/content/rossmann-store-sales/sample_submission.csv')

In [None]:
# The model was trained on open days only, so force 0 sales for rows where the
# store is closed (a missing 'Open' flag is treated as open), matching what
# predict_input does for a single row.
submission_df['Sales'] = test_preds * merged_test_df.Open.fillna(1).to_numpy()

In [None]:
# Replace any missing values in the submission with 0.
submission_df = submission_df.fillna(0)

In [None]:
submission_df.to_csv('submission.csv', index=None)

In [None]:
!head submission.csv

In [None]:
from IPython.display import FileLink

In [None]:
FileLink('submission.csv')