# ML Project - Fall 2021  
---
Javad Hezare  
Ali Abbasi  
---
In this project, we are going to predict customers' behavior when clicking on an advertisement; whether they buy the product or not.

In [None]:
import pandas as pd
import numpy as np
# import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

Importing dataset:

In [None]:
# df = pd.read_csv('https://github.com/a80-abbasi/ML_Project/blob/main/train_dataset.csv?raw=true')
df = pd.read_csv('train_dataset.csv')
df.head()

For better results, we separate the training set and the validation set from the beginning:

In [None]:
train_data, val_data = train_test_split(df, test_size=0.2)
train_data.shape

In [None]:
# for convenience
data = train_data

---
# 1. EDA, Data Cleaning and Feature Engineering

In this part we are going to get some insights about our data and find out what information each variable gives us. Then we start preparing our data for modeling by cleaning it and deciding on our policy for missing data. To do so, we will follow the steps below:
1. Understanding variables
2. Analyzing and Visualizing relationships between variables
3. Deciding what we should do with missing data
4. Choosing which features of data we are going to use to train our model

Inspecting datatypes of each feature:

In [None]:
data.dtypes

Inspecting the count of each unique value in each column:

In [None]:
from IPython.display import display
for col in data.columns:
    display(data[col].value_counts().to_frame())

_'click_timestamp'_ seems to contain only two dates:

In [None]:
data['click_timestamp'].apply(lambda x: x.split(' ')[0]).unique()

We were right. So we convert this column into 2 columns: one indicating the day and one indicating the number of seconds since the start of the day, both as integer values:

In [None]:
def get_seconds(time_str):
  """Return the time-of-day part of a '<date> <HH:MM:SS>' string as seconds since midnight."""
  # The string must contain exactly one space separating the date from the clock.
  _date, clock = time_str.split(' ')
  fields = [int(part) for part in clock.split(':')]
  return 3600 * fields[0] + 60 * fields[1] + fields[2]

def get_day(time_str):
  """Return the last '-'-separated field of the date part of a '<date> <time>' string as an int."""
  # The string must contain exactly one space separating the date from the clock.
  date_part, _clock = time_str.split(' ')
  day_field = date_part.rsplit('-', 1)[-1]
  return int(day_field)

# Derive the two columns with one apply each instead of building a pd.Series
# per row and merging it back on the index: same columns, same order
# ('click_day' then 'click_second'), far less per-row overhead.
data = data.assign(
    click_day=data['click_timestamp'].apply(get_day),
    click_second=data['click_timestamp'].apply(get_seconds),
)

In [None]:
data.drop(columns=['click_timestamp'], errors='ignore', inplace=True)
data.head()

Splitting columns to 'numerical' and 'categorical' values:

In [None]:
target = 'Sale'
numerical_cols = ['click_second', 'click_day', 'nb_clicks_1week', 'product_price', 'SalesAmountInEuro', 'time_delay_for_conversion']
# Everything that is neither numerical nor the target is treated as categorical.
# Keep the frame's own column order: a plain set difference would make the order
# (and therefore the layout of every feature matrix built from it) depend on
# string hashing, which can change from one interpreter session to the next.
_non_categorical = set(numerical_cols) | {target}
categorical_cols = [col for col in data.columns if col not in _non_categorical]

Inspecting correlation matrix for numerical data:

In [None]:
data[[target] + numerical_cols].corr()

Three of the columns seem suspicious. _'SalesAmountInEuro'_, _'time_delay_for_conversion'_ and _'product_price'_ have a relatively high correlation with _'Sale'_ and seem to have valid values whenever _'Sale'_ is one and vice versa. So we had better examine them more closely and decide whether we should retain them or not. If our suspicion is right, we should delete those columns and train our model on the other columns. Otherwise, our model would rely almost entirely on these columns and would most probably perform very poorly on the test set.

In [None]:
def print_metrics_evaluation(true_y, pred_y, model_name=None):
    """Print accuracy, precision, recall and F1 of ``pred_y`` against ``true_y``.

    When ``model_name`` is given, it is printed as a heading line first.
    """
    if model_name is not None:
        print(f'{model_name}:')

    scorers = (
        ('accuracy_score', accuracy_score),
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
    )
    # All four scores are computed before anything is printed.
    report = [f'{label} = {scorer(true_y, pred_y)}' for label, scorer in scorers]
    # The trailing whitespace-only line keeps the report layout unchanged.
    print('\n'.join(report) + '\n    ')

In [None]:
# Treat "column holds a valid value" as a prediction of Sale and score it:
# near-perfect scores mean the column leaks the label.
SalesAmountInEuro_predict = data['SalesAmountInEuro'] != -1
time_delay_for_conversion_predict = data['time_delay_for_conversion'] != -1
product_price_predict = data['product_price'] > 0
print_metrics_evaluation(data['Sale'], SalesAmountInEuro_predict, 'SalesAmountInEuro')
print_metrics_evaluation(data['Sale'], time_delay_for_conversion_predict, 'time_delay_for_conversion')
# Argument order is (true_y, pred_y) like the two calls above; the reversed
# order would report precision and recall swapped for 'product_price'.
print_metrics_evaluation(data['Sale'], product_price_predict, 'product_price')

So for sure we must drop those columns. They are almost equivalent to label (Sale) and we can't use them in training. But we postpone judgment for _'product_price'_ to a short while later. 

In [None]:
data = data.drop(columns=['SalesAmountInEuro', 'time_delay_for_conversion'], errors='ignore')

def remove_from_list(lst, value):
    """Return the distinct items of ``lst`` that are not in ``value``, in their original order.

    Args:
        lst: iterable of hashable items (column names in this notebook).
        value: a single item, or a ``list`` of items, to remove.

    Returns:
        A new list; ``lst`` itself is not modified. Duplicates in ``lst`` are
        collapsed to their first occurrence.
    """
    to_remove = set(value if isinstance(value, list) else [value])
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable across runs (a set round-trip would order items by hash).
    return [item for item in dict.fromkeys(lst) if item not in to_remove]

numerical_cols = remove_from_list(numerical_cols, ['SalesAmountInEuro', 'time_delay_for_conversion'])

Describing numerical columns:

In [None]:
data[numerical_cols].describe()

As we can see, the gap between the 75th percentile and the maximum of _'nb_clicks_1week'_ is very large, so this column must contain some outliers; and in _'product_price'_, the missing-data marker (in this case 0) is the dominant value.

In [None]:
data[categorical_cols].nunique(axis=0)

As we can see, product_category(7) contains only one value, -1. It means none of our records has a product_category(7), so we can drop it.

In [None]:
data.drop(columns=['product_category(7)'], errors='ignore', inplace=True)
categorical_cols = remove_from_list(categorical_cols,'product_category(7)')

Describing categorical columns:

In [None]:
data[categorical_cols].describe()

Note that _'product_title'_ has nan values.  
We replace all invalid values (-1 everywhere, and 0 in _'product_price'_) with NaN.

In [None]:
data = data.replace([-1, '-1'], np.nan).replace({'product_price': 0}, np.nan)
data.head()

Examining count of not null values in each column:

In [None]:
data.count().to_frame().set_axis(['count'], axis=1)

In [None]:
# todo: converting values with low value_counts to 'other'

Now we map categorical values to integer values:

In [None]:
# Convert each categorical column to integer codes, remembering the category
# labels per column so the codes can be mapped back later.
column_categories = {}
for col in categorical_cols:
    as_category = data[col].astype('category')
    column_categories[col] = as_category.cat.categories
    data[col] = as_category.cat.codes
# Missing values come out of .cat.codes as -1; turn them back into NaN.
data = data.replace(-1, np.nan)

Plotting boxplot for numerical values in order to find outliers:

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for col, ax in zip(numerical_cols, axes.flatten()):
    ax.set_title(col)
    sns.boxplot(data=data, y=col, ax=ax)

Boxplots suggest a very small range for non-outlier values for nb_clicks_1week and product_price and if we remove outliers according to them, almost all of their value will be zero and we gain no information by using these features any more. So we choose to use 10000 and 2000 for their upper limits respectively.

In [None]:
nb_click_threshold = 10000
product_price_threshold = 2000
data.loc[(data['nb_clicks_1week'] > nb_click_threshold), 'nb_clicks_1week'] = np.nan
data.loc[(data['product_price'] > product_price_threshold), 'product_price'] = np.nan

Next, we drop rows and columns with too much NaNs:

In [None]:
col_threshold = 0.8
# Dropping categorical columns with missing value rate higher than threshold
categorical_null_rate = data[categorical_cols].isnull().mean()
dropping_columns = data[categorical_cols].columns[categorical_null_rate >= col_threshold]
# Filter in place of a set difference so the remaining columns keep their
# order; a set would reorder them by string hash and change the feature layout
# between sessions.
_dropped = set(dropping_columns)
categorical_cols = [col for col in categorical_cols if col not in _dropped]
data.drop(columns=dropping_columns, errors='ignore', inplace=True)

row_threshold = 0.6
# Dropping rows with missing value rate higher than threshold
data = data.loc[data.isnull().mean(axis=1) < row_threshold]
data.shape

So this process dropped $2$ columns and about $\frac{1}{4}$ of the rows.  
When it comes to dealing with missing values, we have three options for replacing NaN values in categorical columns.  
1. Replacing them with value that have maximum frequency in that column.  
2. Replacing them with median of their columns.
3. Introducing a new category, i.e., 'other' for them.  


For the second option, there should be an order between values, and here we don't have any order (even in ordinal features like product_age_group, we can't deduce their order because the strings are hashed). And because of the very high missing rate in our dataset, the first option alone doesn't seem very appropriate either.  
Hence, we are going to use a combination of options 1 and 3. We will set NaN values to the value that has the maximum frequency in that column and create another column which indicates whether this value was initially NaN or not.

In [None]:
# Left in place because other cells in this notebook assign into frame slices.
pd.options.mode.chained_assignment = None
# Work on an explicit copy and assign the filled result back: an in-place
# fillna on a slice (or on `clean_df[col]`) is a chained assignment that may
# only update a temporary and leave `clean_df` untouched.
clean_df = data[numerical_cols].copy()
numerical_cols_medians = clean_df.median()
clean_df = clean_df.fillna(numerical_cols_medians)
categorical_cols_idmaxes = {}
for col in categorical_cols:
    # Most frequent value of the column, used to impute its missing entries.
    idxmax_value = data[col].mode()[0]
    categorical_cols_idmaxes[col] = idxmax_value
    clean_df[col] = data[col].fillna(idxmax_value)
    # 0/1 flag recording which rows were missing before imputation.
    clean_df[f'{col}_is_na'] = data[col].isna() * 1
clean_df.head()

Normalizing data:

In [None]:
saved_mean = clean_df.mean()
saved_std = clean_df.std()
clean_df = (clean_df - saved_mean) / saved_std
clean_df

Some of the 'is_na' columns had zero variance (i.e., only one unique value) and became NaN after normalization. So we can drop them without losing any information:  

In [None]:
nan_is_nan_cols = clean_df.columns[saved_std == 0]
clean_df.drop(columns=nan_is_nan_cols, errors='ignore', inplace=True)
clean_df

Generating final correlation matrix and its heatmap:

In [None]:
plt.figure(figsize = (15,15))
correlation_matrix = pd.concat([data[target], clean_df], axis=1).corr()
sns.heatmap(correlation_matrix)
correlation_matrix

We trained our model once without dropping '_product_price_' and, as we had predicted, we got an F1 score almost equal to 1, which is unrealistic. So this time we will train our model without this column:

In [None]:
clean_df.drop(columns=['product_price'], errors='ignore', inplace=True)

For convenience, we define a function that takes a dataframe and performs all of the above steps on it. This function will be helpful for preprocessing our validation (and, of course, test) set.

In [None]:
def data_preprocess(data, dropping_columns, numerical_median, categories_mode, saved_mean, saved_std, ce):
    """Apply the fitted preprocessing steps to a raw frame and return the cleaned frame.

    Steps: split 'click_timestamp' into 'click_day' / 'click_second', drop
    ``dropping_columns``, turn the missing-value markers and outliers into NaN,
    impute, target-encode the categorical columns and scale the numerical ones.

    Args:
        data: raw frame containing at least 'click_timestamp' and
            'nb_clicks_1week'. It is not modified.
        dropping_columns: column names to remove (missing ones are ignored).
        numerical_median: per-column fill values for numerical NaNs.
        categories_mode: mapping from categorical column name to its fill value.
        saved_mean: per-column means used for scaling (pandas Series).
        saved_std: per-column standard deviations used for scaling (pandas Series).
        ce: fitted encoder whose ``transform(frame)`` returns the encoded
            categorical columns.

    Returns:
        The processed frame. The 'Sale' column is appended last when the input
        contains it.
    """
    target = 'Sale'

    # Work on a copy so the caller's frame keeps 'click_timestamp' and can be
    # passed through this function again.
    data = data.copy()

    # convert 'click_timestamp' to ['click_day', 'click_second']
    data['click_day'] = data['click_timestamp'].apply(get_day)
    data['click_second'] = data['click_timestamp'].apply(get_seconds)
    data = data.drop(columns=['click_timestamp'], errors='ignore')

    # drop columns
    data = data.drop(columns=dropping_columns, errors='ignore')

    # replace missing values and outliers with nan
    data = data.replace([-1, '-1'], np.nan).replace({'product_price': 0}, np.nan)
    data.loc[(data['nb_clicks_1week'] > nb_click_threshold), 'nb_clicks_1week'] = np.nan

    # The time features and the target are left out of imputation and scaling.
    not_scaled = [target, 'click_day', 'click_second']
    numerical_cols = (
        data.select_dtypes(include='number')
        .drop(columns=not_scaled, errors='ignore')
        .columns.tolist()
    )
    categorical_cols = data.select_dtypes(exclude='number').columns.tolist()

    # errors='ignore' lets unlabeled data (no 'Sale' column) pass through.
    clean_df = data.drop(columns=[target], errors='ignore')

    # fill nan of numerical columns
    clean_df[numerical_cols] = clean_df[numerical_cols].fillna(numerical_median)

    # fill nan of categorical columns; assign the result back, because an
    # in-place fillna on `clean_df[col]` may only modify a temporary
    for col in categorical_cols:
        clean_df[f'{col}_is_na'] = data[col].isna() * 1
        clean_df[col] = clean_df[col].fillna(categories_mode[col])

    # encode categories from the imputed frame, so the mode fill above is what
    # gets encoded (encoding the raw frame would discard it)
    clean_df[categorical_cols] = ce.transform(clean_df[categorical_cols])

    # normalize numerical features; align the statistics to exactly the columns
    # being scaled so extra entries cannot add columns to the result
    aligned_mean = saved_mean.reindex(numerical_cols)
    aligned_std = saved_std.reindex(numerical_cols)
    clean_df[numerical_cols] = (clean_df[numerical_cols] - aligned_mean) / aligned_std

    # add target column to dataframe
    if target in data.columns:
        clean_df[target] = data[target]

    return clean_df


# 2. Model Training

Now we will define different models and train them with our training set and evaluate our model with validation set.

## 2.1 Neural Network Model

We will define and train a neural network model using pytorch as we learned during the course and evaluate metrics (especially $F1$ score) on it.  
We will track all of our hyperparameters, metric values, learning curves and ... with mlflow in every run.

In [None]:
# !pip install mlflow
# import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import mlflow
import mlflow.pytorch
import pickle

from typing import Tuple
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# for using GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

We define a simple neural network model consisting of multiple Linear layers, ReLU activation functions and Dropout to prevent overfitting on the training data.  

In [None]:
class SalePrediction(nn.Module):
    """Fully connected network that maps a feature vector to a single logit.

    Hidden widths are 256, 128, 64 and 32; each hidden layer is followed by
    ReLU and one shared ``nn.Dropout(p=0.6)`` module.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.dropout = nn.Dropout(p=0.6)

        widths = (self.input_size, 256, 128, 64, 32)
        stack = []
        # Build (Linear, ReLU, Dropout) triples for consecutive width pairs.
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), self.dropout])
        # Output layer: one raw logit per sample.
        stack.append(nn.Linear(32, 1))
        self.lnn = nn.Sequential(*stack)

    def forward(self, x: torch.tensor):
        # Flatten whatever batch shape comes in to (rows, input_size).
        flat = x.view(-1, self.input_size)
        return self.lnn(flat)

Defining a simple Dataset class for our data:

In [None]:
class SalePredictionDataset(Dataset):
    """Dataset pairing feature rows with labels, both held as torch tensors."""

    def __init__(self, X: np.ndarray, Y: np.ndarray):
        # from_numpy wraps the arrays as tensors.
        self.X, self.Y = torch.from_numpy(X), torch.from_numpy(Y)

    def __len__(self) -> int:
        # Number of samples is the size of the first feature dimension.
        return self.X.size(0)

    def __getitem__(self, i: int) -> Tuple[torch.Tensor, ...]:
        features = self.X[i]
        label = self.Y[i]
        return features, label

Creating Dataset and Dataloader from our training and validation data:

In [None]:
# Training arrays: labels from `data`, features from the cleaned frame.
Y_train = data[target].to_numpy(dtype='float32')
X_train = clean_df.to_numpy(dtype='float32')

train_set = SalePredictionDataset(X_train, Y_train)
# train_size = int(0.8 * len(data_set))
# val_size = len(data_set) - train_size
# train_set, val_set = torch.utils.data.random_split(data_set, (train_size, val_size))
# NOTE(review): data_preprocess is declared with seven required parameters
# (data, dropping_columns, numerical_median, categories_mode, saved_mean,
# saved_std, ce) but is called here with the frame only, so this line raises
# TypeError as written — confirm which fitted statistics/encoder to pass.
processed_val_data = data_preprocess(val_data)
Y_val = val_data[target].to_numpy(dtype='float32')
# NOTE(review): data_preprocess appends the target column to the frame it
# returns, so X_val appears to carry 'Sale' as a feature — verify and drop it
# if so.
X_val = processed_val_data.to_numpy(dtype='float32')
val_set = SalePredictionDataset(X_val, Y_val)

# Only the training loader shuffles; validation batches keep their order.
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=32, shuffle=False)

In [None]:
def train(model, criterion, optimizer, epoch):
    """Run one optimisation pass over the module-level ``train_loader``.

    Logs the sample-weighted mean loss to MLflow as 'train_loss' and returns it.
    ``epoch`` is only used for the progress-bar label.
    """
    sample_count = len(train_loader.dataset)
    running_loss = 0

    model.train()
    with tqdm.tqdm(enumerate(train_loader), total=len(train_loader)) as progress:
        for _, (features, labels) in progress:
            optimizer.zero_grad()

            features = features.to(device)
            # Labels become a column vector to match the model output shape.
            labels = labels.view(-1, 1).to(device)
            batch_loss = criterion(model(features), labels)

            # Weight by batch size so the final figure is a per-sample mean.
            running_loss += batch_loss.item() * len(features)
            progress.set_description(f'Epoch:{epoch}, Train Loss: {running_loss / sample_count:.3e}')

            batch_loss.backward()
            optimizer.step()

    mean_loss = running_loss / sample_count
    mlflow.log_metric('train_loss', mean_loss)
    return mean_loss


def validate(model, criterion, epoch):
    """Evaluate the model on the module-level ``val_loader`` without gradients.

    Logs the sample-weighted mean loss to MLflow as 'val_loss' and returns it.
    ``epoch`` is only used for the progress-bar label.
    """
    sample_count = len(val_loader.dataset)
    running_loss = 0

    model.eval()
    with torch.no_grad(), tqdm.tqdm(enumerate(val_loader), total=len(val_loader)) as progress:
        for _, (features, labels) in progress:
            features = features.to(device)
            # Labels become a column vector to match the model output shape.
            labels = labels.view(-1, 1).to(device)

            batch_loss = criterion(model(features), labels)
            # Weight by batch size so the final figure is a per-sample mean.
            running_loss += batch_loss.item() * len(features)

            progress.set_description(f'Epoch:{epoch}, Val Loss: {running_loss / sample_count:.3e}')

    print('-------------------------------------------------------------------')
    mean_loss = running_loss / sample_count
    mlflow.log_metric('val_loss', mean_loss)
    return mean_loss

Training model, tracking and logging hyperparameters and artifacts:

In [None]:
def do_expriment(lr, num_epochs):
    """Train a fresh ``SalePrediction`` network and record the run in MLflow.

    The network is sized from ``clean_df`` and trained with Adam on
    ``BCEWithLogitsLoss`` using the module-level loaders. After every epoch the
    weights are written to 'NNModel1.pt' if the validation loss did not get
    worse; the best weights are restored before the model is logged and scored.

    Args:
        lr: learning rate for Adam.
        num_epochs: number of train/validate passes.

    Returns:
        ``(train_loss_arr, val_loss_arr, model)`` — per-epoch losses as NumPy
        arrays of length ``num_epochs`` and the trained model.
    """
    model = SalePrediction(clean_df.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_loss_arr, val_loss_arr = np.zeros(num_epochs), np.zeros(num_epochs)

    with mlflow.start_run():
        mlflow.log_param('learning_rate', lr)
        mlflow.log_param('num_epochs', num_epochs)

        val_loss_min = float('inf')
        checkpoint_saved = False

        for epoch in range(num_epochs):
            train_loss = train(model, criterion, optimizer, epoch)
            val_loss = validate(model, criterion, epoch)

            train_loss_arr[epoch] = train_loss
            val_loss_arr[epoch] = val_loss

            if val_loss <= val_loss_min:
                torch.save(model.state_dict(), 'NNModel1.pt')
                val_loss_min = val_loss
                checkpoint_saved = True

        # Restore the best epoch's weights. Skip this when nothing was saved in
        # this run (zero epochs, or every validation loss was NaN); otherwise a
        # stale file from an earlier run would be loaded, or the load would fail.
        if checkpoint_saved:
            model.load_state_dict(torch.load('NNModel1.pt'))

        # log trained model
        print("\nLogging the trained model as a run artifact...")
        mlflow.pytorch.log_model(model, artifact_path="pytorch-model", pickle_module=pickle)
        print('Logging the trained model is done')

        # metrics
        X_train, Y_train = train_set[:]
        X_val, Y_val = val_set[:]

        # Score in eval mode (dropout off) and without autograd, so no graph is
        # built for the full train/validation tensors.
        model.eval()
        with torch.no_grad():
            # A logit >= 0 corresponds to a predicted probability >= 0.5.
            train_preds = (model(X_train.to(device)).view(-1) >= 0).cpu()
            val_preds = (model(X_val.to(device)).view(-1) >= 0).cpu()

        mlflow.log_metric('Train Precision', precision_score(Y_train, train_preds, average='macro'))
        mlflow.log_metric('Train Recall', recall_score(Y_train, train_preds, average='macro'))
        mlflow.log_metric('Train F1Score', f1_score(Y_train, train_preds))

        mlflow.log_metric('val Precision', precision_score(Y_val, val_preds, average='macro'))
        mlflow.log_metric('val Recall', recall_score(Y_val, val_preds, average='macro'))
        mlflow.log_metric('val F1Score', f1_score(Y_val, val_preds))

        return train_loss_arr, val_loss_arr, model

In [None]:
train_loss_arr, val_loss_arr, model = do_expriment(lr=1e-3, num_epochs=50)

In [None]:
# !zip -r mlruns.zip ./mlruns/
# !rm -rf mlruns

In [None]:
plt.plot(train_loss_arr, label='train')
plt.plot(val_loss_arr, label='val')
plt.legend();

In [None]:
# load the best model during epochs according to validation loss
model.load_state_dict(torch.load('NNModel1.pt'))
model = model.to('cpu')

In [None]:
# The module-level X_train / Y_train / X_val / Y_val are NumPy arrays, which
# have no .detach() and cannot be fed to the model, so take the tensors from
# the datasets instead.
train_features, train_labels = train_set[:]
val_features, val_labels = val_set[:]

# Dropout off and no autograd graph while scoring.
model.eval()
with torch.no_grad():
    train_hits = model(train_features).view(-1) >= 0
    val_hits = model(val_features).view(-1) >= 0

print_metrics_evaluation(train_labels.view(-1), train_hits, 'Metrics on Training Data')
print_metrics_evaluation(val_labels.view(-1), val_hits, 'Metrics on Validation Data')

## 2.2 XGBoost

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
Y = data[target].to_numpy(dtype='float32')
# 'product_price' has already been removed from clean_df in an earlier cell;
# errors='ignore' keeps this cell runnable either way instead of raising KeyError.
X = clean_df.drop(columns=['product_price'], errors='ignore').to_numpy(dtype='float32')
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)

D_train = xgb.DMatrix(X_train, label=Y_train)
D_val = xgb.DMatrix(X_val, label=Y_val)

In [None]:
param = {
    'eta': 0.3,
    'max_depth': 4,
    'objective': 'binary:logitraw'
}

step = 20

In [None]:
model = xgb.train(param, D_train, step)

In [None]:
# 'binary:logitraw' yields raw margins, so a positive margin means class 1.
preds = model.predict(D_val)
preds = preds > 0

# best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision = {}".format(precision_score(Y_val, preds, average='macro')))
print("Recall = {}".format(recall_score(Y_val, preds, average='macro')))
# This value is the F1 score, so label it as such rather than as accuracy.
print("F1 score = {}".format(f1_score(Y_val, preds)))

# 3. Deployment

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class Preproccess(BaseEstimator, TransformerMixin):
    """Pipeline step that learns preprocessing statistics and delegates the transform.

    ``fit`` computes the columns to drop, the imputation values, the target
    encoder and the scaling statistics; ``transform`` passes them, together
    with the frame, to the ``func`` callable given at construction.
    """

    def __init__(self, func):
        # func(df, dropping_columns, numerical_median, categories_mode,
        #      saved_mean, saved_std, ce) -> processed frame
        self.func = func
        self.nb_click_threshold = 10000
        self.product_price_threshold = 2000
        self.col_threshold = 0.8
        self.row_threshold = 0.6

    def fit(self, df, y=None):
        """Learn the preprocessing parameters from ``df`` and return ``self``.

        ``df`` must contain the 'Sale' target and 'nb_clicks_1week'. It is not
        modified: every step below works on a derived frame.
        """
        print('-----[start fitting preproccessore transformer to data]-----')

        # replace missing values with nan
        df = df.replace([-1, '-1'], np.nan).replace({'product_price': 0}, np.nan)
        df.loc[(df['nb_clicks_1week'] > self.nb_click_threshold), 'nb_clicks_1week'] = np.nan

        # store those columns we want to drop: sparse categorical columns plus
        # a fixed list of columns excluded from training
        categorical_cols = df.select_dtypes(exclude='number').columns.tolist()
        dropping_columns = df[categorical_cols].columns[df[categorical_cols].isnull().mean() >= self.col_threshold].tolist()
        dropping_columns += (['SalesAmountInEuro', 'time_delay_for_conversion', 'click_timestamp', 'product_category(7)', 'product_price'])

        # drop rows from df
        df = df.loc[df.isnull().mean(axis=1) < self.row_threshold]

        # copy df to store mean, mode, std, and ... of data to use in transform();
        # errors='ignore' matches how the transform function drops these columns
        copy_df = df.drop(columns=dropping_columns, errors='ignore')

        target = 'Sale'
        numerical_cols = copy_df.select_dtypes(include='number').drop(columns=[target]).columns.tolist()
        categorical_cols = copy_df.select_dtypes(exclude='number').columns.tolist()

        # numerical columns median and categorical columns mode
        numerical_median = copy_df[numerical_cols].median()
        categories_mode = dict(copy_df[categorical_cols].mode().loc[0])

        # save a TargetEncoder for encoding categories
        ce = MyTargetEncoder(target, categorical_cols)
        copy_df[categorical_cols] = ce.fit_transform(copy_df)

        # save mean and std of columns
        saved_mean = copy_df[numerical_cols].mean()
        saved_std = copy_df[numerical_cols].std()

        # drop those columns with std = 0
        constant_cols = copy_df[numerical_cols].columns[saved_std == 0].tolist()
        dropping_columns += constant_cols

        # Keep the stored statistics in step with the columns that survive:
        # entries for a dropped column would otherwise add an extra column when
        # the frame is scaled in transform().
        kept_cols = [col for col in numerical_cols if col not in constant_cols]
        numerical_median = numerical_median[kept_cols]
        saved_mean = saved_mean[kept_cols]
        saved_std = saved_std[kept_cols]

        # saving parameters to use in transform
        self.ce = ce
        self.dropping_columns = dropping_columns
        self.numerical_median = numerical_median
        self.categories_mode = categories_mode
        self.saved_mean = saved_mean
        self.saved_std = saved_std

        print('-----[fitting transformer to data is done!]-----')
        return self

    def transform(self, df, y=None):
        """Run ``func`` on ``df`` with the parameters learned in ``fit``."""
        return self.func(df, self.dropping_columns, self.numerical_median, self.categories_mode, self.saved_mean, self.saved_std, self.ce)


class MyTargetEncoder(BaseEstimator, TransformerMixin):
    """Fit one ``TargetEncoder`` per column and apply them column by column."""

    def __init__(self, target, columns):
        self.columns = columns
        self.target = target
        # column name -> fitted encoder, filled in by fit()
        self.encoders = {}

    def fit(self, df, y=None):
        """Fit an encoder for every configured column against ``df[self.target]``."""
        for col in self.columns:
            # NOTE(review): TargetEncoder is not imported anywhere in the
            # visible source (the category_encoders import is commented out) —
            # confirm which library provides it.
            ce = TargetEncoder()
            ce.fit(df[col], df[self.target])

            self.encoders[col] = ce

        return self

    def transform(self, df, y=None):
        """Return a new frame holding the encoded ``self.columns``.

        The input frame is left untouched; encoded values are written to a copy.
        """
        encoded = df[self.columns].copy()
        for col in self.columns:
            encoded[col] = self.encoders[col].transform(df[col])
        return encoded


class Model(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping the neural network: ``fit`` trains it, ``transform`` scores a frame."""

    def __init__(self, nn_type, target):
        self.nn_type = nn_type
        self.target = target

    def fit(self, df, y=None):
        """Train the network and keep it on ``self.model``."""
        # NOTE(review): `do_experiment` is not defined in the visible source;
        # the training function there is `do_expriment(lr, num_epochs)`, which
        # takes neither a network type nor a frame — confirm the intended helper.
        _, _, self.model = do_experiment(self.nn_type, df, lr=1e-4, num_epochs=2)
        return self

    def transform(self, df, y=None):
        """Return the model's raw outputs for every row of ``df``.

        The target column configured on this step is removed before the
        remaining columns are fed to the network.
        """
        # Use the target configured on this instance, not a same-named global.
        features = df.drop(columns=[self.target]).to_numpy(dtype=np.float32)
        X = torch.from_numpy(features).to(device)
        # Inference only: dropout off and no autograd graph.
        self.model.eval()
        with torch.no_grad():
            return self.model(X)
        

In [None]:
data = pd.read_csv('train_dataset.csv')

ml_pipeline = Pipeline([
    ('preproccess', Preproccess(data_preprocess)),
    ('model', Model(SalePrediction, 'Sale'))
])

ml_pipeline.fit_transform(data)
# ml_pipeline.transform(data)

In [None]:
!mlflow models build-docker \
  -m "C:\Users\LEGION\Desktop\MLproj\testing\mlruns\0\cc7b702a14a240f69cdcbbb0d25fa8c4\artifacts\pytorch-model" \
  -n "my-docker-image" \
  --enable-mlserver

In [None]:
# The training function defined in this notebook is spelled `do_expriment`;
# `do_experiment` does not exist and would raise NameError.
train_arr, val_arr, model = do_expriment(lr=1e-4, num_epochs=2)