# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
  accuracy_score, 
  precision_score, 
  recall_score, 
)
from sklearn.ensemble import (
  VotingClassifier,
  BaggingClassifier,
  StackingClassifier,
  RandomForestClassifier
)
from tqdm.notebook import tqdm
import joblib

In [197]:
# Seed used as the default for the data splits and in the model parameter grids.
RANDOM_STATE = 21
# Default fraction of the data held out as the test set.
TEST_SIZE = 0.2

## 1. Preprocessing pipeline

Create three custom transformers, the first two of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


## FeatureExtractor Transformer

In [198]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns ``timestamp`` into time features.

    The input dataframe must contain a ``timestamp`` column that
    ``pd.to_datetime`` can parse. The output has two extra columns, ``hour``
    and ``dayofweek`` (weekday as a number), and no ``timestamp`` column.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new dataframe with the time features; ``X`` is not modified."""
        moments = pd.to_datetime(X['timestamp'])
        # assign() works on a copy, so the caller's dataframe stays untouched.
        enriched = X.assign(hour=moments.dt.hour, dayofweek=moments.dt.weekday)
        return enriched.drop(columns='timestamp')

## MyOneHotEncoder Transformer

In [199]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical feature columns and split off the target.

    Parameters
    ----------
    target_column : str
        Name of the column holding the target. It is never encoded, even if
        it is categorical, and it is returned separately by ``transform``.

    Notes
    -----
    ``transform`` returns a ``(features, target)`` tuple rather than a single
    table. In this notebook the encoder is therefore used as the last step of
    the preprocessing pipeline.
    """

    def __init__(self, target_column):
        self.target_column = target_column
        self.encoder = None        # fitted OneHotEncoder, set by fit()
        self.cat_features = None   # names of the encoded columns, set by fit()

    @staticmethod
    def _is_categorical(column):
        """Tell whether a column holds categorical (non-numeric label) data."""
        dtype = column.dtype
        # Besides plain ``object`` columns, also accept pandas ``category``
        # and dedicated string dtypes; otherwise such columns would silently
        # pass through unencoded.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def fit(self, X, y=None):
        """Find the categorical feature columns and fit the encoder on them.

        Returns ``self``. ``X`` is only read, never modified.
        """
        self.cat_features = [
            col for col in X.columns
            if col != self.target_column and self._is_categorical(X[col])
        ]
        if self.cat_features:
            self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            self.encoder.fit(X[self.cat_features])
        else:
            # Do not keep an encoder from an earlier fit on different data.
            self.encoder = None
        return self

    def transform(self, X):
        """Encode the categorical columns and separate the target.

        Returns
        -------
        tuple
            ``(features, target)``: the dataframe without the original
            categorical columns and without the target column, and the target
            series.

        Raises
        ------
        KeyError
            If ``target_column`` is not present in ``X``.
        """
        X = X.copy()
        y = X[self.target_column]  # separating target

        if self.cat_features:
            encoded = self.encoder.transform(X[self.cat_features])
            encoded_df = pd.DataFrame(
                encoded,
                columns=self.encoder.get_feature_names_out(self.cat_features),
                index=X.index,  # keep row alignment with the untouched columns
            )
            X = pd.concat([X.drop(columns=self.cat_features), encoded_df], axis=1)

        return X.drop(columns=[self.target_column]), y

## TrainValidationTest Transformer

In [200]:
class TrainValidationTest:
    """Stratified three-way splitter: train, validation and test sets.

    ``test_size`` is the fraction of all rows held out for testing.
    ``valid_size`` is the fraction of the *remaining* rows used for
    validation, so the defaults (0.2 and 0.25) give a 60/20/20 split.
    """

    def __init__(self, test_size=TEST_SIZE, valid_size=0.25, random_state=RANDOM_STATE):
        self.random_state = random_state
        self.valid_size = valid_size
        self.test_size = test_size

    def _stratified_split(self, features, target, holdout):
        """Split off a ``holdout`` fraction, stratified by ``target``."""
        return train_test_split(
            features, target,
            test_size=holdout,
            stratify=target,
            random_state=self.random_state,
        )

    def split(self, X, y):
        """Return ``X_train, X_valid, X_test, y_train, y_valid, y_test``."""
        # Carve out the test set first, then take validation from the rest.
        X_rest, X_test, y_rest, y_test = self._stratified_split(X, y, self.test_size)
        X_train, X_valid, y_train, y_valid = self._stratified_split(
            X_rest, y_rest, self.valid_size
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict whose keys are the indexes in that list and whose values are the names of the models. The example below is written in reverse order (from the high-level call down to the low-level details):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [201]:
class ModelSelection:
    """Fit several grid searches and pick the winner on a validation set."""

    def __init__(self, grids, grid_dict):
        """
        grids: list of GridSearchCV instances
        grid_dict: dict {index: model_name}
        """
        self.grids = grids
        self.grid_dict = grid_dict
        # One {'model', 'params', 'valid_score'} record per grid, filled by
        # the most recent choose() call.
        self.results = []

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Fit every grid and return the name of the best model.

        Each grid search is fitted on the training data; its best estimator
        is then scored with accuracy on the validation data. The model with
        the highest validation accuracy wins (the first one on ties).

        Returns the winning model name, or None when ``grids`` is empty.
        """
        # Start from a clean slate so that calling choose() again does not
        # append duplicate rows to the results table.
        self.results = []
        best_score = -1
        best_model_name = None

        for i, gs in enumerate(self.grids):
            model_name = self.grid_dict[i]

            print(f"\nEstimator: {model_name}")

            # NOTE(review): this bar only walks the entries of the param_grid
            # container and finishes at once; it does not follow the progress
            # of gs.fit() below.
            for _ in tqdm(gs.param_grid, desc=f"Tuning {model_name}"):
                pass

            gs.fit(X_train, y_train)

            best_params = gs.best_params_
            best_train_acc = gs.best_score_  # best cross-validated score on the training data

            y_pred = gs.best_estimator_.predict(X_valid)
            valid_acc = accuracy_score(y_valid, y_pred)

            print(f"Best params: {best_params}")
            print(f"Best training accuracy: {best_train_acc:.3f}")
            print(f"Validation set accuracy score for best params: {valid_acc:.3f}\n")

            self.results.append({
                'model': model_name,
                'params': best_params,
                'valid_score': valid_acc
            })

            if valid_acc > best_score:
                best_score = valid_acc
                best_model_name = model_name

        print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return best_model_name

    def best_results(self):
        """Return the last choose() results as a DataFrame.

        The columns are always ``model``, ``params``, ``valid_score``, even
        when no grid has been evaluated yet.
        """
        return pd.DataFrame(self.results, columns=['model', 'params', 'valid_score'])

## 3. Finalization

  `Finalize()` class
  - Takes an estimator.
  - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
  ```
  final.final_score(X_train, y_train, X_test, y_test)
  Accuracy of the final model is 0.908284023668639
  ```
  - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [202]:
class Finalize:
    """Fit the chosen estimator, report its test accuracy and save it."""

    def __init__(self, estimator):
        # estimator: classifier or pipeline; final_score() fits it again.
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        """Fit on the training data, print and return the test accuracy."""
        model = self.estimator
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy of the final model is {accuracy:.5f}")
        return accuracy

    def save_model(self, path):
        """Dump the estimator to ``path`` with joblib and confirm on stdout."""
        joblib.dump(self.estimator, path)
        print(f"Model was successfully saved to {path}")

## 4. Main program

1. Load the data from the file (`checker_submits.csv`).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

## Load the Data

In [203]:
# Relative path of the CSV file with the raw submissions data.
NAME_OF_FILE = '../data/checker_submits.csv'

In [204]:
# Read the raw CSV into a dataframe and show it as the cell output.
df = pd.read_csv(NAME_OF_FILE)
df

Unnamed: 0,uid,labname,numTrials,timestamp
0,user_4,project1,1,2020-04-17 05:19:02.744528
1,user_4,project1,2,2020-04-17 05:22:45.549397
2,user_4,project1,3,2020-04-17 05:34:24.422370
3,user_4,project1,4,2020-04-17 05:43:27.773992
4,user_4,project1,5,2020-04-17 05:46:32.275104
...,...,...,...,...
1681,user_19,laba06s,9,2020-05-21 20:01:48.959966
1682,user_1,laba06s,6,2020-05-21 20:18:54.487900
1683,user_1,laba06s,7,2020-05-21 20:19:06.872761
1684,user_1,laba06s,8,2020-05-21 20:22:41.877806


## Preprocessing Pipeline

In [205]:
# Step 1 derives the time features; step 2 one-hot encodes the categorical
# columns and sets `dayofweek` aside as the target.
feature_step = ('feature_extractor', FeatureExtractor())
encoding_step = ('onehot_encoder', MyOneHotEncoder(target_column='dayofweek'))
preprocessing = Pipeline(steps=[feature_step, encoding_step])

## Fit & Transform

In [206]:
# The last pipeline step returns a (features, target) pair, so the result
# unpacks into X and y; X is shown as the cell output.
X, y = preprocessing.fit_transform(df)
X

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [207]:
# Show the target series (the `dayofweek` column split off by the encoder).
y

0       4
1       4
2       4
3       4
4       4
       ..
1681    3
1682    3
1683    3
1684    3
1685    3
Name: dayofweek, Length: 1686, dtype: int32

## Split Train/Valid/Test sets

In [208]:
# Split with the class defaults (test_size=TEST_SIZE, valid_size=0.25,
# random_state=RANDOM_STATE), stratified by the target.
splitter = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X, y)

## Model Selection

In [209]:
# Parameter grids for each model family; every grid pins random_state so the
# searches are reproducible.

# Support vector classifier grid.
svm_params = [{
  'kernel': ('linear', 'rbf', 'sigmoid'),
  'C': [0.01, 0.1, 1, 5, 10],
  'gamma': ['scale', 'auto'],
  'class_weight': ('balanced', None),
  'random_state': [RANDOM_STATE],
  'probability': [True]
}]

# Decision tree grid.
tree_params = [{
  'criterion': ['gini', 'entropy'],
  'max_depth': [10, 15, 20, 22],
  'class_weight': ('balanced', None),
  'random_state': [RANDOM_STATE]
}]

# Random forest grid: same tree settings plus the number of trees.
rf_params = [{
  'criterion': ['gini', 'entropy'],
  'max_depth': [10, 15, 20, 22],
  'n_estimators': [50, 100],
  'class_weight': ('balanced', None),
  'random_state': [RANDOM_STATE]
}]

In [210]:
# One GridSearchCV per model family: accuracy scoring, 2-fold CV, all cores.
gs_svm = GridSearchCV(SVC(), svm_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_tree = GridSearchCV(DecisionTreeClassifier(), tree_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_rf = GridSearchCV(RandomForestClassifier(), rf_params, scoring='accuracy', cv=2, n_jobs=-1)


# grid_dict maps each position in `grids` to the name used in the reports.
grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0: 'SVM', 1: 'Decision Tree', 2: 'Random Forest'}

# Fit every grid on the training split and compare them on the validation split.
selector = ModelSelection(grids, grid_dict)
best_model_name = selector.choose(X_train, y_train, X_valid, y_valid)


# Table with the best parameters and validation score of each model family.
df_results = selector.best_results()
print("\n=== Best results ===")
display(df_results)


Estimator: SVM


Tuning SVM:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.752
Validation set accuracy score for best params: 0.855


Estimator: Decision Tree


Tuning Decision Tree:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'random_state': 21}
Best training accuracy: 0.802
Validation set accuracy score for best params: 0.869


Estimator: Random Forest


Tuning Random Forest:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 22, 'n_estimators': 100, 'random_state': 21}
Best training accuracy: 0.856
Validation set accuracy score for best params: 0.884


Classifier with best validation set accuracy: Random Forest

=== Best results ===


Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.854599
1,Decision Tree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.869436
2,Random Forest,"{'class_weight': None, 'criterion': 'gini', 'm...",0.884273


## Finalize best model

In [213]:
# grid_dict maps index -> name, so search it by value to find the position of
# the winning grid search in `grids`.
best_index = next(
    index for index, name in grid_dict.items() if name == best_model_name
)
best_estimator = grids[best_index].best_estimator_

# final_score() fits the estimator on the training split again and then
# scores it on the held-out test split.
final = Finalize(best_estimator)
acc = final.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final model is 0.90533


In [215]:
# File name: model name with spaces replaced by underscores, followed by the
# test accuracy rounded to 5 decimal places, with the .sav extension.
filename = f"../data/{best_model_name.replace(' ', '_')}_{round(acc,5)}.sav"
final.save_model(filename)

Model was successfully saved to ../data/Random_Forest_0.90533.sav
