# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
  train_test_split,
  GridSearchCV,
  ParameterGrid,
)

from sklearn.base import (
  BaseEstimator,
  TransformerMixin,
)

from sklearn.preprocessing import (
  OneHotEncoder
)

from sklearn.pipeline import (
  Pipeline,
)

from sklearn.tree import (
  DecisionTreeClassifier,
)

from sklearn.svm import (
  SVC
)

from sklearn.metrics import (
  accuracy_score,
)


from sklearn.ensemble import (
  RandomForestClassifier,
)

from typing import (
  List,
)

from tqdm.notebook import tqdm

import joblib

from contextlib import contextmanager

## 1. Preprocessing pipeline

Create three custom classes; the first two are transformers that will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [231]:
class FeatureExtractor(TransformerMixin, BaseEstimator):
  """Stateless transformer that turns `timestamp` into calendar features.

  `transform` adds an `hour` column and a numeric `dayofweek` column taken
  from the `timestamp` column, then removes `timestamp` itself.
  """

  def __init__(self):
    pass

  def fit(self, X=None, y=None):
    """Learn nothing and return the transformer itself."""
    return self

  def transform(self, X=None, y=None):
    """Return a copy of `X` with `hour`/`dayofweek` added and `timestamp` dropped.

    `X` must contain a datetime-like `timestamp` column (the `.dt` accessor
    is used); the caller's frame is left untouched.
    """
    frame = X.copy()
    stamps = frame['timestamp'].dt
    enriched = frame.assign(hour=stamps.hour, dayofweek=stamps.weekday)
    return enriched.drop(columns='timestamp')

In [232]:
class MyOneHotEncoder(OneHotEncoder):
  """One-hot encode the categorical feature columns and split off the target.

  Columns of dtype `category` or `object` are treated as categorical. The
  target column is never encoded, even when it is categorical itself.

  Parameters
  ----------
  target_column : str
      Name of the column that `transform` returns separately as the target.
  """

  def __init__(self, target_column: str):
    # The dense-output switch was renamed from `sparse` to `sparse_output`
    # in scikit-learn 1.2 and the old name was removed in 1.4, so try the
    # new spelling first and fall back for older installations.
    try:
      super().__init__(sparse_output=False)
    except TypeError:
      super().__init__(sparse=False)
    self.target_column = target_column

  def _select_categories(self, X: pd.DataFrame):
    """Return the categorical columns of `X`, excluding the target column."""
    categories = X.select_dtypes(include=['category', 'object'])
    return categories.drop(columns=self.target_column, errors='ignore')

  def fit(self, X, y=None):
    """Fit the underlying encoder on the categorical feature columns of `X`."""
    return super().fit(self._select_categories(X))

  def transform(self, X):
    """Return `(features, target)` for the frame `X`.

    `features` holds the non-categorical columns followed by the one-hot
    columns; `target` is the `target_column` series. `X` is not modified.
    """
    target = X[self.target_column].copy()
    features = X.drop(columns=self.target_column)
    subset = self._select_categories(features)

    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; use whichever this version provides.
    names_fn = getattr(self, 'get_feature_names_out', None) or self.get_feature_names
    encoded_df = pd.DataFrame(
      super().transform(subset),
      columns=names_fn(input_features=subset.columns),
      # Reuse the input index: concat(axis=1) aligns on index, so a fresh
      # RangeIndex would misalign rows (and add NaNs) for any frame whose
      # index is not 0..n-1.
      index=features.index,
    )

    return pd.concat([features.drop(columns=subset.columns), encoded_df], axis=1), target
  

In [233]:
class TrainValidationTest():
  """Split `X`/`y` into stratified train, validation and test parts.

  Iterating over an instance yields, in order,
  `X_train, X_valid, X_test, y_train, y_valid, y_test`, so the object can be
  unpacked directly. Both splits use `test_size=0.2` and `random_state=21`:
  the test part is cut from the full data first, then the validation part is
  cut from the remaining training data.
  """

  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __iter__(self):
    # First split: hold out the test set, keeping class proportions of y.
    X_train, X_test, y_train, y_test = train_test_split(
      self.X,
      self.y,
      test_size=0.2,
      random_state=21,
      stratify=self.y
    )
    # Second split: carve the validation set out of the remaining data.
    X_train, X_valid, y_train, y_valid = train_test_split(
      X_train,
      y_train,
      test_size=0.2,
      random_state=21,
      stratify=y_train
    )

    return iter((X_train, X_valid, X_test, y_train, y_valid, y_test))


# Backward-compatible alias for the original, misspelled class name.
TraninValidationTest = TrainValidationTest

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [234]:
class ModelSelection():
  """Fit several grid searches and pick the model that scores best on validation data.

  Parameters
  ----------
  grids : list of GridSearchCV
      The searches to run, one per model family.
  grid_dict : dict
      Maps the position of each search in `grids` to the parameter grid it
      explores; it is only used to size the progress bar.

  Attributes
  ----------
  results : list of tuple
      One `(grid index, best params, best cross-validated score)` entry per
      fitted search, in the order of `grids`.
  """

  def __init__(self, grids : List[GridSearchCV], grid_dict):
    self.grids = grids
    self.grid_dict = grid_dict
    self.results = []
    # Validation-set accuracy of each fitted search, keyed by grid index.
    self._valid_scores = {}

  @staticmethod
  def _estimator_name(grid):
    """Return the class name of the estimator wrapped by `grid`."""
    return grid.get_params(deep=False)['estimator'].__class__.__name__

  @contextmanager
  def _tqdm_joblib(self, tqdm_object):
    """Temporarily patch joblib so every finished batch advances `tqdm_object`."""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
      def __call__(self, *args, **kwargs):
        tqdm_object.update(n=self.batch_size)
        return super().__call__(*args, **kwargs)

    old_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
      yield
    finally:
      # Always restore the original callback, even if fitting fails.
      joblib.parallel.BatchCompletionCallBack = old_callback

  def _custom_search(self, X_train, X_valid, y_train, y_valid):
    """Fit every search on the training data and record its scores."""
    # enumerate gives the position directly and, unlike list.index, stays
    # correct when the same search object appears twice in `grids`.
    for grid_idx, grid in enumerate(self.grids):
      params = self.grid_dict[grid_idx]

      # One task per parameter combination and CV fold; 5 is used when `cv` is unset.
      total_tasks = len(ParameterGrid(params)) * (grid.cv if grid.cv else 5)
      estimator_name = self._estimator_name(grid)
      with tqdm(total=total_tasks, desc=f'Estimator: {estimator_name}', leave=True) as pbar:
        with self._tqdm_joblib(pbar):
          grid.fit(X_train, y_train)

      best_score = grid.best_score_
      best_params = grid.best_params_
      valid_score = accuracy_score(y_valid, grid.predict(X_valid))
      self.results.append((grid_idx, best_params, best_score))
      self._valid_scores[grid_idx] = valid_score

      print(f"Best params: {best_params}\n"
            f"Best training accuracy: {best_score:.3f}\n"
            f"Validation set accuracy score for best params: {valid_score:.3f}\n")

  def choose(self, X_train, X_valid, y_train, y_valid):
    """Return the class name of the estimator with the best validation accuracy.

    The searches are fitted on the first call only; later calls reuse the
    stored results. Note the argument order: both feature sets come first,
    then both targets.
    """
    if not self.results:
      self._custom_search(X_train, X_valid, y_train, y_valid)

    # Rank by accuracy on the validation set, not by the cross-validated
    # training score, so the choice matches the message printed below.
    best_idx = max(self._valid_scores, key=self._valid_scores.get)
    best_estimator = self._estimator_name(self.grids[best_idx])

    print(f'Classifier with best validation set accuracy: {best_estimator}')

    return best_estimator

  def best_results(self):
    """Return a frame with the best parameters and score of each model family.

    Columns are `model`, `params` and `valid_score`, sorted by `valid_score`
    in descending order. `valid_score` holds each search's `best_score_`,
    i.e. its best cross-validated score.
    """
    models = [self._estimator_name(self.grids[entry[0]]) for entry in self.results]

    result = pd.DataFrame(self.results, columns=['model','params', 'valid_score'])
    result['model'] = models
    result.sort_values('valid_score', ascending=False, inplace=True)

    return result


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [235]:
class Finalize():
  """Fit the chosen estimator, report its test accuracy and persist it.

  Parameters
  ----------
  estimator
      An unfitted estimator exposing `fit` and `predict`.
  """

  def __init__(self, estimator):
    self.estimator = estimator

  def final_score(self, X_train, y_train, X_test, y_test):
    """Fit on the training data, then print and return the test accuracy."""
    self.estimator.fit(X_train, y_train)
    score = accuracy_score(y_test, self.estimator.predict(X_test))

    print(f'Accuracy of the final model is {score}')
    return score

  def save_model(self, path):
    """Dump the estimator to `path` with joblib and print the outcome."""
    saved_files = joblib.dump(self.estimator, path)

    # joblib.dump returns the list of files it wrote and raises on failure;
    # that list may hold more than one name, so any non-empty result means
    # the model was saved.
    print(f'model {"saved successfully" if saved_files else "NOT SAVED"}')

## 4. Main program

1. Load the data from the file `checker_submits.csv`.
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [236]:
# Load the submissions log; `timestamp` is parsed into datetimes at read time,
# which FeatureExtractor needs for its `.dt` accessor.
df = pd.read_csv('../../datasets/checker_submits.csv', parse_dates=['timestamp'])
df

Unnamed: 0,uid,labname,numTrials,timestamp
0,user_4,project1,1,2020-04-17 05:19:02.744528
1,user_4,project1,2,2020-04-17 05:22:45.549397
2,user_4,project1,3,2020-04-17 05:34:24.422370
3,user_4,project1,4,2020-04-17 05:43:27.773992
4,user_4,project1,5,2020-04-17 05:46:32.275104
...,...,...,...,...
1681,user_19,laba06s,9,2020-05-21 20:01:48.959966
1682,user_1,laba06s,6,2020-05-21 20:18:54.487900
1683,user_1,laba06s,7,2020-05-21 20:19:06.872761
1684,user_1,laba06s,8,2020-05-21 20:22:41.877806


In [237]:
# Two-step preprocessing: derive hour/dayofweek from the timestamp, then
# one-hot encode the categorical features with `dayofweek` as the target column.
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])

In [238]:
# The last pipeline step returns a pair, so `data` is (features, target).
data = preprocessing.fit_transform(df)

In [239]:
# Unpack (features, target) into the splitter and iterate it to get the six parts.
X_train, X_valid, X_test, y_train, y_valid, y_test = TraninValidationTest(*data)

In [240]:
# Hyper-parameter grids, one per model family.
svm_params = {
  'kernel': ['linear', 'rbf', 'sigmoid'],
  'C': [0.01, 0.1, 1, 1.5, 5, 10],
  'gamma': ['scale', 'auto'],
  'class_weight': ['balanced', None],
  'random_state': [21],
  'probability': [True],
}

tree_params = {
  'max_depth': range(1, 49),
  'class_weight': ['balanced', None],
  'criterion': ['gini', 'entropy'],
  'random_state': [21],
}

forest_params = {
  'n_estimators': [5, 10, 50, 100],
  'max_depth': range(1, 49),
  'class_weight': ['balanced', None],
  'criterion': ['entropy', 'gini'],
  'random_state': [21],
}

# One accuracy-scored search per family, using all available cores.
gs_svm = GridSearchCV(estimator=SVC(), param_grid=svm_params, scoring='accuracy', n_jobs=-1, verbose=0)
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, scoring='accuracy', n_jobs=-1, verbose=0)
gs_forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=forest_params, scoring='accuracy', n_jobs=-1, verbose=0)

grids = [gs_svm, gs_tree, gs_forest]

# Position of each search in `grids` -> the parameter grid it explores.
grid_dict = dict(enumerate((svm_params, tree_params, forest_params)))

In [241]:
# Bundle the searches with their parameter grids (the grids size the progress bars).
selector = ModelSelection(grids, grid_dict)

In [242]:
# Run all searches; note that `choose` takes both feature sets first, then both targets.
best_model_name = selector.choose(X_train,X_valid, y_train, y_valid);

Estimator: SVC:   0%|          | 0/360 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.842
Validation set accuracy score for best params: 0.878



Estimator: DecisionTreeClassifier:   0%|          | 0/960 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 26, 'random_state': 21}
Best training accuracy: 0.853
Validation set accuracy score for best params: 0.867



Estimator: RandomForestClassifier:   0%|          | 0/3840 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 35, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.898
Validation set accuracy score for best params: 0.904

Classifier with best validation set accuracy: RandomForestClassifier


In [243]:
# Best parameters and score per model family, best score first.
best_results = selector.best_results()
best_results

Unnamed: 0,model,params,valid_score
2,RandomForestClassifier,"{'class_weight': 'balanced', 'criterion': 'ent...",0.897916
1,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.853389
0,SVC,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.842261


In [244]:
# Look-up from estimator class name (as returned by `choose`) to the class itself.
models_dict = {
  model_cls.__name__: model_cls
  for model_cls in (SVC, DecisionTreeClassifier, RandomForestClassifier)
}

In [245]:
# Build a fresh, unfitted estimator of the winning class from its best parameters.
best_model = models_dict[best_model_name](**best_results.set_index('model').loc[best_model_name,'params'])
best_model

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=35, n_estimators=50, random_state=21)

In [246]:
# Wrap the chosen estimator for final fitting, scoring and saving.
final = Finalize(best_model)

In [247]:
# Fit on the training split and measure accuracy on the held-out test split.
accuracy = final.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final moddel is 0.9171597633136095


In [248]:
# Persist the fitted model; the file name embeds the test accuracy after the
# literal prefix `name_of_the_model_`.
final.save_model(f'name_of_the_model_{accuracy}.sav')

model saved successfully
