# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score
import joblib
from sklearn.base import TransformerMixin

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor(TransformerMixin):

    def fit(self, df, y):
        self.df = df
        self.hour = self.df['timestamp'].dt.hour
        self.weekday = self.df['timestamp'].dt.weekday
        return self

    def transform(self, df):
        self.df['hour'] = self.hour
        self.df['weekday'] = self.weekday
        return self.df.drop(axis = 1, labels = ['timestamp'])

In [3]:
class MyOneHotEncoder(TransformerMixin):

    def __init__(self, target_name):
        self.target_name = target_name

    def fit(self, df, y):
        self.cat_f_list = list(set([c_name for c_name in df.columns if df[c_name].dtype == 'O']) - {self.target_name})
        self.df = df
        return self
        
    def transform(self, df):
        ohe = OneHotEncoder(sparse_output=False)
        r = ohe.fit_transform(df[self.cat_f_list])
        return pd.concat([df, pd.DataFrame(r, columns=ohe.get_feature_names_out(self.cat_f_list))], axis=1).drop(axis=1, labels=self.cat_f_list)


In [4]:
class TrainValidationTest(TransformerMixin):
    def fit(self, X, y):
        self.X = X
        self.y = y
        return self
    
    def transform(self, x):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=21, stratify=self.y)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [5]:
class tqdmGridSearchCV(GridSearchCV):
    def _run_search(self, evaluate_candidates):
        """Search all candidates in param_grid"""
        par = ParameterGrid(self.param_grid)
        for i in tqdm(par):
            evaluate_candidates([i])

In [14]:
class ModelSelection():
    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.choose_flag = False
    
    def choose(self, X_train, y_train, X_valid, y_valid):
        self.choose_flag = True
        best_val_acc = 0
        for num, gs in enumerate(self.grids):
            gs.__class__ = tqdmGridSearchCV
            print("Estimator:", self.grid_dict[num])
            gs.fit(X_train, y_train)
            print("Best params:", gs.best_params_)
            print("Best training accuracy: {:.3f}".format(gs.best_score_))
            val_score = accuracy_score(y_true=y_valid, y_pred=gs.best_estimator_.predict(X_valid))
            gs.val_score = val_score
            if val_score > best_val_acc:
                 best_est = self.grid_dict[num]
                 best_est_model = gs.best_estimator_
                 best_est_model.name = best_est
            print("Validation set accuracy score for best params: {:.3f}".format(val_score))
            print()
        print("Classifier with best validation set accuracy:", best_est)    
        print()
        return best_est_model
    
    def best_results(self):
        if self.choose_flag == False:
            print("You need to run choose() first") 
        else:
            model_list = [self.grid_dict[i] for i in range(0, len(self.grids))]
            params_list = [gs.best_params_ for gs in self.grids]
            val_list = [gs.val_score for gs in self.grids]

            return pd.DataFrame({'model':model_list,'params': params_list, 'valid_score':val_list})


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [19]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator
    
    def final_score(self, X_train, y_train, X_test, y_test):
        self.estimator.fit(X_train, y_train)
        print("Accuracy of the final model is", acc := accuracy_score(y_true=y_test, y_pred=self.estimator.predict(X_test)))
        return acc

    def save_model(self, path):
        try:
            joblib.dump(self.estimator, path)
            print("model was saved succesfully")
        except Exception:
            print(Exception)
       

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [8]:
df = pd.read_csv('../../data/checker_submits.csv', parse_dates=['timestamp'])

In [9]:
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])

In [10]:
data = preprocessing.fit_transform(df, 1)

In [11]:
tr = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = tr.fit_transform(data.drop(axis=1, labels=['weekday']), data['weekday'])

In [16]:
gs_svc = GridSearchCV(SVC(random_state=21, probability=True),
                  param_grid={'kernel': ['linear', 'rbf', 'sigmoid'],'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma':['scale', 'auto'], 'class_weight':['balanced', None]}, 
                  scoring='accuracy',
                  n_jobs=-1)
gs_tree = GridSearchCV(DecisionTreeClassifier(random_state=21), 
                  param_grid={'max_depth': range(1, 50), 'class_weight':['balanced', None], 'criterion':['entropy', 'gini']})
gs_forest = GridSearchCV(RandomForestClassifier(random_state=21), 
                  param_grid={'n_estimators':[5, 10, 50, 100],'max_depth': range(1, 50), 'class_weight':['balanced', None], 'criterion':['entropy', 'gini']})
ms = ModelSelection([gs_svc, gs_tree, gs_forest],{0: 'SVM', 1: 'Tree', 2: 'Forest'} )

In [17]:
best_model = ms.choose(X_train, y_train, X_test, y_test);

Estimator: SVM


  0%|          | 0/72 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Best training accuracy: 0.842
Validation set accuracy score for best params: 0.879

Estimator: Tree


  0%|          | 0/196 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 23}
Best training accuracy: 0.852
Validation set accuracy score for best params: 0.888

Estimator: Forest


  0%|          | 0/784 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 23, 'n_estimators': 50}
Best training accuracy: 0.897
Validation set accuracy score for best params: 0.911

Classifier with best validation set accuracy: Forest



In [20]:
ms.best_results()

Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.878698
1,Tree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.887574
2,Forest,"{'class_weight': 'balanced', 'criterion': 'ent...",0.911243


In [22]:
f = Finalize(best_model)
fin_score = f.final_score(X_train, y_train, X_test, y_test)
f.save_model("{:}_{:.3f}.sav".format(best_model.name, fin_score))

Accuracy of the final model is 0.9112426035502958
model was saved succesfully
