# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

KeyboardInterrupt: 

## 1. Preprocessing pipeline

Create three custom transformers; the first two will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [None]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace the `timestamp` column with `hour` and `weekday` features.

    The transformer is stateless: `fit` learns nothing and `transform`
    always returns a new DataFrame, leaving its input unmodified.
    """

    def fit(self, X, y=None):
        """No-op fit; returns `self` so the class works as a pipeline step."""
        return self

    def transform(self, X):
        """Return a new frame with `timestamp` swapped for `hour`/`weekday`.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a `timestamp` column parseable by `pd.to_datetime`.

        Returns
        -------
        pandas.DataFrame
            `X` without `timestamp`, plus `hour` (from `.dt.hour`) and
            `weekday` (from `.dt.dayofweek`).
        """
        # Parse into a local Series rather than assigning back into X, so the
        # caller's DataFrame is not mutated as a side effect of transform().
        stamps = pd.to_datetime(X['timestamp'])
        return X.drop('timestamp', axis=1).assign(
            hour=stamps.dt.hour,
            weekday=stamps.dt.dayofweek,
        )
        
# Load the raw submissions and show the first rows after feature extraction.
raw_submits = pd.read_csv('../data/checker_submits.csv')
df = FeatureExtractor().fit_transform(raw_submits)
print(df.head())

      uid   labname  numTrials  hour  weekday
0  user_4  project1          1     5        4
1  user_4  project1          2     5        4
2  user_4  project1          3     5        4
3  user_4  project1          4     5        4
4  user_4  project1          5     5        4


In [None]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every object-dtype column except the target column.

    Parameters
    ----------
    targ : str
        Name of the target column; it is never encoded, even when it has
        object dtype, and it stays in the returned DataFrame.
    """

    def __init__(self, targ):
        self.targ = targ

    def fit(self, X, y=None):
        """Record which columns to encode and the categories seen in each.

        `y` is accepted for scikit-learn API compatibility and is ignored.
        """
        self.cols = [
            col for col in X.select_dtypes(include=['object']).columns
            if col != self.targ
        ]
        # Remember the categories seen at fit time. pd.get_dummies is
        # stateless, so without this a later transform() on data with fewer
        # or different values would silently produce a different column set.
        self.categories_ = {
            col: pd.Categorical(X[col]).categories for col in self.cols
        }
        return self

    def transform(self, X):
        """Return a copy of `X` with the fitted columns replaced by dummies.

        Values not seen during `fit` get an all-False row for that feature;
        categories seen during `fit` but absent from `X` still get a column.
        """
        X = X.copy()
        for col in self.cols:
            # Pin the category set so get_dummies emits the fitted columns.
            X[col] = pd.Categorical(X[col], categories=self.categories_[col])
        return pd.get_dummies(X, columns=self.cols)
    
# Preview: with 'uid' as the target, only the other object columns are encoded.
uid_target_preview = MyOneHotEncoder('uid').fit_transform(df)
print(uid_target_preview.head())

# Actual encoding used below: 'weekday' is the target.
df = MyOneHotEncoder('weekday').fit_transform(df)
print(df.head())

      uid  numTrials  hour  weekday  labname_code_rvw  labname_lab02  \
0  user_4          1     5        4             False          False   
1  user_4          2     5        4             False          False   
2  user_4          3     5        4             False          False   
3  user_4          4     5        4             False          False   
4  user_4          5     5        4             False          False   

   labname_lab03  labname_lab03s  labname_lab05s  labname_laba04  \
0          False           False           False           False   
1          False           False           False           False   
2          False           False           False           False   
3          False           False           False           False   
4          False           False           False           False   

   labname_laba04s  labname_laba05  labname_laba06  labname_laba06s  \
0            False           False           False            False   
1            Fal

In [None]:
class TrainValidationTest():
    """Stratified train / validation / test split of `X` and `y`.

    Parameters
    ----------
    X, y
        Features and target, passed straight to `train_test_split`.
    test_size : float, default 0.2
        Share of the full data held out as the test set.
    valid_size : float, default 0.25
        Share of the *remaining* data held out as the validation set
        (0.25 of the remaining 80% is 20% of the full data).
    random_state : int, default 21
        Seed used for both splits.

    The split runs once on construction, so the `X_train`, `X_valid`,
    `X_test`, `y_train`, `y_valid` and `y_test` attributes are available
    immediately; `split()` recomputes and returns them.
    """

    def __init__(self, X, y, test_size=0.2, valid_size=0.25, random_state=21):
        self.X = X
        self.y = y
        self.test_size = test_size
        self.valid_size = valid_size
        self.random_state = random_state
        self.split()

    def split(self):
        """Split the stored data and return
        `(X_train, X_valid, X_test, y_train, y_valid, y_test)`."""
        # First carve off the test set, stratifying on the full target.
        X_rest, self.X_test, y_rest, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size,
            random_state=self.random_state, stratify=self.y)

        # Then divide what is left into train and validation.
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            X_rest, y_rest, test_size=self.valid_size,
            random_state=self.random_state, stratify=y_rest)

        return (self.X_train, self.X_valid, self.X_test,
                self.y_train, self.y_valid, self.y_test)
    
   
# Separate the target from the features, then make the three-way split.
y = df['weekday']
X = df.drop(columns='weekday')
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest(X, y).split()


## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [None]:
class ModelSelection:
    """Fit several grid searches and pick the best one on a validation set.

    Parameters
    ----------
    grids : list
        Grid-search objects exposing `fit`, `score`, `best_params_` and
        `best_score_`.
    grid_dict : dict
        Maps each index in `grids` to a human-readable model name.
    """

    # Column order of the frame returned by best_results().
    _RESULT_COLUMNS = ['model', 'params', 'valid_score']

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.best_models = []

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Fit every grid on the training data and score it on validation.

        Prints a short report per model and returns the name of the model
        with the highest validation score (the first one wins on ties), or
        `None` when `grids` is empty.
        """
        # Reset so that calling choose() again does not pile up duplicate
        # rows in best_results().
        self.best_models = []
        # -inf rather than 0 so a winner is reported even if every
        # validation score is 0.
        best_score = float('-inf')
        best_model_name = None

        for idx, grid in enumerate(self.grids):
            model_name = self.grid_dict[idx]
            print(f"Estimator: {model_name}")
            grid.fit(X_train, y_train)
            best_params = grid.best_params_
            best_train_score = grid.best_score_
            valid_score = grid.score(X_valid, y_valid)
            print(f"Best params: {best_params}")
            print(f"Best training accuracy: {best_train_score:.3f}")
            print(f"Validation set accuracy score for best params: {valid_score:.3f}\n")

            self.best_models.append({
                'model': model_name,
                'params': best_params,
                'valid_score': valid_score
            })

            if valid_score > best_score:
                best_score = valid_score
                best_model_name = model_name

        print(f"Classifier with best validation set accuracy: {best_model_name}")
        return best_model_name

    def best_results(self):
        """Return one row per model: `model`, `params`, `valid_score`.

        The columns are present even when `choose()` has not run yet.
        """
        return pd.DataFrame(self.best_models, columns=self._RESULT_COLUMNS)

# Number of parallel workers handed to every grid search.
jobs = -1

# Hyper-parameter grids, one per model family.
svm_params = [{
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ('balanced', None),
    'random_state': [21],
    'probability': [True],
}]
tree_params = [{
    'criterion': ('gini', 'entropy'),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [21],
}]
rf_params = [{
    'n_estimators': [100, 200, 300],
    'criterion': ('gini', 'entropy'),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [21],
}]

# Settings shared by all three searches.
search_options = {'scoring': 'accuracy', 'cv': 2, 'n_jobs': jobs}
gs_svm = GridSearchCV(estimator=SVC(), param_grid=svm_params, **search_options)
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, **search_options)
gs_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, **search_options)

grids = [gs_svm, gs_tree, gs_rf]
grid_dict = dict(enumerate(['SVM', 'Decision Tree', 'Random Forest']))

# Fit everything, report the winner, then show the per-model summary.
model_selection = ModelSelection(grids, grid_dict)
best_model = model_selection.choose(X_train, y_train, X_valid, y_valid)
print(model_selection.best_results())

Estimator: SVM


Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.752
Validation set accuracy score for best params: 0.855

Estimator: Decision Tree
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 21}
Best training accuracy: 0.802
Validation set accuracy score for best params: 0.864

Estimator: Random Forest
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 21}
Best training accuracy: 0.860
Validation set accuracy score for best params: 0.881

Classifier with best validation set accuracy: Random Forest
           model                                             params  \
0            SVM  {'C': 10, 'class_weight': None, 'gamma': 'auto...   
1  Decision Tree  {'criterion': 'gini', 'max_depth': None, 'min_...   
2  Random Forest  {'criterion': 'gini', 'm

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [None]:
class Finalize():
    """Evaluate a chosen estimator on the test set and persist it.

    Parameters
    ----------
    estimator
        An already-fitted estimator exposing `predict`.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        """Print and return the estimator's accuracy on the test set.

        `X_train` and `y_train` are part of the required signature but are
        not used: the estimator is scored as given, without refitting.
        """
        acc = accuracy_score(y_test, self.estimator.predict(X_test))
        print(f"Accuracy of the final model is {acc}")
        return acc

    def save_model(self, path):
        """Dump the estimator to `path` with joblib and confirm on stdout."""
        joblib.dump(self.estimator, path)
        print(f"Model was successfully saved to {path}")
        
# final = Finalize(gs_rf.best_estimator_)
# final.final_score(X_train, y_train, X_test, y_test)

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('weekday'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [None]:
# Location of the raw submissions, relative to the notebook.
file_name = '../data/checker_submits.csv'
df = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# Feature extraction runs first; the encoder then treats 'weekday' as the target.
preprocessing_steps = [
    ('feature_extractor', FeatureExtractor()),
    ('onehot_encoder', MyOneHotEncoder('weekday')),
]
preprocessing = Pipeline(steps=preprocessing_steps)

In [None]:
# Run both preprocessing steps on the raw frame; the result still contains the
# 'weekday' target column, which is split off in the next cell.
data = preprocessing.fit_transform(df)

In [None]:
# Target first, then every remaining column as features.
y = data['weekday']
X = data.drop(columns='weekday')

In [None]:
# Stratified train / validation / test split of the preprocessed data.
splitter = TrainValidationTest(X, y)
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split()

In [None]:
# Number of parallel workers handed to every grid search.
jobs = -1

# Hyper-parameter grids, one per model family.
svm_params = [{
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ('balanced', None),
    'random_state': [21],
    'probability': [True],
}]
tree_params = [{
    'criterion': ('gini', 'entropy'),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [21],
}]
rf_params = [{
    'n_estimators': [100, 200, 300],
    'criterion': ('gini', 'entropy'),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [21],
}]

# Settings shared by all three searches.
shared_search_kwargs = dict(scoring='accuracy', cv=2, n_jobs=jobs)
gs_svm = GridSearchCV(estimator=SVC(), param_grid=svm_params, **shared_search_kwargs)
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, **shared_search_kwargs)
gs_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, **shared_search_kwargs)

grids = [gs_svm, gs_tree, gs_rf]
grid_dict = dict(enumerate(['SVM', 'Decision Tree', 'Random Forest']))

In [None]:
# Bundle the searches with their display names.
model_selection = ModelSelection(grids=grids, grid_dict=grid_dict)

In [None]:
# Fit all grids on the training split and compare them on the validation split.
best_model = model_selection.choose(
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

Estimator: SVM
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.752
Validation set accuracy score for best params: 0.855

Estimator: Decision Tree
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 21}
Best training accuracy: 0.802
Validation set accuracy score for best params: 0.864

Estimator: Random Forest
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300, 'random_state': 21}
Best training accuracy: 0.860
Validation set accuracy score for best params: 0.881

Classifier with best validation set accuracy: Random Forest


In [None]:
# NOTE(review): gs_rf is hard-coded here rather than looked up from
# `best_model`; it matches the winner printed by choose() above.
final = Finalize(estimator=gs_rf.best_estimator_)
acc = final.final_score(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Accuracy of the final Model is 0.908284023668639


In [None]:
# File name follows name_of_the_model_{test accuracy}.sav.
model_path = f'../data/Random_Forest_{acc:.3f}.sav'
final.save_model(model_path)

In [None]:
# Rebuild the file name from the accuracy measured above instead of
# hard-coding "0.908", so the reload still finds the file if the score changes.
saved_path = f'../data/Random_Forest_{acc:.3f}.sav'
# joblib.load unpickles the file: only load model files you produced yourself.
model = joblib.load(saved_path)
final = Finalize(model)
acc = final.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final Model is 0.908284023668639
