# Day 09. Exercise 04
# Pipelines and OOP

Create three custom transformers, the first two of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts the weekday from `timestamp` as a number (stored in the `dayofweek` column).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


## 0. Imports

In [211]:
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
import joblib
from tqdm.notebook import tqdm

## 1. Preprocessing pipeline

In [212]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace the ``timestamp`` column with ``hour`` and ``dayofweek``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    never modifies the frame it is given.
    """

    def fit(self, X, y=None):
        """Return the transformer unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a new frame with time features instead of ``timestamp``.

        ``hour`` is the hour of day and ``dayofweek`` is the numeric weekday
        as reported by pandas' ``.dt.weekday`` accessor.
        """
        moments = pd.to_datetime(X['timestamp'])
        without_timestamp = X.drop(columns=['timestamp'])
        return without_timestamp.assign(
            hour=moments.dt.hour,
            dayofweek=moments.dt.weekday,
        )
    

class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical feature columns and split off the target.

    Parameters
    ----------
    target_column : str
        Name of the column returned separately as the target. It is never
        one-hot encoded, even when it is categorical itself.
    """

    def __init__(self, target_column):
        self.target_column = target_column
        # Both attributes are filled in by fit().
        self.onehotcoder = None
        self.categ_cols = None

    def fit(self, X, y=None):
        """Remember the categorical feature columns and fit the encoder on them."""
        categorical = X.select_dtypes(include=['object', 'category']).columns
        self.categ_cols = [
            name for name in categorical if name != self.target_column
        ]
        self.onehotcoder = OneHotEncoder(
            sparse_output=False,
            handle_unknown='ignore',
        )
        self.onehotcoder.fit(X[self.categ_cols])
        return self

    def transform(self, X):
        """Return ``(features, target)`` for X.

        The row index is reset to 0..n-1. ``features`` holds the
        non-categorical columns followed by the one-hot columns; ``target``
        is the ``target_column`` series.
        """
        frame = X.reset_index(drop=True)
        encoded = pd.DataFrame(
            self.onehotcoder.transform(frame[self.categ_cols]),
            columns=self.onehotcoder.get_feature_names_out(self.categ_cols),
            index=frame.index,
        )
        target = frame[self.target_column]
        remaining = frame.drop(columns=[*self.categ_cols, self.target_column])
        return pd.concat([remaining, encoded], axis=1), target
    
class TrainValidationTest:
    """Split data into train, validation and test parts in two stages.

    Parameters
    ----------
    test_size : float
        Fraction held out at each stage: first for the test set, then (from
        what is left) for the validation set.
    random_state : int
        Seed passed to both ``train_test_split`` calls.
    """

    def __init__(self, test_size=0.2, random_state=21):
        self.test_size = test_size
        self.random_state = random_state

    def _holdout(self, X, y):
        """Split once; stratify only if every class has more than one sample."""
        rarest = y.value_counts().min()
        labels = y if rarest > 1 else None
        return train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=labels,
        )

    def split(self, X, y):
        """Return ``X_train, X_valid, X_test, y_train, y_valid, y_test``."""
        # Stage 1: (train + validation) versus test.
        X_rest, X_test, y_rest, y_test = self._holdout(X, y)
        # Stage 2: train versus validation, taken from the remainder.
        X_train, X_valid, y_train, y_valid = self._holdout(X_rest, y_rest)
        return X_train, X_valid, X_test, y_train, y_valid, y_test


In [213]:
# Load the raw submissions log.
df = pd.read_csv('../data/checker_submits.csv')

# Chain both custom transformers; the last step returns (features, target).
feature_pipeline = Pipeline(steps=[
    ("features", FeatureExtractor()),
    ("myonehotencode", MyOneHotEncoder(target_column='dayofweek')),
])

X_encoded, y = feature_pipeline.fit_transform(df)

# Train / validation / test split with the default size and seed.
splitter = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X_encoded, y)



## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict whose keys are the indexes in that list and whose values are the names of the models. The example below is written top-down (from the high-level call to the low-level definitions):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`. At the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [214]:
class ModelSelection:
    """Pick the best model family by scoring every parameter set on a validation set.

    Parameters
    ----------
    grids : list
        ``GridSearchCV``-like objects. Only their ``estimator`` and
        ``param_grid`` attributes are read; the grid searches themselves are
        never fitted.
    grid_dict : dict
        Maps the position of each grid in ``grids`` to a model name.
    """

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        # One row per model family, filled in by choose().
        self.results = []

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Search every grid and return the name of the best model family.

        Each parameter combination is fitted on the training data and scored
        with the estimator's ``score`` method on the validation data. The
        estimators held by the grids are reconfigured and refitted in place.

        Returns
        -------
        str or None
            Name of the family with the highest validation score, or ``None``
            when ``grids`` is empty.
        """
        # Start clean so repeated calls do not pile up duplicate rows.
        self.results = []
        best_score = -1
        best_model_name = None

        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            estimator = grid.estimator
            print(f"\nEstimator: {model_name}")

            best_local_score = -1
            best_local_params = None

            for params in tqdm(ParameterGrid(grid.param_grid)):
                estimator.set_params(**params)
                estimator.fit(X_train, y_train)
                valid_score = estimator.score(X_valid, y_valid)

                # Strict comparison: on ties the earliest parameter set wins.
                if valid_score > best_local_score:
                    best_local_score = valid_score
                    best_local_params = params

            print(f"Best params: {best_local_params}")
            print(f"Validation set accuracy score for best params: {best_local_score:.5f}")

            self.results.append({
                'model': model_name,
                'params': best_local_params,
                'valid_score': best_local_score,
            })
            if best_local_score > best_score:
                best_score = best_local_score
                best_model_name = model_name

        # Print the name exactly as given (str.title() would turn "SVM" into
        # "Svm") and skip the line when there was nothing to search.
        if best_model_name is not None:
            print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return best_model_name

    def best_results(self):
        """Return a dataframe with columns ``model``, ``params``, ``valid_score``."""
        return pd.DataFrame(self.results)



In [215]:
# Base estimators; their hyper-parameters are set from the grids below.
svm = SVC()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()

tree_params = {
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21],
}
svm_params = {
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ('balanced', None),
    'random_state': [21],
    'probability': [True],
}
rf_params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21],
}

# All three grids share the same scoring and cv settings (gs_rf previously omitted cv=2).
gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=4)
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=4)
gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=2, n_jobs=4)

grids = [gs_svm, gs_tree, gs_rf]

# Keys are positions in `grids`; values are the names shown in the report.
grid_dict = {0: 'SVM', 1: 'Decision', 2: 'Random Forest'}

selector = ModelSelection(grids, grid_dict)
best_model_name = selector.choose(X_train, y_train, X_valid, y_valid)
results_df = selector.best_results()
results_df


Estimator: SVM


  0%|          | 0/72 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Validation set accuracy score for best params: 0.87778

Estimator: Decision


  0%|          | 0/196 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 19, 'random_state': 21}
Validation set accuracy score for best params: 0.88889

Estimator: Random Forest


  0%|          | 0/784 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'n_estimators': 50, 'random_state': 21}
Validation set accuracy score for best params: 0.91111

Classifier with best validation set accuracy: Random Forest


Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.877778
1,Decision,"{'class_weight': None, 'criterion': 'entropy',...",0.888889
2,Random Forest,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911111


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [216]:
class Finalize:
    """Fit the chosen estimator, report its test score and save it to disk.

    Parameters
    ----------
    estimator : object
        Any object exposing ``fit(X, y)`` and ``score(X, y)``.
    """

    def __init__(self, estimator):
        self.estimator = estimator
        # Last score computed by final_score(). Kept under a different name
        # so the attribute does not overwrite the method on the instance.
        self.score_ = None

    def final_score(self, X_train, y_train, X_test, y_test):
        """Fit on the training data and return the score on the test data.

        The result is also stored in ``self.score_``; the method can be
        called any number of times.
        """
        self.estimator.fit(X_train, y_train)
        self.score_ = self.estimator.score(X_test, y_test)
        return self.score_

    def save_model(self, path='../data/', model_name='final_model.pkl'):
        """Dump the estimator with joblib to ``path``/``model_name`` and report it.

        ``path`` is a directory; an empty string means the current working
        directory. A trailing separator on ``path`` is optional.
        """
        import os

        destination = os.path.join(path, model_name)
        joblib.dump(self.estimator, destination)
        print(f'Model was successfully saved to {destination}')

In [217]:

# Rebuild the Random Forest with the best parameters printed by the search above.
rf = RandomForestClassifier(class_weight='balanced',criterion='gini',max_depth=20,n_estimators=50,random_state=21)
final = Finalize(estimator=rf)
# NOTE: despite its name, `model` holds the test-set score returned by final_score().
model = final.final_score(X_train,y_train,X_test,y_test)
print(f'Accuracy of the final model is {model}')
# An empty path places the file in the current working directory.
final.save_model(path='')

Accuracy of the final model is 0.9023668639053254


## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [218]:
# Reload the model written by save_model(path='') earlier in the notebook.
# joblib relies on pickle, so only load files you trust.
model = joblib.load('final_model.pkl')
# Bare expression: shows the loaded estimator as the cell output.
model

In [219]:
# Preprocessing pipeline: time features first, then one-hot encoding with
# 'dayofweek' split off as the target.
preprocessing = Pipeline([
    ('feature_extractor',FeatureExtractor()),
    ('onehot_encoder',MyOneHotEncoder('dayofweek'))
])

In [220]:
# The last step's transform returns a (features, target) pair, so `data` is a tuple.
data = preprocessing.fit_transform(df)

In [221]:
# Unpack the feature frame and the target series.
X,y = data

In [222]:
# Default settings: test_size=0.2 and random_state=21 at both split stages.
X_train, X_valid, X_test, y_train, y_valid, y_test = TrainValidationTest().split(X,y)

In [223]:
# Parameter grids for the three model families.
tree_params = {
    'max_depth': list(range(20, 70)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21],
}
svm_params = {
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ('balanced', None),
    'random_state': [21],
    'probability': [True],
}
rf_params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21],
}

# Each grid gets a fresh, unconfigured estimator.
gs_svm = GridSearchCV(
    estimator=SVC(), param_grid=svm_params,
    scoring='accuracy', cv=2, n_jobs=2,
)
gs_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(), param_grid=tree_params,
    scoring='accuracy', cv=2, n_jobs=2,
)
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(), param_grid=rf_params,
    scoring='accuracy', cv=2, n_jobs=2,
)

grids = [gs_svm, gs_tree, gs_rf]
# Keys are positions in `grids`; values are the names shown in the report.
grid_dict = {0: 'SVM', 1: 'Decision Tree', 2: 'Random Forest'}

model_select = ModelSelection(grids, grid_dict)
best_model = model_select.choose(X_train, y_train, X_valid, y_valid)


Estimator: SVM


  0%|          | 0/72 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Validation set accuracy score for best params: 0.87778

Estimator: Decision Tree


  0%|          | 0/200 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 20, 'random_state': 21}
Validation set accuracy score for best params: 0.88889

Estimator: Random Forest


  0%|          | 0/784 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 20, 'n_estimators': 50, 'random_state': 21}
Validation set accuracy score for best params: 0.91111

Classifier with best validation set accuracy: Random Forest


In [224]:
# Random Forest configured with the best parameters reported by the search above.
rf = RandomForestClassifier(class_weight='balanced',criterion='gini',max_depth=20,n_estimators=50, random_state=21)

In [227]:
final = Finalize(estimator=rf)
model_score = final.final_score(X_train, y_train, X_test, y_test)
# File name pattern: <estimator class name>_<test score to 5 decimals>.sav
model_name = f'{type(rf).__name__}_{model_score:.5f}.sav'
final.save_model(path='', model_name=model_name)