# Ex04 Pipelines and OOP

In [130]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import joblib
from tqdm.notebook import tqdm
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

## 1. Preprocessing pipeline

In [18]:
class FeatureExtractor():
    def __init__(self,data):
        self.data = data
    def extract_df(self):
        new_df = self.data
        new_df.timestamp = pd.to_datetime(new_df.timestamp)
        hours = new_df.timestamp.dt.hour
        weekdays = new_df.timestamp.dt.weekday
        new_df['hour'] = hours
        new_df['weekday'] = weekdays
        new_df = new_df.drop(columns='timestamp')
        return new_df



In [28]:
data = pd.read_csv('../data/checker_submits.csv')
extractor = FeatureExtractor(data)
new_df = extractor.extract_df()
new_df['uid'] = new_df.uid.astype('category')
new_df['labname'] = new_df.labname.astype('category')

In [29]:
new_df

Unnamed: 0,uid,labname,numTrials,hour,weekday
0,user_4,project1,1,5,4
1,user_4,project1,2,5,4
2,user_4,project1,3,5,4
3,user_4,project1,4,5,4
4,user_4,project1,5,5,4
...,...,...,...,...,...
1681,user_19,laba06s,9,20,3
1682,user_1,laba06s,6,20,3
1683,user_1,laba06s,7,20,3
1684,user_1,laba06s,8,20,3


In [124]:
class MyOneHotEncoder():
    def __init__(self,df,target):
        self.df = df
        self.target = target
    def encode_df(self):
        encoder = OneHotEncoder(sparse_output=False)
        categorical_cols = []
        for col in self.df.columns:
            if self.df[col].dtype == 'category' and col != self.target:
                categorical_cols.append(col)

        encoded_features = encoder.fit_transform(data[categorical_cols])
        encoded_columns = encoder.get_feature_names_out(categorical_cols)
        encode_df = pd.concat([self.df,pd.DataFrame(encoded_features, columns = encoded_columns)],axis=1).drop(columns=categorical_cols)
        return encode_df


In [125]:
encoder = MyOneHotEncoder(new_df, 'weekday')
encode_df = encoder.encode_df()
encode_df

Unnamed: 0,numTrials,hour,weekday,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
class TrainValidationTest:
    def __init__(self, df):
        self.X = df.drop(columns='weekday')
        self.y = df['weekday']
    def train_test(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X,self.y,test_size =0.2, random_state=21, stratify=self.y)
        return X_train, X_test, y_train, y_test

In [52]:
train = TrainValidationTest(encode_df)
X_train, X_test, y_train, y_test = train.train_test()

## 2. Model selection pipeline

In [57]:
svm = SVC(probability=True, random_state=21)
svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 
               'C':[0.01, 0.1, 1, 1.5, 5, 10], 
               'gamma': ['scale', 'auto'], 
               'class_weight':('balanced', None), 
               'random_state':[21], 
               'probability':[True]}]
gs_svm =  GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)

In [58]:
tree = DecisionTreeClassifier(random_state=21)
tree_params = [ {'max_depth': [1,5,10,20,30,40,45,49],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']}]
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=-1)

In [59]:
forest = RandomForestClassifier(random_state=21)
forest_params = [ {
    'max_depth': [1,5,10,20,30,40,45,49],
    'n_estimators' : [5,10,50,100],
    'class_weight': ['balanced', None],
    'criterion':['entropy','gini']
}]
gs_rf = GridSearchCV(estimator=forest, param_grid=forest_params, scoring='accuracy', cv=2, n_jobs=-1)

In [66]:
grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0 : 'SVC', 1: 'DecisionTreeClassifier', 2 : 'RandomForestClassifier'}

In [105]:
class ModelSelection():
    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
    def choose(self, X_train, y_train, X_valid, y_valid):
        best_model_name = None
        best_score = 0
        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            print(f"Estimator {model_name}")
            total_fits = np.prod([len(v) for v in grid.param_grid[0].values()]) * 2
            with tqdm(total=total_fits, desc=f"{model_name}", ncols=100, unit="it/s") as pbar:
                grid.fit(X_train, y_train)
                
                valid_score = grid.score(X_valid, y_valid)
                if valid_score > best_score:
                    best_score = valid_score
                    best_model_name = model_name

                pbar.update(total_fits)    
                print(f"Best params: {grid.best_params_}")
                print(f"Best training accuracy: {grid.best_score_:.3f}")
                print(f"Validation set accuracy score for best params: {valid_score:.3f}")

        print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return best_model_name
    
    def best_results(self, X_train, y_train, X_valid, y_valid):
        models = []
        params =  []
        scores = []
        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            models.append(model_name)
            grid.fit(X_train, y_train)
            params.append(grid.best_params_)
            scores.append(grid.score(X_valid, y_valid))
        return pd.DataFrame({'model': models, 'param' : params, 'valid_score': scores})

        


In [106]:
grid_class = ModelSelection(grids, grid_dict)

In [108]:
grid_class.choose(X_train, y_train, X_test, y_test)

Estimator SVC


SVC:   0%|                                                                | 0/144 [00:00<?, ?it/s/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.809
Validation set accuracy score for best params: 0.888
Estimator DecisionTreeClassifier


DecisionTreeClassifier:   0%|                                              | 0/64 [00:00<?, ?it/s/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 30}
Best training accuracy: 0.829
Validation set accuracy score for best params: 0.867
Estimator RandomForestClassifier


RandomForestClassifier:   0%|                                             | 0/256 [00:00<?, ?it/s/s]

Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 100}
Best training accuracy: 0.874
Validation set accuracy score for best params: 0.932

Classifier with best validation set accuracy: RandomForestClassifier


'RandomForestClassifier'

In [107]:
grid_class.best_results(X_train, y_train, X_test, y_test)

Unnamed: 0,model,param,valid_score
0,SVC,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.887574
1,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.866864
2,RandomForestClassifier,"{'class_weight': None, 'criterion': 'entropy',...",0.931953


## 3. Finalization

In [148]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator
    def final_score(self, X_train, y_train, X_test, y_test):
        self.estimator.fit(X_train, y_train)
        y_pred = self.estimator.predict(X_test)
        print(f"Accuracy of the final model is {accuracy_score(y_test, y_pred)}")
        return accuracy_score(y_test, y_pred)
    def save_model(self, path):
        if joblib.dump(self.estimator, f'{path}'):
            print(f"Model was successfuly saved on path: {path}")
        else:
            raise Exception("Error while saving model")


In [144]:
final_model = Finalize(forest)

In [145]:
final_model.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final model is 0.9378698224852071


In [146]:
final_model.save_model('../data/rf_model.joblib')

Model was successfuly saved on path: ../data/rf_model.joblib


## 4. Main program

In [131]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_new = X.copy() 
        X_new['timestamp'] = pd.to_datetime(X_new['timestamp'])
        X_new['hour'] = X_new['timestamp'].dt.hour  
        X_new['weekday'] = X_new['timestamp'].dt.weekday  
        X_new = X_new.drop(columns='timestamp')
        return X_new

In [134]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, target):
        self.target = target

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False)
        self.categorical_cols = [col for col in X.columns if X[col].dtype == 'category' and col != self.target]
        self.encoder.fit(X[self.categorical_cols])
        return self

    def transform(self, X):
        encoded_features = self.encoder.transform(X[self.categorical_cols])
        encoded_columns = self.encoder.get_feature_names_out(self.categorical_cols)
        encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)
        X_new = pd.concat([X, encoded_df], axis=1).drop(columns=self.categorical_cols)
        
        return X_new

In [135]:
data = pd.read_csv('../data/checker_submits.csv')
data['uid'] = data.uid.astype('category')
data['labname'] = data.labname.astype('category')

In [136]:
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])

In [137]:
df = preprocessing.fit_transform(data)

In [138]:
train = TrainValidationTest(df)
X_train, X_test, y_train, y_test = train.train_test()

In [139]:
grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0 : 'SVC', 1: 'DecisionTreeClassifier', 2 : 'RandomForestClassifier'}
grid_class = ModelSelection(grids, grid_dict)

In [140]:
grid_class.choose(X_train, y_train, X_test, y_test)

Estimator SVC


SVC:   0%|                                                                | 0/144 [00:00<?, ?it/s/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.809
Validation set accuracy score for best params: 0.888
Estimator DecisionTreeClassifier


DecisionTreeClassifier:   0%|                                              | 0/64 [00:00<?, ?it/s/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 30}
Best training accuracy: 0.829
Validation set accuracy score for best params: 0.867
Estimator RandomForestClassifier


RandomForestClassifier:   0%|                                             | 0/256 [00:00<?, ?it/s/s]

Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 100}
Best training accuracy: 0.874
Validation set accuracy score for best params: 0.932

Classifier with best validation set accuracy: RandomForestClassifier


'RandomForestClassifier'

In [141]:
grid_class.best_results(X_train, y_train, X_test, y_test)

Unnamed: 0,model,param,valid_score
0,SVC,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.887574
1,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.866864
2,RandomForestClassifier,"{'class_weight': None, 'criterion': 'entropy',...",0.931953


In [150]:
final_model = Finalize(forest)
accuracy = final_model.final_score(X_train, y_train, X_test, y_test)


Accuracy of the final model is 0.9378698224852071


In [151]:
final_model.save_model(f'final_model_{accuracy:.3f}.sav')

Model was successfuly saved on path: final_model_0.938.sav
