# Models

In [None]:
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import ADASYN, SMOTE
from joblib import dump
from sklearn.decomposition import SparsePCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

## Reading

In [None]:
# Load the interim dataset from its Feather file into a DataFrame.
df_interim = pd.read_feather(
    path=Path('../data/interim/dataset_interim.feather'),
)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df_interim.info()

## Selecting/Tuning the model

In [None]:
# Features: every column except the target and the 'index' column.
x = df_interim.drop(columns=['fraudulent', 'index'])
# Target: every column except the text and the 'index' column
# (presumably only 'fraudulent' remains -- verify against the dataset).
y = df_interim.drop(columns=['description', 'index'])

In [None]:
# Hold out 15% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

In [None]:
def objective(trial):
    """Tune the model."""
    # Oversample because the dataset is unbalanced.
    #oversample = trial.suggest_categorical('oversample', ['SMOTE', 'ADASYN'])
    #if oversample == 'SMOTE':
    #    k_neighbors = trial.suggest_int('oversample_n_neighbors', 2, 5, 1)
    #    oversampler = SMOTE(
    #        random_state=42, n_jobs=-1, k_neighbors=k_neighbors
    #    )
    #else:
    #    n_neighbors = trial.suggest_int('oversample_n_neighbors', 2, 5, 1)
    #    oversampler = ADASYN(
    #        random_state=42, n_jobs=-1, n_neighbors=n_neighbors
    #    )
    k_neighbors = trial.suggest_int('oversample_n_neighbors', 2, 5, 1)
    oversampler = SMOTE(random_state=42, n_jobs=-1, k_neighbors=k_neighbors)
    
    # curse of dimensionality
    #features_selection = trial.suggest_categorical(
    #    'features_selection', 
    #    ['random','pca']
    #    )
    #if features_selection == 'random':
    #    select = SelectFromModel(RandomForestClassifier(
    #        random_state=42, 
    #        n_jobs=-1
    #        ))
    #else:
    #    n_components = trial.suggest_int('select_n_components', 40, 300, 5)
    #    select = SparsePCA(
    #        n_components=n_components, 
    #        random_state=42, 
    #        n_jobs=-1
    #        )
    select = SelectFromModel(
        RandomForestClassifier(random_state=42, n_jobs=-1)
    )

    idf_params = {
        'ngram_range': trial.suggest_categorical('vect_ngram_range',[1, 2]),
        'max_df': trial.suggest_float('vect_max_df', 0.7, 1),
        'min_df': trial.suggest_float('vect_min_df', 0.001, 0.1),
        'norm': trial.suggest_categorical('vect_norm', ['l1', 'l2']),
        'use_idf': trial.suggest_categorical('vect_use_idf',[True, False]),
    }
    # Re-weight (or not) features with inverse document-frequency.
    pipe = Pipeline([
        (
            'vect', TfidfVectorizer(
                decode_error='replace',
                strip_accents='unicode',
                ngram_range=(1, idf_params['ngram_range']),
                max_df=idf_params['max_df'],
                min_df=idf_params['min_df'],
                norm=idf_params['norm'],
                use_idf=idf_params['use_idf'],
            )
        ),
        # False to work on sparse matrix.
        (
            'std', StandardScaler(with_mean=False)
        ),
    ])
    # models
    classifier = trial.suggest_categorical(
        'clf',
        ['LinearSVC', 'LogisticRegression', 'MultinomialNB']
    )
    match classifier:
        case 'LinearSVC':
            clf_params = {
                'C': trial.suggest_float('clf_C', 1e-10, 1e5),
                'class_weight': trial.suggest_categorical(
                'clf_class_weight', ['balanced', None]
                ),
                'random_state': 42,
                'max_iter': 10_000,
                'fit_intercept': False,
                'loss': trial.suggest_categorical(
                    'clf_loss', ['hinge', 'squared_hinge']
                ),
            }
            clf_obj = LinearSVC(**clf_params)
        case 'LogisticRegression':
            clf_params = {
                'C': trial.suggest_float('clf_C', 1e-10, 1e5),
                'class_weight': trial.suggest_categorical(
                'clf_class_weight', ['balanced', None]
                ),
                'random_state': 42,
                'max_iter': 10_000,
                'fit_intercept': False,
                'l1_ratio': trial.suggest_float(
                    'clf_l1_ratio', 0.001, 0.999
                ),
            }
            clf_obj = LogisticRegression(
                solver='saga', n_jobs=-1, penalty='elasticnet', **clf_params
            )
        case 'MultinomialNB':
            clf_params = {
                'alpha': trial.suggest_float('clf_alpha', 1e-10, 1e5),
                'fit_prior': trial.suggest_categorical(
                    'clf_prior', [True, False]
                )
            }
            clf_obj = MultinomialNB(**clf_params)
    
    # stratification
    skf = StratifiedKFold(5)
    
    x_train = xtrain.reset_index(drop=True)
    y_train = ytrain.reset_index(drop=True)

    x_train['fold'] = -1
    y_train['fold'] = -1

    for fold, (train_id, valid_id) in enumerate(
        skf.split(
            x_train.iloc[:, :-1], 
            y_train.iloc[:, :-1]
        )
    ):
        x_train.loc[valid_id, ['fold']] = fold
        y_train.loc[valid_id, ['fold']] = fold

    scores = []
    for fold in np.unique(x_train.iloc[:, -1]):  
        
        x_train_fold = x_train.loc[
            x_train['fold'] != fold, ['description']
        ]
        y_train_fold = y_train.loc[
            y_train['fold'] != fold, ['fraudulent']
            ].astype('int')

        x_valid = x_train.loc[x_train['fold'] == fold, ['description']]
        y_valid = y_train.loc[
            y_train['fold'] == fold, ['fraudulent']
            ].astype('int')
            
        x = pipe.fit_transform(x_train_fold.iloc[:, -1])
        xvalid = pipe.transform(x_valid.iloc[:, -1])
    
        x, y = oversampler.fit_resample(x, y_train_fold.iloc[:, -1])
    
        x = select.fit_transform(x.toarray(), y) 
        xvalid = select.transform(xvalid.toarray())

        clf_obj.fit(x, y)
        y_pred = clf_obj.predict(xvalid)

        score = balanced_accuracy_score(y_valid, y_pred, adjusted=True)
        scores.append(score)

    return np.mean(scores) 

In [None]:
# Create the 'control' study in the SQLite store (or resume it when it
# already exists), then run 60 trials maximising the value of `objective`.
study = optuna.create_study(
    storage='sqlite:///../results/train_opt.db',
    study_name='control',
    load_if_exists=True,
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=2),
)
study.optimize(func=objective, n_trials=60)

In [None]:
# Reload the stored 'control' study from the SQLite store.
results = optuna.load_study(
    storage='sqlite:///../results/train_opt.db',
    study_name='control',
)

In [None]:
# Best objective value found by the study (the study maximises it).
results.best_value

In [None]:
# Hyperparameters of the best trial, keyed by the names used in `objective`.
results.best_params

## Training

In [None]:
def train(x_train, y_train, x_valid, **kwargs):
    """Train the best model."""
    pipe = Pipeline([
        ('vect', TfidfVectorizer(
            max_df=kwargs['vect_max_df'],
            min_df=kwargs['vect_min_df'],
            ngram_range=(1, kwargs['vect_ngram_range']),
            norm=kwargs['vect_norm'],
            use_idf=kwargs['vect_use_idf'])),
        ('std', StandardScaler(with_mean=False)),
    ])
    
    oversampler = SMOTE(
        random_state=42, 
        n_jobs=-1, 
        k_neighbors=kwargs['oversample_n_neighbors']
    )
    
    select = SelectFromModel(
        RandomForestClassifier(random_state=42, n_jobs=-1)
    )
    
    clf_obj = LogisticRegression(
        solver='saga', 
        n_jobs=-1,
        penalty='elasticnet',
        C=kwargs['clf_C'],
        class_weight=kwargs['clf_class_weight'],
        l1_ratio=kwargs['clf_l1_ratio'],
        random_state=42,
        max_iter=10_000,
        fit_intercept=False
        )
    
    x = pipe.fit_transform(x_train.iloc[:, -1])
    xvalid = pipe.transform(x_valid.iloc[:, -1])
    
    x, y = oversampler.fit_resample(x, y_train.iloc[:, -1])
    
    x = select.fit_transform(x.toarray(), y) 
    xvalid = select.transform(xvalid.toarray())

    clf_obj.fit(x, y)
    y_pred = clf_obj.predict(xvalid)
    y_hat = clf_obj.decision_function(xvalid)
    
    return pipe, select, clf_obj, y_pred, y_hat


In [None]:
# Train the final model on the training split with the study's best
# hyperparameters and predict the held-out validation split.
pipe, select, clf_obj, y_pred, y_hat = train(
    x_train=xtrain,
    y_train=ytrain,
    x_valid=xvalid,
    **results.best_params,
)

In [None]:
# Balanced accuracy on the held-out validation split.
# NOTE(review): this is the unadjusted score, whereas `objective` uses
# adjusted=True, so it is not directly comparable to `results.best_value`.
balanced_accuracy_score(yvalid, y_pred)

## Dumping the model

In [None]:
# Persist the three fitted objects returned by `train`: the text pipeline,
# the feature selector and the classifier.
# NOTE(review): assumes the ../models directory already exists -- confirm.
dump(pipe, Path('../models/pipe.joblib'))
dump(select, Path('../models/select.joblib'))
dump(clf_obj, Path('../models/model.joblib'))