## Model selection 

Libraries

In [2]:
import utilities
import pandas            as pd
import numpy             as np
import seaborn           as sns
import matplotlib.pyplot as plt
import re
import sklearn
import random
import textdistance
import umap

from sklearn.tree              import DecisionTreeClassifier
from sklearn.model_selection   import train_test_split
from sklearn                   import metrics
from sklearn.model_selection   import GridSearchCV
from transformers              import BertTokenizer, BertModel
from sentence_transformers     import SentenceTransformer
from sklearn.metrics           import roc_curve
from umap                      import UMAP
from typing                    import Optional
from sklearn.decomposition     import PCA
from sklearn.linear_model      import LogisticRegression

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


Import data

In [3]:
# Load the labelled dataset; the 'text' column is embedded below and the
# 'is_junk' column is used as the classification target.
chanel_junk_valid_new = pd.read_excel('C:/Users/a.tekiouk/Sujet_2/Sujet_2/DATA/chanel_junk_valid_2.xlsx')

# Pre-trained sentence-embedding model used to turn each text into a vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is documented for a string or a list of strings. Passing a plain
# list avoids depending on how a pandas Series happens to be indexed.
X = model.encode(chanel_junk_valid_new['text'].tolist())
y = chanel_junk_valid_new['is_junk']

Selection

In [4]:
def fit_params(
    X: pd.DataFrame,
    y: pd.Series,
    grid_params: dict,
    predictor: Optional[object] = None,
    hyperopt_params: Optional[dict] = None,
)-> dict:
    """
    Run a grid search for the given model and return the best hyperparameters.

    The data is used exactly as provided: no dimensionality reduction is
    applied here, so any reduction must be done by the caller beforehand.

    Parameters
    ----------
    X : pd.DataFrame
        The input features, passed unchanged to ``GridSearchCV.fit``.
    y : pd.Series
        The target variable, passed unchanged to ``GridSearchCV.fit``.
    grid_params : dict
        The dictionary of hyperparameter grids to search over using GridSearchCV.
    predictor : object, optional
        The model object implementing the scikit-learn estimator interface.
        When None (the default), a fresh DecisionTreeClassifier() is used.
    hyperopt_params : dict, optional
        Additional keyword arguments forwarded to the GridSearchCV constructor
        (e.g. ``scoring``, ``cv``), by default None.

    Returns
    -------
    dict
        The best parameters found by the hyperparameter optimization.
    """
    # Build the default estimator per call instead of once at definition time,
    # so no estimator instance is shared between calls.
    if predictor is None:
        predictor = DecisionTreeClassifier()
    if hyperopt_params is None:
        hyperopt_params = {}

    search = GridSearchCV(
        estimator=predictor,
        param_grid=grid_params,
        **hyperopt_params
    )
    search.fit(X, y)
    return search.best_params_

In [5]:
def evaluate(
    y_obs: pd.Series,
    y_pred: pd.Series,
    y_score: pd.Series,
) -> None:
    """
    Print classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_obs : pd.Series
        The observed (true) labels.
    y_pred : pd.Series
        The predicted labels.
    y_score : pd.Series
        The scores used to build the ROC curve (e.g. predicted probability
        of the positive class).

    Returns
    -------
    None
        The AUC, accuracy, precision and recall are printed, and the confusion
        matrix is delegated to ``utilities.plot_confusion_matrix``.
    """
    # Every metric is computed against the y_obs argument rather than a
    # global y_test, so the function works for any set of labels.
    fpr, tpr, _ = metrics.roc_curve(y_obs, y_score)
    print(f"AUC : {metrics.auc(fpr, tpr):.3f}")
    print(f"Accuracy score : {metrics.accuracy_score(y_obs, y_pred):.3f}")
    print(f"Precision score : {metrics.precision_score(y_obs, y_pred):.3f}")
    print(f"Recall score : {metrics.recall_score(y_obs, y_pred):.3f}")
    utilities.plot_confusion_matrix(y_true=y_obs, y_pred=y_pred)

In [28]:
def _fit_and_score(reducer, predictor, grid_params, hyperopt_params,
                   X_train, X_test, y_train, y_test):
    """Reduce, tune and fit one reducer/predictor pair; return (model, test ROC AUC)."""
    # The reducer is fitted on the training rows only, so the held-out rows
    # do not influence the projection they are later evaluated in.
    X_train_reduced = reducer.fit(X_train).transform(X_train)
    X_test_reduced = reducer.transform(X_test)

    opt_params = fit_params(
        X=X_train_reduced,
        y=y_train,
        predictor=predictor,
        grid_params=grid_params,
        hyperopt_params=hyperopt_params,
    )
    model = predictor.set_params(**opt_params).fit(X_train_reduced, y_train)

    y_score = model.predict_proba(X_test_reduced)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    return model, metrics.auc(fpr, tpr)


def auto_model_selection(
    X : np.ndarray,
    y : pd.Series,
    dimension_range : range
    )-> tuple[object, float, int]:
    """
    Automatically selects the best model using dimensionality reduction techniques (PCA, UMAP)
    and evaluates their performance based on ROC AUC score.

    Four reducer/classifier pairs are tried, in this order, for every value of
    ``dimension_range``: PCA + decision tree, UMAP + decision tree,
    PCA + logistic regression, UMAP + logistic regression. Hyperparameters are
    tuned with ``fit_params`` (5-fold grid search scored on ROC AUC) on an 80%
    training split, and each tuned model is scored on the remaining 20%.

    Parameters
    ----------
    X : np.ndarray
        The input feature data (assumed to be a 2-D feature matrix, as
        required by the reducers).
    y : pd.Series
        The target variable.
    dimension_range : range
        The range of dimensions to explore for dimensionality reduction.

    Returns
    -------
    Tuple[object, float, int]
        A tuple containing the best model, the best ROC AUC score, and the optimal number of dimensions.
        The returned model expects inputs reduced to that number of
        dimensions; the fitted reducer itself is not returned.

    Raises
    ------
    ValueError
        If no candidate produced a comparable ROC AUC (for example when
        ``dimension_range`` is empty).
    """
    hyperopt_params = {
        "scoring": "roc_auc",
        "cv": 5,
        "refit": True
    }
    tree_grid = {
        'criterion': ['entropy', 'gini', 'log_loss'],
        'max_depth' : np.arange(2, 6, dtype=int),
        'ccp_alpha' : np.linspace(0.0, 0.20, 5)
    }
    # These values are solver names, so they belong under 'solver';
    # LogisticRegression has no 'criterion' parameter.
    logreg_grid = {
        'solver': ['lbfgs', 'liblinear', 'newton-cg','newton-cholesky','sag','saga']
    }
    # (reducer class, classifier class, hyperparameter grid), in evaluation order.
    candidates = (
        (PCA, DecisionTreeClassifier, tree_grid),
        (UMAP, DecisionTreeClassifier, tree_grid),
        (PCA, LogisticRegression, logreg_grid),
        (UMAP, LogisticRegression, logreg_grid),
    )

    # Split once, before any reduction. With a fixed seed the partition only
    # depends on the number of rows, so every candidate sees the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    best_model = None
    best_auc = float("-inf")
    opt_nb_dim = None
    for reducer_cls, predictor_cls, grid_params in candidates:
        for dim in dimension_range:
            model, auc = _fit_and_score(
                reducer=reducer_cls(n_components=dim),
                predictor=predictor_cls(random_state=42),
                grid_params=grid_params,
                hyperopt_params=hyperopt_params,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
            )
            # Strict comparison: on ties the earliest candidate is kept.
            if auc > best_auc:
                best_auc = auc
                best_model = model
                opt_nb_dim = dim

    if best_model is None:
        raise ValueError(
            "No model could be selected: dimension_range is empty or no "
            "candidate produced a valid ROC AUC."
        )
    return best_model, best_auc, opt_nb_dim

In [30]:
# Explore reduced dimensionalities 10 through 19 and keep the best candidate.
search_space = range(10, 20)
m, auc, dim = auto_model_selection(X=X, y=y, dimension_range=search_space)

# Display the selected model, its ROC AUC and the chosen number of dimensions.
m, auc, dim

KeyboardInterrupt: 