<a href="https://colab.research.google.com/github/adrian-alejandro/autoML/blob/main/AutoML_Practico_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.4-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 5.2 MB/s 
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 48.3 MB/s 
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.4 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 8.0 MB/s 
Collecting stevedore>=2.0.1
  Downloading stevedore-4.1.1-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.5 MB/s 
[?25hCollecting autopage>=0.4.0
  Downloading autopage-0.5.1-py3-none-any.whl (2

In [47]:
import os
from io import BytesIO
import requests
import pandas as pd
import numpy as np
import optuna

from sklearn import datasets
from sklearn import model_selection
from sklearn import svm, naive_bayes, linear_model
from sklearn.metrics import balanced_accuracy_score, make_scorer

In [40]:
vectorized_dataset_url = "https://raw.githubusercontent.com/adrian-alejandro/Busqueda-y-Recomendacion-para-Textos-Legales-Mentoria-2022/main/embeddings/vectorized_dataset_X_y.npz"
processed_data_url = "https://raw.githubusercontent.com/adrian-alejandro/Busqueda-y-Recomendacion-para-Textos-Legales-Mentoria-2022/main/embeddings/processed_dataset.csv"

In [32]:
vector_file = os.path.split(vectorized_dataset_url)[1]
dataset = os.path.split(processed_data_url)[1]

In [45]:
def read_npz_from_url(url):
  """Function that reads a npz file from a URL and overrides np.load method to
   force allow_pickle.
   Ref: https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa
  """
  # save np.load
  np_load_old = np.load

  # modify the default parameters of np.load
  np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

  r = requests.get(url, stream=True)

  # call load_data with allow_pickle implicitly set to true
  vectors = np.load(BytesIO(r.raw.read()))

  # restore np.load for future normal usage
  np.load = np_load_old

  return vectors

In [46]:
vectors = read_npz_from_url(vectorized_dataset_url)

# Texto vectorizado
X = vectors['X']
                                            

# Etiquetas (fueros)
y = vectors['y']

In [39]:
data = pd.read_csv(processed_data_url, sep='|', encoding='utf-8')
data.head()

Unnamed: 0,archivo,fuero,texto_clean
0,9 BAEZ-FLECHA BUS.pdf.txt,LABORAL,"['sala', 'laboral', 'tribunal', 'superior', 'p..."
1,90 FUNES-COYSPU.pdf.txt,LABORAL,"['sala', 'laboral', 'tribunal', 'superior', 'p..."
2,1 QUINTEROS-CONSOLIDAR.pdf.txt,LABORAL,"['sala', 'laboral', 'tribunal', 'superior', 'p..."
3,3 SANGUEDOLCE-MUNICIPALIDAD DE VILLA ALLENDE.p...,LABORAL,"['sala', 'laboral', 'tribunal', 'superior', 'p..."
4,188 LUCIANO-NICOLAS.pdf.txt,LABORAL,"['sala', 'laboral', 'tribunal', 'superior', 'p..."


In [None]:
#Step 1. Define an objective function to be maximized.
def objective(trial):

    classifier_name = trial.suggest_categorical("classifier", ["LogReg", "SVM", "MNB"])
    
    # Step 2. Setup values for the hyperparameters:
    if classifier_name == 'LogReg':
      # Seach space
      logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
      logreg_solver = trial.suggest_categorical("logreg_solver", ['newton-cg', 'lbfgs', 'sag', 'saga'])
      logreg_fit_intercept = trial.suggest_categorical("logreg_intercept", [False, True])
      logreg_penalty = trial.suggest_categorical("logreg_penalty", ['l1', 'l2', 'elasticnet', 'none'])
      # Estimator
      classifier_obj = linear_model.LogisticRegression(
          C=logreg_c,
          solver=logreg_solver,
          fit_intercept=logreg_fit_intercept,
          penalty=logreg_penalty)
    elif classifier_name == "SVM":
      # Search space
      svm_c = trial.suggest_float("svm_c", 1e-1, 1e1)
      svm_kernel = trial.suggest_categorical("svm_kernel", ['poly', 'linear', 'rbf', 'sigmoid'])
      svm_gamma = trial.suggest_categorical("svm_gamma", ['scale', 'auto'])
      svm_shrinking = trial.suggest_categorical("svm_shrinking", [True, False])
      svm_break_ties = trial.suggest_categorical("svm_break_ties", [True, False])
      svm_decision_function_shape = trial.suggest_categorical("svm_decision_function_shape", ['ovo', 'ovr'])
      # Estimator
      classifier_obj = svm.SVC(
            C=svm_c, 
            kernel=svm_kernel,
            gamma=svm_gamma,
            shrinking=svm_shrinking,
            break_ties=svm_break_ties,
            decision_function_shape=svm_decision_function_shape)
    else:
      # Search space
      mnb_alpha = trial.suggest_float("mnb_alpha", 1e-1, 1e1)
      mnb_fit_prior = trial.suggest_categorical("mnb_fit_prior", [True, False])
      # Estimator
      classifier_obj = naive_bayes.MultinomialNB(
          alpha=mnb_alpha,
          fit_prior=mnb_fit_prior)

    # Step 3: Scoring method:
    score = model_selection.cross_val_score(
        classifier_obj, 
        X,
        y,
        n_jobs=-1,
        cv=5,
        scoring=make_scorer(balanced_accuracy_score))
    balanced_accuracy = score.mean()
    return balanced_accuracy

study = optuna.create_study(direction="maximize")

study.optimize(objective, n_trials=50)


best_trial = study.best_trial

print(best_trial.params )
print(best_trial.value)

[32m[I 2022-12-10 22:26:36,728][0m A new study created in memory with name: no-name-d05652a5-3d6b-4bc0-97ec-1c428e0e4039[0m
[32m[I 2022-12-10 22:26:42,105][0m Trial 0 finished with value: 0.9630000000000001 and parameters: {'classifier': 'LogReg', 'logreg_c': 19.181756661149585, 'logreg_solver': 'newton-cg', 'logreg_intercept': False, 'logreg_penalty': 'none'}. Best is trial 0 with value: 0.9630000000000001.[0m
[32m[I 2022-12-10 22:27:18,302][0m Trial 1 finished with value: 0.9713333333333335 and parameters: {'classifier': 'LogReg', 'logreg_c': 4250632642.9724655, 'logreg_solver': 'sag', 'logreg_intercept': False, 'logreg_penalty': 'l2'}. Best is trial 1 with value: 0.9713333333333335.[0m
[32m[I 2022-12-10 22:27:18,583][0m Trial 2 finished with value: 0.7134253246753247 and parameters: {'classifier': 'MNB', 'mnb_alpha': 4.9026558464835555, 'mnb_fit_prior': False}. Best is trial 1 with value: 0.9713333333333335.[0m
[32m[I 2022-12-10 22:27:21,164][0m Trial 3 finished with v

In [None]:
from optuna.visualization import plot_optimization_history

plot_optimization_history(study).show()

