## 0. Libraries and utilities

In [None]:
import re
import unicodedata
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [None]:
# Lazily-filled cache: the stopword corpus is read once, on first use, instead of
# once per document (TfidfVectorizer calls the preprocessor for every text).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise a raw text so it can be fed to a bag-of-words vectorizer.

    Steps, in order: lowercase, strip accents / non-ASCII characters, drop
    HTML-like tags, drop punctuation, drop digits, split on whitespace,
    remove English stopwords and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove accents: decompose characters, then drop everything that is not ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags (non-greedy; '.' does not match newlines, so a tag must sit on one line)
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation (\w keeps letters, digits and underscores)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize on whitespace and drop English stopwords. The token is lowercased
    # again for the lookup because NFKD can introduce uppercase ASCII letters.
    stop_words = _english_stop_words()
    tokens = [token for token in text.split() if token.lower() not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(tokens)

# Clean a sample sentence to check the preprocessing
preprocess_text('What can I say, I love this place')

## 1. Preprocessing

In [None]:
# data_path = r'Data\Gungor_2018_VictorianAuthorAttribution_data-train.csv'
# df = pd.read_csv(data_path, encoding='latin-1')

## Remote copy of the dataset in the GitHub repo (kept for reference, not loaded)
remote_url = 'https://raw.githubusercontent.com/ccsarmientot/text_author_classifier/master/datasets/sample_victorian.parquet'
## Local copy of the dataset; this is the path that is actually read
url = 'datasets/sample_victorian.parquet'
df = pd.read_parquet(url)

## Work on a small subsample to keep the experiments fast (comment out to use
## every row). The seed makes the subsample identical on every run, and the
## min() avoids an error when the file holds fewer than 500 rows.
df = df.sample(n=min(500, len(df)), random_state=42)

print(f'Shape of dataframe: {df.shape}')
df.head(5)

In [None]:
# Mean length of the texts, measured in characters
avg_chr = df['text'].map(len).mean()
f'Cantidad promedio de caracteres por texto: {avg_chr:,.2f}'

In [None]:
# Mean length of the texts, measured in words. split() without an argument
# splits on any run of whitespace, so repeated spaces do not count as empty
# "words" and newlines/tabs also separate words.
avg_words = np.mean(df['text'].apply(lambda x: len(x.split())))
f'Cantidad promedio de palabras por texto: {avg_words:,.2f}'

In [None]:
## Number of texts per author; a class imbalance is observed here:
df['author'].value_counts()

In [7]:
# ## Getting a sample of the data
# df_sample = df.sample(10_000)
# n_authors = df_sample['author'].nunique()
# print(f'Authors in df_sample: {n_authors}')
# df_sample.head(5)

## 2. Modelling

In [8]:
# Import libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# NOTE: this import rebinds the name `Pipeline`, shadowing the scikit-learn class
# imported above; every `Pipeline(...)` below is therefore an imblearn pipeline.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# Accumulator for the metric rows appended by the "Save metrics" cells
results = list()

# Inputs are the raw texts, targets are the author labels
X, y = df['text'], df['author']

# Hold out 20% of the rows for testing, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### 2.1 Logistic Regression

In [9]:
# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("logit", LogisticRegression())
])

#### 2.1.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    logit__C=[0.1, 1, 10],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.1.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'logit__C': [9, 10, 11],
# }

# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.1.3 Modelo final

In [12]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("logit", LogisticRegression(C=0.1))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [16]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.1.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'Logistic_regresion'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.2 Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("naive", MultinomialNB())
])

#### 2.2.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    naive__alpha=[0.1, 1, 10],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.2.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'naive__alpha': [0.1, 1, 10],
# }

# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.2.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("naive", MultinomialNB(alpha=0.1))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.2.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'NaiveBayes'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.3 KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("kneig", KNeighborsClassifier())
])

#### 2.3.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    kneig__n_neighbors=[3, 5],
    kneig__metric=['euclidean', 'manhattan', 'cosine'],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.3.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'kneig__n_neighbors': [3, 5],
#     'kneig__metric': ['euclidean', 'manhattan', 'cosine']
# }


# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.3.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("kneig", KNeighborsClassifier(n_neighbors=3, metric='euclidean'))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.3.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'K-neighbors'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.4 Arboles de decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier (seeded so the fitted tree is reproducible)
    ("tree", DecisionTreeClassifier(random_state=42))
])

#### 2.4.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    tree__criterion=['gini', 'entropy'],
    tree__max_depth=[5, 10],
    tree__min_samples_split=[5, 10],
    tree__min_samples_leaf=[1, 3],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.4.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'tree__criterion': ['gini', 'entropy'],
#     'tree__max_depth': [5, 10],
#     'tree__min_samples_split': [5, 10],
#     'tree__min_samples_leaf': [1, 3]
# }



# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.4.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier (seeded so the fitted tree is reproducible)
    ("tree", DecisionTreeClassifier(max_depth=5, min_samples_split=5,
                                    min_samples_leaf=3, random_state=42))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.4.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'Decision Trees'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.5 Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("SVC", SVC())
])

#### 2.5.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    SVC__C=[0.1, 1, 10, 100],
    SVC__kernel=['linear', 'rbf'],
    SVC__gamma=[0.1, 1, 10],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.5.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'SVC__C': [0.1, 1, 10, 100],
#     'SVC__kernel': ['linear', 'rbf'],
#     'SVC__gamma': [0.1, 1, 10]
# }


# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.5.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier
    ("SVC", SVC(C=0.1, kernel='linear', gamma=0.1))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.5.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'SVC'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.6 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier (seeded so the fitted forest is reproducible)
    ("rforest", RandomForestClassifier(random_state=42))
])

#### 2.6.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    rforest__n_estimators=[100, 200],
    rforest__max_depth=[5, 10],
    rforest__min_samples_split=[2, 5],
    rforest__min_samples_leaf=[2, 4],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.6.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'rforest__n_estimators': [100, 200],
#     'rforest__max_depth': [5, 10],
#     'rforest__min_samples_split': [2, 5],
#     'rforest__min_samples_leaf': [2, 4]
# }


# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.6.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier (seeded so the fitted forest is reproducible)
    ("rforest", RandomForestClassifier(n_estimators=100, max_depth=5,
                                       min_samples_split=2, min_samples_leaf=2,
                                       random_state=42))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.6.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'Random Forest'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.7 XGBoost

In [None]:
import xgboost as xgb

# Pipeline for the initial model
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1 — confirm that `y_train` satisfies this or encode it first.
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier: `xgb` is the module, so the estimator class must be
    ## instantiated explicitly (calling the module raises TypeError)
    ("XGB", xgb.XGBClassifier(random_state=42))
])

#### 2.7.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    XGB__max_depth=[3, 5],
    XGB__learning_rate=[0.1, 0.3],
    XGB__subsample=[0.8, 1.0],
    XGB__colsample_bytree=[0.8, 1.0],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.7.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'XGB__max_depth': [3, 5],
#     'XGB__learning_rate': [0.1, 0.3],
#     'XGB__subsample': [0.8, 1.0],
#     'XGB__colsample_bytree': [0.8, 1.0]
# }

# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.7.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
## NOTE(review): recent xgboost releases expect class labels encoded as
## 0..n_classes-1 — confirm that `y_train` satisfies this or encode it first.
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier: `xgb` is the module, so the estimator class is instantiated
    ## explicitly; colsample_bytree is 0.8 (the original `0-8` evaluated to -8)
    ("XGB", xgb.XGBClassifier(max_depth=3, learning_rate=0.1, subsample=0.8,
                              colsample_bytree=0.8, random_state=42))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.7.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'XGBoost'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

### 2.8 MLP Classifier (Multi-Layer Perceptron)

In [None]:
from sklearn.neural_network import MLPClassifier

# Pipeline for the initial model
model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier (seeded so weight initialisation and shuffling are reproducible)
    ("MLP", MLPClassifier(random_state=42))
])

#### 2.8.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values, keyed as <pipeline step>__<parameter>
param_grid = dict(
    tfidf__max_features=[1000],
    MLP__hidden_layer_sizes=[(100, 50), (50, 50)],
    MLP__activation=['relu', 'tanh'],
    MLP__solver=['adam', 'sgd'],
    MLP__alpha=[0.001, 0.01],
    MLP__learning_rate=['constant', 'adaptive'],
)

# Exhaustive search over the grid with cv=5
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=2)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the best combination found
best_params = grid_search.best_params_
print(best_params)

#### 2.8.2 Segunda iteración grid search

In [None]:
# # Definimos los parámetros a explorar
# param_grid = {
#     'tfidf__max_features': [1000],
#     'MLP__hidden_layer_sizes': [(100, 50), (50, 50)],
#     'MLP__activation': ['relu', 'tanh'],
#     'MLP__solver': ['adam', 'sgd'],
#     'MLP__alpha': [0.001, 0.01],
#     'MLP__learning_rate': ['constant', 'adaptive']
# }


# # Creamos el objeto GridSearchCV
# grid_search = GridSearchCV(estimator=model,
#                           param_grid=param_grid,
#                           cv=5,
#                           verbose=2)

# # Ajustamos el modelo a los datos de entrenamiento
# grid_search.fit(X_train, y_train)

# # Obtenemos los mejores parámetros
# best_params = grid_search.best_params_
# print(best_params)

#### 2.8.3 Modelo final

In [None]:
# Final pipeline
## NOTE(review): max_features=500 here vs. 1000 in the grid search — confirm this is intended
final_model = Pipeline(steps=[
    ## Feature extraction: TF-IDF over the text cleaned by preprocess_text
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## The classes are imbalanced, so they are randomly oversampled; seeded with
    ## the same value as the train/test split so re-runs give the same resample
    ('oversample', RandomOverSampler(random_state=42)),
    ## Classifier: the learning-rate schedule is spelled 'adaptive' (as in the
    ## grid search); 'adaptative' is not an accepted value
    ("MLP", MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu',
                          solver='sgd', alpha=0.01, learning_rate='adaptive',
                          random_state=42))
])

In [None]:
# Fit the final model on the training split
final_model.fit(X_train, y_train)

In [None]:
# Measure the model's accuracy on the test split
accuracy = final_model.score(X_test, y_test)
accuracy

In [None]:
# Predict the classes for the test split
y_pred = final_model.predict(X_test)

#### 2.8.4 Save metrics

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Weighted-average precision / recall / F1 on the test split. zero_division=0
# scores classes that are never predicted as 0 without emitting a warning.
p, r, f1, s = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0)
model_name = 'MLP'

# Drop any earlier row for this model so re-running the cell does not duplicate it
results[:] = [row for row in results if row[0] != model_name]
results.append([model_name, accuracy, p, r, f1])

## 3. Results

In [None]:
# Comparison table: one row per model, in the order the metrics were saved
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score']
res_df = pd.DataFrame(results, columns=metric_columns)
res_df