### 0. Libraries and utilities

In [None]:
import re
import unicodedata
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = text.split()

    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

# Limpiamos el texto
preprocess_text('What can I say, I love this place')

'say love place'

### 1. Preprocessing

In [None]:
# data_path = r'Data\Gungor_2018_VictorianAuthorAttribution_data-train.csv'
# df = pd.read_csv(data_path, encoding='latin-1')

## URL from github repo, load as dataframe
url = 'https://raw.githubusercontent.com/ccsarmientot/text_author_classifier/master/datasets/sample_victorian.parquet'
# url = 'datasets/sample_victorian.parquet'
df = pd.read_parquet(url)

print(f'Shape of dataframe: {df.shape}')
df.head(5)

Shape of dataframe: (10000, 2)


Unnamed: 0,text,author
28172,now when nobody else was to be had and no high...,26
4098,said to me john you was always honorable and i...,8
21493,not see the lady s face until the marriage day...,20
16864,so you have come at last yes here i am and how...,15
2727,night what jack you be a soldier yes if you th...,4


In [None]:
avg_chr = np.mean(df['text'].apply(len))
f'Cantidad promedio de caracteres por texto: {avg_chr:,.2f}'

'Cantidad promedio de caracteres por texto: 4,945.23'

In [None]:
avg_chr = np.mean(df['text'].apply(lambda x: len(x.split(' '))))
f'Cantidad promedio de palabras por texto: {avg_chr:,.2f}'

'Cantidad promedio de palabras por texto: 1,001.00'

In [None]:
## Se identifica un desbalance de clases:
df['author'].value_counts()

Unnamed: 0_level_0,count
author,Unnamed: 1_level_1
8,1336
26,869
14,501
21,448
37,441
39,434
45,424
33,347
48,342
19,330


In [None]:
# ## Getting sample fo
# df_sample = df.sample(10_000)
# n_authors = df_sample['author'].nunique()
# print(f'Authors in df_sample: {n_authors}')
# df_sample.head(5)

## 2. Modelling

In [None]:
# Importamos librerias
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV

results = []
X = df['text']
y = df['author']

# Dividimos los datos en entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## Se aplica el preprocesameinto

# 1. Feature Extraction
tfidf = TfidfVectorizer(max_features=1000, preprocessor=preprocess_text)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

# 2. Oversampling
oversample = RandomOverSampler()
X_train, y_train = oversample.fit_resample(X_train, y_train)

### 2.0 Define function to iterate over

In [None]:
def train_cv_models(model_name:str, classifier,
                    param_grid:dict[list],
                    X_train, y_train) -> dict:

    # Creacion del pipeline del modelo inicial
    model = Pipeline(steps=[
        ## Se aplica el modelo
        (model_name, classifier)
    ])

    ######################## PRIMERA BÚSQUEDA DE PARÁMETROS ####################
    print(' Primera búsqueda de parámetros '.center(80, '#'))

    # Creamos el objeto GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, verbose=2)

    # Ajustamos el modelo a los datos de entrenamiento
    grid_search.fit(X_train, y_train)

    # Obtenemos los mejores parámetros
    best_params = grid_search.best_params_
    print(f'Resultados primera búsqueda: {best_params}', '\n')

    ######################## SEGUNDA BÚSQUEDA DE PARÁMETROS ####################
    print(' Segunda búsqueda de parámetros '.center(80, '#'), '\n')


    # A los parámetros que deben ser enteros se resta y suma 1
    int_list = ['min_samples_split', 'max_depth', 'n_neighbors', 'min_samples_leaf']
    int_grid = {k:[v-1, v, v+1] for k,v in best_params.items() if any([i in k for i in int_list])}
    ## Clean zero values
    int_grid = {k:[c for c in v if c != 0] for k,v in int_grid.items()}

    # A los parámetros numéricos encontrados se resta y suma el 10 %
    int_params_grid = {k:[v-(v/10), v, v+(v/10)] for k,v in best_params.items() if isinstance(v, (float, int))}

    # A los parámetros en formato string encontrados se deja el mejor
    str_params_grid = {k:[v] for k,v in best_params.items() if not isinstance(v, list)}

    best_params_grid = {**str_params_grid, **int_params_grid, **int_grid}
    # best_params_grid['tfidf__max_features'] = [1000]

    print('DEBUG: params after transform: ', best_params_grid)

    # Creamos el objeto GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=best_params_grid, cv=2, verbose=2)

    # Ajustamos el modelo a los datos de entrenamiento
    grid_search.fit(X_train, y_train)

    # Obtenemos los mejores parámetros
    best_params = grid_search.best_params_
    print(f'Resultados segunda búsqueda: {best_params}', '\n')

    return best_params


def train_final_model(model_name, classifier, X_train, y_train) -> list:

    print(' Creando modelo final con los mejores parámetros '.center(80, '#'))

    # Creacion del pipeline modelo final
    final_model = Pipeline(steps=[
        ## Se aplica el modelo
        (model_name, classifier)
    ])

    # Ajuste del modelo
    final_model.fit(X_train, y_train)

    # Medimos el accuracy del modelo
    accuracy = final_model.score(X_test, y_test)
    print(f'Accuracy de {model_name}: {accuracy:,.2%}')

    # Se predicen las clases para test
    y_pred = final_model.predict(X_test)

    p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return [model_name, accuracy, p, r, f1]


### 2.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Definimos el modelo a usar
classifier = LogisticRegression()
model_name = 'LogisticRegression'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__C': [0.1, 1, 10],
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = LogisticRegression(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..........................LogisticRegression__C=0.1; total time=   8.6s
[CV] END ..........................LogisticRegression__C=0.1; total time=   7.9s
[CV] END ............................LogisticRegression__C=1; total time=  18.4s
[CV] END ............................LogisticRegression__C=1; total time=  22.0s
[CV] END ...........................LogisticRegression__C=10; total time=  29.9s
[CV] END ...........................LogisticRegression__C=10; total time=  27.2s
Resultados primera búsqueda: {'LogisticRegression__C': 10} 

######################## Segunda búsqueda de parámetros ######################## 

DEBUG: params after transform:  {'LogisticRegression__C': [9.0, 10, 11.0]}
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..........................LogisticRegression__C=9.0; total time=  30.0s
[CV] END ......

### 2.2 Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Definimos el modelo a usar
classifier = MultinomialNB()
model_name = 'NaiveBayes'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__alpha': [0.1, 1, 10],
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = MultinomialNB(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..............................NaiveBayes__alpha=0.1; total time=   0.3s
[CV] END ..............................NaiveBayes__alpha=0.1; total time=   0.3s
[CV] END ................................NaiveBayes__alpha=1; total time=   0.3s
[CV] END ................................NaiveBayes__alpha=1; total time=   0.3s
[CV] END ...............................NaiveBayes__alpha=10; total time=   0.3s
[CV] END ...............................NaiveBayes__alpha=10; total time=   0.3s
Resultados primera búsqueda: {'NaiveBayes__alpha': 0.1} 

######################## Segunda búsqueda de parámetros ######################## 

DEBUG: params after transform:  {'NaiveBayes__alpha': [0.09000000000000001, 0.1, 0.11]}
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..............NaiveBayes__alpha=0.09000000000000001; total time=   0.3s
[CV] 

### 2.3 KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Definimos el modelo a usar
classifier = KNeighborsClassifier()
model_name = 'KNeighbors'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__n_neighbors': [3, 5],
    f'{model_name}__metric': ['euclidean', 'manhattan', 'cosine']
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}
best_classifier = KNeighborsClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=3; total time= 2.0min
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=3; total time= 1.9min
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=5; total time= 1.9min
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=5; total time= 1.9min
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=3; total time= 7.7min
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=3; total time= 7.6min
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=5; total time= 7.6min
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=5; total time= 7.6min
[CV] END KNeighbors__metric=cosine, KNeighbors__n_neighbors=3; total time= 2.0min
[CV] END KNeighbors__metric=cosine, KNeighbors__n_neighbors=3; total time= 1.9min
[CV] END KNeigh

### 2.4 Arboles de decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Definimos el modelo a usar
classifier = DecisionTreeClassifier()
model_name = 'DecisionTree'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__criterion': ['gini', 'entropy'],
    f'{model_name}__max_depth': [5, 10],
    f'{model_name}__min_samples_split': [5, 10],
    f'{model_name}__min_samples_leaf': [2, 3]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = DecisionTreeClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=2, DecisionTree__min_samples_split=5; total time=   3.0s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=2, DecisionTree__min_samples_split=5; total time=   4.1s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=2, DecisionTree__min_samples_split=10; total time=   3.3s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=2, DecisionTree__min_samples_split=10; total time=   3.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=3, DecisionTree__min_samples_split=5; total time=   3.0s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_sa

### 2.5 Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Definimos el modelo a usar
classifier = SVC()
model_name = 'SVC'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__C': [1, 10],
    f'{model_name}__kernel': ['linear', 'rbf'],
    f'{model_name}__gamma': [0.1, 1]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = SVC(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END .......SVC__C=1, SVC__gamma=0.1, SVC__kernel=linear; total time=13.9min
[CV] END .......SVC__C=1, SVC__gamma=0.1, SVC__kernel=linear; total time=14.7min
[CV] END ..........SVC__C=1, SVC__gamma=0.1, SVC__kernel=rbf; total time=32.6min
[CV] END ..........SVC__C=1, SVC__gamma=0.1, SVC__kernel=rbf; total time=33.7min
[CV] END .........SVC__C=1, SVC__gamma=1, SVC__kernel=linear; total time=13.7min
[CV] END .........SVC__C=1, SVC__gamma=1, SVC__kernel=linear; total time=14.8min
[CV] END ............SVC__C=1, SVC__gamma=1, SVC__kernel=rbf; total time=16.9min
[CV] END ............SVC__C=1, SVC__gamma=1, SVC__kernel=rbf; total time=18.1min
[CV] END ......SVC__C=10, SVC__gamma=0.1, SVC__kernel=linear; total time=10.2min
[CV] END ......SVC__C=10, SVC__gamma=0.1, SVC__kernel=linear; total time=11.1min
[CV] END .........SVC__C=10, SVC__gamma=0.1, SVC_

### 2.6 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Definimos el modelo a usar
classifier = RandomForestClassifier()
model_name = 'RandomForest'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__n_estimators': [100, 200],
    f'{model_name}__max_depth': [5, 10],
    f'{model_name}__min_samples_split': [2, 5],
    f'{model_name}__min_samples_leaf': [2, 4]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = RandomForestClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=100; total time=  10.0s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=100; total time=   9.8s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=200; total time=  18.5s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=200; total time=  19.2s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=5, RandomForest__n_estimators=100; total time=  10.0s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomFo

126 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/imblearn/pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  Fil

Resultados segunda búsqueda: {'RandomForest__max_depth': 11, 'RandomForest__min_samples_leaf': 1, 'RandomForest__min_samples_split': 2, 'RandomForest__n_estimators': 200} 

############### Creando modelo final con los mejores parámetros ################
Accuracy de RandomForest: 57.40%


### 2.7 MLP Classifier (Multi-Layer Perceptron)

In [None]:
from sklearn.neural_network import MLPClassifier

# Definimos el modelo a usar
classifier = MLPClassifier()
model_name = 'MLP'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__hidden_layer_sizes': [(100, 50), (50, 50)],
    f'{model_name}__activation': ['relu', 'tanh'],
    f'{model_name}__solver': ['adam', 'sgd'],
    f'{model_name}__alpha': [0.001, 0.01],
    f'{model_name}__learning_rate': ['constant', 'adaptive']
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = MLPClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

### 2.8 XGBoost

In [None]:
import xgboost as xgb

# Definimos el modelo a usar
classifier = xgb.XGBClassifier()
model_name = 'XGB'

# Definimos los parámetros a explorar
param_grid = {

    f'{model_name}__max_depth': [3, 5],
    f'{model_name}__learning_rate': [0.1, 0.3],
    f'{model_name}__subsample': [0.8, 1.0],
    f'{model_name}__colsample_bytree': [0.8, 1.0]
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = xgb.XGBClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__subsample=0.8; total time=   0.1s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__subsample=0.8; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__subsample=1.0; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=3, XGB__subsample=1.0; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=5, XGB__subsample=0.8; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=5, XGB__subsample=0.8; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB__learning_rate=0.1, XGB__max_depth=5, XGB__subsample=1.0; total time=   0.0s
[CV] END XGB__colsample_bytree=0.8, XGB

ValueError: 
All the 32 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/imblearn/pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44], got [ 1  2  3  4  6  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 48 50]


## 3. Results

In [None]:
res_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score'])
res_df