### 0. Libraries and utilities

In [1]:
import re
import unicodedata
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ccsar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = text.split()

    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

# Limpiamos el texto
preprocess_text('What can I say, I love this place')

'say love place'

### 1. Preprocessing

In [3]:
# data_path = r'Data\Gungor_2018_VictorianAuthorAttribution_data-train.csv'
# df = pd.read_csv(data_path, encoding='latin-1')

## URL from github repo, load as dataframe
url = 'https://raw.githubusercontent.com/ccsarmientot/text_author_classifier/master/datasets/sample_victorian.parquet'
url = 'datasets/sample_victorian.parquet'
df = pd.read_parquet(url)

print(f'Shape of dataframe: {df.shape}')
df.head(5)

Shape of dataframe: (500, 2)


Unnamed: 0,text,author
49339,into her simultaneously from the the and the u...,45
45580,his heels holding a pan over a camp fire he lo...,42
51146,which has the atlantic still it has exhibited ...,48
37861,l of the region to something more from them in...,36
193,the train was starting wrung his companion s h...,1


In [4]:
avg_chr = np.mean(df['text'].apply(len))
f'Cantidad promedio de caracteres por texto: {avg_chr:,.2f}'

'Cantidad promedio de caracteres por texto: 4,963.65'

In [5]:
avg_chr = np.mean(df['text'].apply(lambda x: len(x.split(' '))))
f'Cantidad promedio de palabras por texto: {avg_chr:,.2f}'

'Cantidad promedio de palabras por texto: 1,001.00'

In [6]:
## Se identifica un desbalance de clases:
df['author'].value_counts()

author
8     73
26    37
14    27
37    25
39    24
45    24
21    20
33    20
9     14
41    14
19    13
15    13
32    12
38    12
48    12
25    11
43    10
30     9
4      9
46     9
10     9
35     8
50     8
44     8
1      8
20     7
18     7
17     6
36     6
29     5
42     5
28     5
24     4
12     4
3      3
2      3
22     3
11     2
27     2
23     2
13     2
34     2
6      2
40     1
Name: count, dtype: int64

In [7]:
# ## Getting sample fo 
# df_sample = df.sample(10_000)
# n_authors = df_sample['author'].nunique()
# print(f'Authors in df_sample: {n_authors}')
# df_sample.head(5)

## 2. Modelling

In [8]:
# Importamos librerias
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV

results = []
X = df['text']
y = df['author']

# Dividimos los datos en entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
## Se aplica el preprocesameinto

# 1. Feature Extraction
tfidf = TfidfVectorizer(max_features=1000, preprocessor=preprocess_text)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

# 2. Oversampling
oversample = RandomOverSampler()
X_train, y_train = oversample.fit_resample(X_train, y_train)

### 2.0 Define function to iterate over

In [10]:
def train_cv_models(model_name:str, classifier, 
                    param_grid:dict[list], 
                    X_train, y_train) -> dict:

    # Creacion del pipeline del modelo inicial
    model = Pipeline(steps=[
        ## Se aplica el modelo
        (model_name, classifier)
    ])

    ######################## PRIMERA BÚSQUEDA DE PARÁMETROS ####################
    print(' Primera búsqueda de parámetros '.center(80, '#'))

    # Creamos el objeto GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, verbose=2)

    # Ajustamos el modelo a los datos de entrenamiento
    grid_search.fit(X_train, y_train)

    # Obtenemos los mejores parámetros
    best_params = grid_search.best_params_
    print(f'Resultados primera búsqueda: {best_params}', '\n')

    ######################## SEGUNDA BÚSQUEDA DE PARÁMETROS ####################
    print(' Segunda búsqueda de parámetros '.center(80, '#'), '\n')
    

    # A los parámetros que deben ser enteros se resta y suma 1
    int_list = ['min_samples_split', 'max_depth', 'n_neighbors', 'min_samples_leaf']
    int_grid = {k:[v-1, v, v+1] for k,v in best_params.items() if any([i in k for i in int_list])}
    ## Clean zero values
    int_grid = {k:[c for c in v if c != 0] for k,v in int_grid.items()}

    # A los parámetros numéricos encontrados se resta y suma el 10 %
    int_params_grid = {k:[v-(v/10), v, v+(v/10)] for k,v in best_params.items() if isinstance(v, (float, int))}
    
    # A los parámetros en formato string encontrados se deja el mejor
    str_params_grid = {k:[v] for k,v in best_params.items() if not isinstance(v, list)}

    best_params_grid = {**str_params_grid, **int_params_grid, **int_grid}
    # best_params_grid['tfidf__max_features'] = [1000]

    print('DEBUG: params after transform: ', best_params_grid)

    # Creamos el objeto GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=best_params_grid, cv=2, verbose=2)

    # Ajustamos el modelo a los datos de entrenamiento
    grid_search.fit(X_train, y_train)

    # Obtenemos los mejores parámetros
    best_params = grid_search.best_params_
    print(f'Resultados segunda búsqueda: {best_params}', '\n')

    return best_params


def train_final_model(model_name, classifier, X_train, y_train) -> list:

    print(' Creando modelo final con los mejores parámetros '.center(80, '#'))
    
    # Creacion del pipeline modelo final
    final_model = Pipeline(steps=[
        ## Se aplica el modelo
        (model_name, classifier)
    ])

    # Ajuste del modelo
    final_model.fit(X_train, y_train)

    # Medimos el accuracy del modelo
    accuracy = final_model.score(X_test, y_test)
    print(f'Accuracy de {model_name}: {accuracy:,.2%}')

    # Se predicen las clases para test
    y_pred = final_model.predict(X_test)

    p, r, f1, s = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return [model_name, accuracy, p, r, f1]


### 2.1 Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

# Definimos el modelo a usar
classifier = LogisticRegression()
model_name = 'LogisticRegression'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__C': [0.1, 1, 10],
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = LogisticRegression(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..........................LogisticRegression__C=0.1; total time=   0.7s
[CV] END ..........................LogisticRegression__C=0.1; total time=   0.3s
[CV] END ............................LogisticRegression__C=1; total time=   3.6s
[CV] END ............................LogisticRegression__C=1; total time=   2.0s
[CV] END ...........................LogisticRegression__C=10; total time=   3.2s
[CV] END ...........................LogisticRegression__C=10; total time=   3.2s
Resultados primera búsqueda: {'LogisticRegression__C': 10} 

######################## Segunda búsqueda de parámetros ######################## 

DEBUG: params after transform:  {'LogisticRegression__C': [9.0, 10, 11.0]}
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..........................LogisticRegression__C=9.0; total time=   1.1s
[CV] END ......

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.2 Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Definimos el modelo a usar
classifier = MultinomialNB()
model_name = 'NaiveBayes'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__alpha': [0.1, 1, 10],
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = MultinomialNB(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..............................NaiveBayes__alpha=0.1; total time=   0.0s
[CV] END ..............................NaiveBayes__alpha=0.1; total time=   0.0s
[CV] END ................................NaiveBayes__alpha=1; total time=   0.0s
[CV] END ................................NaiveBayes__alpha=1; total time=   0.0s
[CV] END ...............................NaiveBayes__alpha=10; total time=   0.0s
[CV] END ...............................NaiveBayes__alpha=10; total time=   0.0s
Resultados primera búsqueda: {'NaiveBayes__alpha': 0.1} 

######################## Segunda búsqueda de parámetros ######################## 

DEBUG: params after transform:  {'NaiveBayes__alpha': [0.09000000000000001, 0.1, 0.11]}
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ..............NaiveBayes__alpha=0.09000000000000001; total time=   0.0s
[CV] 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.3 KNeighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Definimos el modelo a usar
classifier = KNeighborsClassifier()
model_name = 'KNeighbors'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__n_neighbors': [3, 5],
    f'{model_name}__metric': ['euclidean', 'manhattan', 'cosine']
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}
best_classifier = KNeighborsClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=3; total time=   0.5s
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=3; total time=   0.5s
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=5; total time=   0.6s
[CV] END KNeighbors__metric=euclidean, KNeighbors__n_neighbors=5; total time=   0.6s
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=3; total time=   1.1s
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=3; total time=   0.7s
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=5; total time=   0.6s
[CV] END KNeighbors__metric=manhattan, KNeighbors__n_neighbors=5; total time=   0.6s
[CV] END KNeighbors__metric=cosine, KNeighbors__n_neighbors=3; total time=   0.5s
[CV] END KNeighbors__metric=cosine, KNeighbors__n_neighbors=3; total time=   0.5s
[CV] END KNeigh

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.4 Arboles de decisión

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Definimos el modelo a usar
classifier = DecisionTreeClassifier()
model_name = 'DecisionTree'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__criterion': ['gini', 'entropy'],
    f'{model_name}__max_depth': [5, 10],
    f'{model_name}__min_samples_split': [5, 10],
    f'{model_name}__min_samples_leaf': [2, 3]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = DecisionTreeClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=1, DecisionTree__min_samples_split=5; total time=   0.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=1, DecisionTree__min_samples_split=5; total time=   0.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=1, DecisionTree__min_samples_split=10; total time=   0.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=1, DecisionTree__min_samples_split=10; total time=   0.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_samples_leaf=3, DecisionTree__min_samples_split=5; total time=   0.2s
[CV] END DecisionTree__criterion=gini, DecisionTree__max_depth=5, DecisionTree__min_sa

18 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ccsar\miniconda3\envs\env_nlp\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ccsar\miniconda3\envs\env_nlp\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ccsar\miniconda3\envs\env_nlp\Lib\site-packages\imblearn\pipeline.py", line 333, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "c:\Users\ccsar\mini

Resultados segunda búsqueda: {'DecisionTree__criterion': 'entropy', 'DecisionTree__max_depth': 11, 'DecisionTree__min_samples_leaf': 1, 'DecisionTree__min_samples_split': 4} 

############### Creando modelo final con los mejores parámetros ################
Accuracy de DecisionTree: 11.00%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.5 Support Vector Machine

In [15]:
from sklearn.svm import SVC

# Definimos el modelo a usar
classifier = SVC()
model_name = 'SVC'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__C': [1, 10],
    f'{model_name}__kernel': ['linear', 'rbf'],
    f'{model_name}__gamma': [0.1, 1]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = SVC(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END .......SVC__C=1, SVC__gamma=0.1, SVC__kernel=linear; total time=   6.5s
[CV] END .......SVC__C=1, SVC__gamma=0.1, SVC__kernel=linear; total time=   5.3s
[CV] END ..........SVC__C=1, SVC__gamma=0.1, SVC__kernel=rbf; total time=  12.3s
[CV] END ..........SVC__C=1, SVC__gamma=0.1, SVC__kernel=rbf; total time=  13.2s
[CV] END .........SVC__C=1, SVC__gamma=1, SVC__kernel=linear; total time=   5.9s
[CV] END .........SVC__C=1, SVC__gamma=1, SVC__kernel=linear; total time=   6.7s
[CV] END ............SVC__C=1, SVC__gamma=1, SVC__kernel=rbf; total time=   5.5s
[CV] END ............SVC__C=1, SVC__gamma=1, SVC__kernel=rbf; total time=   8.4s
[CV] END ......SVC__C=10, SVC__gamma=0.1, SVC__kernel=linear; total time=   3.5s
[CV] END ......SVC__C=10, SVC__gamma=0.1, SVC__kernel=linear; total time=   3.4s
[CV] END .........SVC__C=10, SVC__gamma=0.1, SVC_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2.6 Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier

# Definimos el modelo a usar
classifier = RandomForestClassifier()
model_name = 'RandomForest'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__n_estimators': [100, 200],
    f'{model_name}__max_depth': [5, 10],
    f'{model_name}__min_samples_split': [2, 5],
    f'{model_name}__min_samples_leaf': [2, 4]
}


best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = RandomForestClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

######################## Primera búsqueda de parámetros ########################
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=100; total time=   1.1s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=100; total time=   1.1s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=200; total time=   2.3s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=2, RandomForest__n_estimators=200; total time=   2.1s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomForest__min_samples_split=5, RandomForest__n_estimators=100; total time=   1.0s
[CV] END RandomForest__max_depth=5, RandomForest__min_samples_leaf=2, RandomFo

### 2.7 MLP Classifier (Multi-Layer Perceptron)

In [None]:
from sklearn.neural_network import MLPClassifier

# Definimos el modelo a usar
classifier = MLPClassifier()
model_name = 'MLP'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__hidden_layer_sizes': [(100, 50), (50, 50)],
    f'{model_name}__activation': ['relu', 'tanh'],
    f'{model_name}__solver': ['adam', 'sgd'],
    f'{model_name}__alpha': [0.001, 0.01],
    f'{model_name}__learning_rate': ['constant', 'adaptive']
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = MLPClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

### 2.8 XGBoost

In [60]:
import xgboost as xgb

# Definimos el modelo a usar
classifier = xgb.XGBClassifier()
model_name = 'XGB'

# Definimos los parámetros a explorar
param_grid = {
    
    f'{model_name}__max_depth': [3, 5],
    f'{model_name}__learning_rate': [0.1, 0.3],
    f'{model_name}__subsample': [0.8, 1.0],
    f'{model_name}__colsample_bytree': [0.8, 1.0]
}

best_params = train_cv_models(model_name, classifier, param_grid, X_train, y_train)
model_best_params = {k.split('__')[1]:v for k,v in best_params.items() if 'tfidf' not in k}

best_classifier = xgb.XGBClassifier(**model_best_params)
model_stats = train_final_model(model_name, best_classifier, X_train, y_train)
results.append(model_stats)

## 3. Results

In [None]:
res_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score'])
res_df