### 0. Libraries and utilities

In [1]:
import re
import unicodedata
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ccsar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = text.split()

    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

# Limpiamos el texto
preprocess_text('What can I say, I love this place')

'say love place'

### 1. Preprocessing

In [3]:
# data_path = r'Data\Gungor_2018_VictorianAuthorAttribution_data-train.csv'
# df = pd.read_csv(data_path, encoding='latin-1')

## URL from github repo, load as dataframe
url = 'https://raw.githubusercontent.com/ccsarmientot/text_author_classifier/master/datasets/sample_victorian.parquet'
url = 'datasets/sample_victorian.parquet'
df = pd.read_parquet(url)

print(f'Shape of dataframe: {df.shape}')
df.head(5)

Shape of dataframe: (10000, 2)


Unnamed: 0,text,author
28172,now when nobody else was to be had and no high...,26
4098,said to me john you was always honorable and i...,8
21493,not see the lady s face until the marriage day...,20
16864,so you have come at last yes here i am and how...,15
2727,night what jack you be a soldier yes if you th...,4


In [4]:
avg_chr = np.mean(df['text'].apply(len))
f'Cantidad promedio de caracteres por texto: {avg_chr:,.2f}'

'Cantidad promedio de caracteres por texto: 4,945.23'

In [5]:
avg_chr = np.mean(df['text'].apply(lambda x: len(x.split(' '))))
f'Cantidad promedio de palabras por texto: {avg_chr:,.2f}'

'Cantidad promedio de palabras por texto: 1,001.00'

In [6]:
## Se identifica un desbalance de clases:
df['author'].value_counts()

author
8     1336
26     869
14     501
21     448
37     441
39     434
45     424
33     347
48     342
19     330
15     244
4      240
43     228
9      213
38     209
25     207
18     200
30     182
42     178
1      174
50     162
41     159
32     139
10     138
28     132
17     128
36     120
35     118
12     112
44     110
20     110
46     107
13      92
29      89
34      85
22      85
24      82
23      80
40      78
11      66
27      63
6       63
2       62
3       41
16      32
Name: count, dtype: int64

In [7]:
## Getting sample fo 
df_sample = df.sample(10_000)
n_authors = df_sample['author'].nunique()
print(f'Authors in df_sample: {n_authors}')
df_sample.head(5)

Authors in df_sample: 45


Unnamed: 0,text,author
10249,upon the fire and her chin upon her hand do yo...,8
24090,the peace were sent to sweep them back â to se...,21
15166,the past tense said at last yet i am rather en...,14
21295,often difficult to tell what the architecture ...,20
36697,surrender its com of arc seeing that she had n...,34


## 2. Modelling

In [8]:
# Importamos librerias
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

X = df['text']
y = df['author']

# Dividimos los datos en entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2.1 Logistic Regression

In [9]:
# Creacion del pipeline del modelo inicial
model = Pipeline(steps=[
    ## Selección de los features
    ("tfidf", TfidfVectorizer(preprocessor=preprocess_text)),
    ## Se tiene un desbalance de clases, entonces se hace un oversampling
    ('oversample', RandomOverSampler()),
    ## Se aplica el modelo
    ("logit", LogisticRegression())
])

#### 2.1.1 Primera iteración grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Definimos los parámetros a explorar
param_grid = {
    'tfidf__max_features': [500, 1000],
    'logit__C': [0.1, 1, 10],
}

# Creamos el objeto GridSearchCV
grid_search = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=5,
                          verbose=2)

# Ajustamos el modelo a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Obtenemos los mejores parámetros
best_params = grid_search.best_params_
print(best_params)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV] END ..............logit__C=0.1, tfidf__max_features=500; total time=  56.9s




#### 2.1.2 Segunda iteración grid search

In [13]:
# Definimos los parámetros a explorar
param_grid = {
    'tfidf__max_features': [500, 1000],
    'logit__C': [0.1, 1, 10],
}

# Creamos el objeto GridSearchCV
grid_search = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=5,
                          verbose=2)

# Ajustamos el modelo a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Obtenemos los mejores parámetros
best_params = grid_search.best_params_
print(best_params)

#### 2.1.3 Modelo final

In [None]:
# Creacion del pipeline
final_model = Pipeline(steps=[
    ## Selección de los features
    ("tfidf", TfidfVectorizer(max_features=500, preprocessor=preprocess_text)),
    ## Se tiene un desbalance de clases, entonces se hace un oversampling
    ('oversample', RandomOverSampler()),
    ## Se aplica el modelo
    ("logit", LogisticRegression(C=0.1))
])

In [15]:
# Ajustamos el modelo
final_model.fit(X_train, y_train)

In [None]:
# Medimos el accuracy del modelo
final_model.score(X_test, y_test)