### 0. Libraries and utilities

In [1]:
import re
import unicodedata
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ccsar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = text.split()

    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the tokens back into a single string
    text = ' '.join(tokens)

    return text

# Limpiamos el texto
preprocess_text('What can I say, I love this place')

'say love place'

### 1. Preprocessing

In [3]:
data_path = r'Data\Gungor_2018_VictorianAuthorAttribution_data-train.csv'
df = pd.read_csv(data_path, encoding='latin-1')
print(f'Shape of dataframe: {df.shape}')
df.head(5)

Shape of dataframe: (53678, 2)


Unnamed: 0,text,author
0,ou have time to listen i will give you the ent...,1
1,wish for solitude he was twenty years of age a...,1
2,and the skirt blew in perfect freedom about th...,1
3,of san and the rows of shops opposite impresse...,1
4,an hour s walk was as tiresome as three in a s...,1


In [4]:
avg_chr = np.mean(df['text'].apply(len))
f'Cantidad promedio de caracteres por texto: {avg_chr:,.2f}'

'Cantidad promedio de caracteres por texto: 4,942.97'

In [5]:
avg_chr = np.mean(df['text'].apply(lambda x: len(x.split(' '))))
f'Cantidad promedio de palabras por texto: {avg_chr:,.2f}'

'Cantidad promedio de palabras por texto: 1,001.00'

In [26]:
## Se identifica un desbalance de clases:
df['author'].value_counts()

author
8     6914
26    4441
14    2696
37    2387
45    2312
21    2307
39    2266
48    1825
33    1742
19    1543
4     1483
15    1460
43    1266
38    1163
25    1159
9     1108
18    1078
42    1022
30     972
50     914
1      912
41     911
28     823
10     755
32     703
36     693
17     660
35     659
29     645
12     627
46     605
20     587
22     495
13     485
44     468
23     455
34     453
40     430
6      407
11     383
2      382
24     380
27     306
3      213
16     183
Name: count, dtype: int64

In [6]:
## Getting sample fo 
df_sample = df.sample(5000)
n_authors = df_sample['author'].nunique()
print(f'Authors in df_sample: {n_authors}')
df_sample.head(5)

Authors in df_sample: 45


Unnamed: 0,text,author
26696,and is i really believe truly amiable be in fr...,26
47638,me what a very fitting coat â see how it over ...,44
37703,his soul and the movements of his arm indicate...,36
27660,perfection sir addressing herself then to dr g...,26
13606,in made he had worn his black hair long and hi...,13


### 2. Define baseline with TF-IDF

In [10]:
# Importamos librerias
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

X = df['text']
y = df['author']

# Dividimos los datos en entrenamiento y testeo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Creacion del pipeline
model = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(max_features=1000, preprocessor=preprocess_text)),
    ('oversample', RandomOverSampler()),
    ("logit", LogisticRegression(multi_class='multinomial', solver='lbfgs'))
])

In [22]:
# Ajustamos el modelo
model.fit(X_train, y_train)



In [23]:
# Medimos el accuracy del modelo
model.score(X_test, y_test)

0.793

In [24]:
# Hacer predicciones
y_pred = model.predict(X_test)

In [25]:
# Evaluar el modelo
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.93      0.93      0.93        28
           2       0.60      0.50      0.55         6
           3       0.60      0.60      0.60         5
           4       0.48      0.62      0.55        24
           6       0.33      0.14      0.20         7
           8       0.92      0.82      0.87       131
           9       0.85      0.73      0.79        15
          10       0.84      0.80      0.82        20
          11       0.75      1.00      0.86         3
          12       0.67      0.67      0.67         6
          13       0.70      0.88      0.78         8
          14       0.83      0.71      0.77        63
          15       0.78      0.83      0.81        35
          16       0.00      0.00      0.00         1
          17       0.33      0.30      0.32        10
          18       0.82      0.70      0.76        20
          19       0.76      0.69      0.72        32
          20       0.82    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import GridSearchCV

# Definimos los parámetros a explorar
param_grid = {
    'tfidf__max_features': [500, 1000, 2000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'logit__C': [0.1, 1, 10],
    'logit__penalty': ['l1', 'l2']
}

# Creamos el objeto GridSearchCV
grid_search = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=5,
                          verbose=2)

# Ajustamos el modelo a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Obtenemos los mejores parámetros
best_params = grid_search.best_params_
print(best_params)