In [1]:
# Daniel Bandala @ nov 2022
import numpy as np
import nltk
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten

In [2]:
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [3]:
#Cargar datos
files = ["elaleph.txt","elevangeliosegunmarcos.txt","cartaaunasenoritaenparis.txt","casatomada.txt"] # "articulocientifico.txt"
texts = []
for fn in files:
    with open(fn) as f:
        texts.append(f.read())
all_text = ' '.join(texts)
print(files)

['elaleph.txt', 'elevangeliosegunmarcos.txt', 'cartaaunasenoritaenparis.txt', 'casatomada.txt']


In [4]:
def PredictAuthors(fvs):
    """
    Use k-means clustering to fit a model
    """
    km = KMeans(n_clusters=2, init='k-means++', n_init=10, verbose=0)
    km.fit(fvs)
    return km

In [5]:
def LexicalFeatures():
    """
    Compute feature vectors for word and punctuation features
    """
    num_texts = len(texts)
    fvs = np.zeros((len(texts), 9), np.float64)
    for e, ch_text in enumerate(texts):
        contMayus = 0
        contMinus = 0
        contNumeros = 0
        textoNormal = ch_text
        textoMinusculas = ch_text.lower()
        charTotales = len(textoNormal)
        texto = ch_text.lower()
        tokens = nltk.word_tokenize(texto)
        words = word_tokenizer.tokenize(texto)
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])
        for letra in textoNormal:
            if letra.isupper():
                contMayus = contMayus + 1
            else:
                contMinus = contMinus + 1
            if letra in ["1","2","3","4","5","6","7","8","9","0"]:
                contNumeros = contNumeros + 1
        # Número promedio de palabras por oración
        fvs[e, 0] = words_per_sentence.mean()
        # Variación del tamaño de las oraciones
        fvs[e, 1] = words_per_sentence.std()
        # Diversidad léxica
        fvs[e, 2] = len(vocab) / float(len(words))
        # Número de comas por oración
        fvs[e, 3] = tokens.count(',') / float(len(sentences))
        # Número de puntos y comas por oración
        fvs[e, 4] = tokens.count(';') / float(len(sentences))
        # Número de dos puntos por oración
        fvs[e, 5] = tokens.count(':') / float(len(sentences))
        # Proporción mayúsculas/total de caracteres
        fvs[e, 6] = float(contMayus/charTotales)
        # Proporción minúsculas/total de caracteres
        fvs[e, 7] = float(contMinus/charTotales)
        # Proporción de números respecto a letras
        fvs[e, 8] = float(contNumeros/charTotales)

    fvs = whiten(fvs)
    return fvs

In [6]:
classifications = []
feature_sets = []
for element in list(LexicalFeatures()):
    feature_sets.append(element)
feature_sets = np.array(feature_sets)

In [7]:
print(feature_sets)

[[  6.51711707   4.06379989  19.84778726   3.70450877   3.16848278
    2.99193192   5.6156501  306.66533696   2.6671317 ]
 [  4.60522708   1.5523925   20.11183412   1.78726301   1.55716034
    2.10969558   5.40041235 306.88057471   0.52763767]
 [  6.90950291   2.65061009  17.74780277   3.31483432   0.55455295
    0.69820364   3.23727713 309.04370993   0.        ]
 [  4.87904394   1.70369342  18.30923726   1.32462143   0.92585536
    0.60526095   3.9031455  308.37784156   0.94835694]]


In [8]:
result = PredictAuthors(feature_sets).labels_
print("Prediction: ", result)

Prediction:  [1 0 0 0]


#### Al eliminar el articulo cientifico de los documentos, la clasificación no se realiza de manera correcta, puesto que únicamente un solo documento lo esta clasificando con el autor Jorge Luis Borges. Por tanto, se agregan más características estilométricas para facilitar el trabajo del clasificador. Se agregan 6 características sintácticas y 10 características relacionadas a la bolsa de palabras de todos los documentos, resultando en un total de 25 características estilométricas.

In [9]:
def SyntacticFeatures():
    """
    Extract feature vector for part of speech frequencies
    """
    def token_to_pos(ch):
        tokens = nltk.word_tokenize(ch)
        return [p[1] for p in nltk.pos_tag(tokens)]

    chapters_pos = [token_to_pos(ch) for ch in texts]
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[ch.count(pos) for pos in pos_list]
                           for ch in chapters_pos]).astype(np.float64)

    # normalise by dividing each row by number of tokens in the chapter
    fvs_syntax /= np.c_[np.array([len(ch) for ch in chapters_pos])]

    return fvs_syntax

In [10]:
def BagOfWords():
    """
    Compute the bag of words feature vectors, based on the most common words
     in the whole book
    """
    # get most common words in the whole book
    NUM_TOP_WORDS = 10
    all_tokens = nltk.word_tokenize(all_text)
    fdist = nltk.FreqDist(all_tokens)    
    vocab = list(fdist.keys())[:NUM_TOP_WORDS]

    # use sklearn to create the bag for words feature vector for each chapter
    vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize)
    fvs_bow = vectorizer.fit_transform(texts).toarray().astype(np.float64)

    # normalise by dividing each row by its Euclidean norm
    fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]

    return fvs_bow

In [11]:
# get features
lexical = LexicalFeatures()
syntactic = SyntacticFeatures()
word_bag = BagOfWords()



In [12]:
# concatenate arrays
feature_sets = np.concatenate((lexical, syntactic, word_bag), axis=1)

In [13]:
print(feature_sets)

[[6.51711707e+00 4.06379989e+00 1.98477873e+01 3.70450877e+00
  3.16848278e+00 2.99193192e+00 5.61565010e+00 3.06665337e+02
  2.66713170e+00 3.43557060e-01 8.04620594e-02 2.33021310e-02
  3.06711810e-02 1.06751643e-01 2.54929297e-02 0.00000000e+00
  3.35780389e-03 3.35780389e-03 8.15946345e-01 3.35780389e-03
  3.82789643e-01 4.33156701e-01 0.00000000e+00 0.00000000e+00
  6.71560777e-03]
 [4.60522708e+00 1.55239250e+00 2.01118341e+01 1.78726301e+00
  1.55716034e+00 2.10969558e+00 5.40041235e+00 3.06880575e+02
  5.27637671e-01 4.17888563e-01 8.06451613e-02 3.27468231e-02
  3.32355816e-02 1.07526882e-01 2.59042033e-02 0.00000000e+00
  0.00000000e+00 8.58788400e-03 6.44091300e-01 0.00000000e+00
  3.69279012e-01 6.69854952e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [6.90950291e+00 2.65061009e+00 1.77478028e+01 3.31483432e+00
  5.54552951e-01 6.98203645e-01 3.23727713e+00 3.09043710e+02
  0.00000000e+00 4.04733728e-01 4.52662722e-02 3.07692308e-02
  3.99408284e-02 1.19822485e-01 2.

In [14]:
result = PredictAuthors(feature_sets).labels_
print("Prediction: ", result)

Prediction:  [0 1 1 1]
