In [1]:
%%time
import os
import codecs
import pandas as pd

from ml_tps.utils.text_processing import no_unique_words, no_words_with_word_part, most_frequent_words, word_frequency

from ml_tps.algorithms.k_means import KMeans
from ml_tps.algorithms.hierarchical_clustering import HierarchicalClustering
from ml_tps.algorithms.kohonen_net import KohonenNet

from ml_tps.utils.evaluation import getConfusionMatrix, computeAccuracy

print("Dependencies imported.")

In [2]:
%%time

# Word lists (Spanish) whose relative frequencies are used as stylometric features.
# NOTE(review): "que" and "si" each appear twice in this list, and "ya" stands alone
# (possibly meant to be "ya que"). The list is left untouched because it is unclear
# from here how `word_frequency` treats duplicates or multi-word entries — confirm
# before cleaning it up, since doing so may change the feature values.
conj_subordinantes = ["porque", "pues", "ya", "puesto que", "a causa de", "debido a",
                      "luego", "conque", "así que",
                      "si",
                      "para que", "a fin de que",
                      "como", "que",
                      "aunque", "aun cuando", "si bien",
                      "que", "si"]
conj_coordinantes = ["ni", "y", "o", "o bien", "pero aunque", "no obstante", "sin embargo", "sino", "por el contrario"]
art_determinados = ["la", "el", "los", "las"]
art_indeterminados = ["un", "una", "unos", "unas"]

# Corpus layout: <filepath>/<Author>/<Author><n>.txt with n = 1..TEXTS_PER_AUTHOR.
AUTHORS = ["Calderaro", "Fonteveccia", "Pagni", "Vanderkooy", "Verbitsky"]
TEXTS_PER_AUTHOR = 10

# The data directory is resolved relative to the directory containing "e2.ipynb"
# (realpath resolves that name against the current working directory).
dir_path = os.path.dirname(os.path.realpath("e2.ipynb"))
filepath = f"{dir_path}/../tp4/data/"

# One record per text: the author label followed by the seven features.
records = []
for author in AUTHORS:
    for text_no in range(1, TEXTS_PER_AUTHOR + 1):
        text_path = os.path.join(filepath, author, f"{author}{text_no}.txt")
        # Use a context manager so every file handle is closed right after reading
        # (the previous bare codecs.open(...).read() left all handles open).
        with codecs.open(text_path, encoding="latin-1") as text_file:
            extracted_text = text_file.read().lower()
        records.append([
            author,
            most_frequent_words(extracted_text, no_words=5, normalize=True).sum(),
            no_unique_words(extracted_text, normalize=True),
            word_frequency(extracted_text, list_of_words=conj_subordinantes, normalize=True, average=True),
            word_frequency(extracted_text, list_of_words=conj_coordinantes, normalize=True, average=True),
            word_frequency(extracted_text, list_of_words=art_determinados, normalize=True, average=True),
            word_frequency(extracted_text, list_of_words=art_indeterminados, normalize=True, average=True),
            no_words_with_word_part(extracted_text, word_part="mente", mode="ending", normalize=True),
        ])

# Every feature helper is called with normalize=True, i.e. always using relative frequencies.
data = pd.DataFrame(records, columns=["Autor",
                                      "5 palabras más usadas",
                                      "Palabras distintas",
                                      "Conjunciones subord.",
                                      "Conjunciones coordinantes",
                                      "Artículos determinados",
                                      "Artículos indeterminados",
                                      "Adverbios terminados en -mente"])

# Feature matrix: drops the label column and three of the computed features, leaving
# "Palabras distintas", both conjunction frequencies and the "-mente" adverbs.
X = data.drop(["Autor", "5 palabras más usadas", "Artículos determinados", "Artículos indeterminados"], axis=1)
# Target labels: the author of each text.
y = data["Autor"]
data.head()

## K-medias

In [6]:
%%time
# Cluster the feature matrix X with the project's KMeans implementation.
# k=5 equals the number of authors iterated over when `data` was built.
# NOTE(review): if KMeans initialises its centroids randomly, results will differ
# between runs — check the implementation and consider fixing a seed.
k_means = KMeans()
k_means.fit(X, k=5)
# Predictions are made on the same X that was used for fitting (no held-out split).
k_means_predictions = k_means.predict(X)
# Cross-tabulate the predicted clusters against the true author labels in y.
# NOTE(review): presumably cluster ids carry no inherent author meaning — confirm how
# getConfusionMatrix aligns them with the labels before reading accuracy from it.
conf_matrix_kmeans = getConfusionMatrix(predictions=k_means_predictions, actual=y)
k_means.plot(X, k_means_predictions)

# Bare last expression so the confusion matrix is rendered as the cell output.
conf_matrix_kmeans


## Agrupación Jerárquica

In [7]:
%%time
# Cluster the feature matrix X with the project's HierarchicalClustering implementation.
# max_no_clusters=5 equals the number of authors iterated over when `data` was built.
hc = HierarchicalClustering()
hc.fit(X, max_no_clusters=5)
# Predictions are made on the same X that was used for fitting (no held-out split).
hc_predictions = hc.predict(X)
# Cross-tabulate the predicted clusters against the true author labels in y.
conf_matrix_hc = getConfusionMatrix(predictions=hc_predictions, actual=y)

hc.plot(X, hc_predictions)
# NOTE(review): color_threshold=0.03 is a hand-picked value; presumably it is the
# linkage distance below which dendrogram branches share a colour — confirm.
hc.plot_dendrogram(color_threshold=0.03)

# Bare last expression so the confusion matrix is rendered as the cell output.
conf_matrix_hc

## Red de Kohonen

In [5]:
%%time
# Train the project's Kohonen network on the feature matrix X.
# NOTE(review): side=4 presumably means a 4x4 grid of neurons, and min_eta / alpha
# presumably control the learning-rate schedule — confirm against KohonenNet.fit.
kohonen = KohonenNet()
kohonen.fit(X, side=4, min_eta=0.1, alpha=0.5)

# Put the author label back next to the features so the plot can use the
# "Autor" column as its objective.
labelled_features = pd.concat([y, X], axis=1)
kohonen.plot(labelled_features, objective="Autor")