<a href="https://colab.research.google.com/github/acponce2023/a2/blob/main/filtering_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

En este cuaderno se encuentra una adaptación del proceso de filtrado para que podáis trabajar con él y ver su funcionamiento más en detalle.

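Hay algunos ficheros que tendréis que poner en las carpetas que corresponda; por ejemplo, todos los ficheros de frecuencias, que el código espera encontrar en `./freq/<idioma>/<idioma>-1gram.txt` y `./freq/<idioma>/<idioma>-2gram.txt`.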


In [None]:
# Install the third-party packages imported by the cells below:
# spaCy and the lingua language detector.
!pip install spacy lingua-language-detector


In [None]:
# Unpack the two archives; both must already be in the working directory.
# NOTE(review): presumably these create the ./freq and ./extract folders
# that the later cells read from - confirm against the archive contents.
!tar -xvf freq.tar
!tar -xvf extract.tar

In [None]:
# Download the small ("sm") spaCy pipelines for English, German, French,
# Italian and Spanish; these are the "sm" model names listed in spacy_models.
!spacy download en_core_web_sm
!spacy download de_core_news_sm
!spacy download fr_core_news_sm
!spacy download it_core_news_sm
!spacy download es_core_news_sm

In [None]:
import time
import traceback
import uuid
import os
import spacy
import json

import lingua

# Spacy

# spaCy pipeline names per language code. Each entry gives the small ("sm")
# and large ("lg") pipeline name; None means no pipeline is listed for that
# language.
spacy_models = {
    "bul": {"sm": None, "lg": None},
    "hrv": {"sm": "hr_core_news_sm", "lg": "hr_core_news_lg"},
    "cze": {"sm": None, "lg": None},
    "dan": {"sm": "da_core_news_sm", "lg": "da_core_news_trf"},
    "dut": {"sm": "nl_core_news_sm", "lg": "nl_core_news_lg"},
    "eng": {"sm": "en_core_web_sm", "lg": "en_core_web_trf"},
    "est": {"sm": None, "lg": None},
    "fin": {"sm": "fi_core_news_sm", "lg": "fi_core_news_lg"},
    "fre": {"sm": "fr_core_news_sm", "lg": "fr_dep_news_trf"},
    "ger": {"sm": "de_core_news_sm", "lg": "de_dep_news_trf"},
    "gre": {"sm": "el_core_news_sm", "lg": "el_core_news_lg"},
    "hun": {"sm": None, "lg": None},
    "gle": {"sm": None, "lg": None},
    "ita": {"sm": "it_core_news_sm", "lg": "it_core_news_lg"},
    "lav": {"sm": None, "lg": None},
    "lit": {"sm": "lt_core_news_sm", "lg": "lt_core_news_lg"},
    "mlt": {"sm": None, "lg": None},
    "pol": {"sm": "pl_core_news_sm", "lg": "pl_core_news_lg"},
    "por": {"sm": "pt_core_news_sm", "lg": "pt_core_news_lg"},
    "rum": {"sm": "ro_core_news_sm", "lg": "ro_core_news_lg"},
    "rus": {"sm": "ru_core_news_sm", "lg": "ru_core_news_lg"},
    "slo": {"sm": None, "lg": None},
    "slv": {"sm": None, "lg": None},
    "spa": {"sm": "es_core_news_sm", "lg": "es_dep_news_trf"},
    "swe": {"sm": None, "lg": None},
}



# Frequency lists per language: freq_list[lang]["1-gram"] and
# freq_list[lang]["2-gram"] each hold the lines of the corresponding file
# under ./freq/<lang>/.
freq_list = {"spa": {}, "ger": {}, "fre": {}, "eng": {}, "ita": {}}

for _lang in freq_list:
    for _order in ("1-gram", "2-gram"):
        # "1-gram" -> "./freq/<lang>/<lang>-1gram.txt", and likewise for 2-gram.
        _path = f"./freq/{_lang}/{_lang}-{_order.replace('-', '')}.txt"
        # A context manager closes each file promptly. The encoding is made
        # explicit so accented characters load the same way on every platform
        # (assumes the frequency files are UTF-8 - TODO confirm).
        with open(_path, encoding="utf-8") as _freq_file:
            freq_list[_lang][_order] = _freq_file.read().split("\n")

# Project language code -> lingua Language constant, for the languages the
# detector should be able to tell apart.
lingua_langs = dict(
    eng=lingua.Language.ENGLISH,
    spa=lingua.Language.SPANISH,
    ita=lingua.Language.ITALIAN,
    ger=lingua.Language.GERMAN,
    fre=lingua.Language.FRENCH,
)

langs_used = list(lingua_langs.values())

# Detector built over exactly the languages listed above.
lang_detector = lingua.LanguageDetectorBuilder.from_languages(*langs_used).build()

In [None]:
def filter_terms(lines, lang):
    """Filter raw candidate terms, keeping only the ones that pass every check.

    Each input line is expected to be ``"<frequency>\\t<term>"``. A term is
    discarded when any of the following holds:

    * it appears (case-insensitively) among the first 50000 entries of the
      language's 1-gram list or the first 1000000 entries of its 2-gram list;
    * any of its space-separated words is shorter than 4 characters;
    * it contains anything other than letters, spaces and apostrophes;
    * another capitalization of the same term was kept and is preferred;
    * spaCy finds a named entity in it whose label is not in ``valid_NE``;
    * the lingua detector does not assign it to ``lang``.

    Steps that need a per-language resource (frequency lists, spaCy model,
    lingua language) are skipped when ``lang`` has none configured.

    Args:
        lines: iterable of strings, one ``"<frequency>\\t<term>"`` per item.
            Blank lines are ignored; lines without exactly two tab-separated
            fields are reported and skipped.
        lang: language code used as key into ``freq_list``, ``spacy_models``
            and ``lingua_langs`` (e.g. ``"eng"``).

    Returns:
        dict mapping each surviving term to ``{"f": <frequency>}``, where the
        frequency is the string read from the input line.
    """
    filter_deep_1g = 50000
    filter_deep_2g = 1000000

    # Only membership is needed, so sets are enough; they stay empty when the
    # language has no frequency list.
    lang_freqs = freq_list.get(lang, {})
    frequent_1g = {t.lower() for t in lang_freqs.get("1-gram", [])[:filter_deep_1g]}
    frequent_2g = {t.lower() for t in lang_freqs.get("2-gram", [])[:filter_deep_2g]}

    terms = {}

    for line in lines:
        # Strip the line terminator, including the "\r" of CRLF files, which
        # would otherwise make the term fail the letters-only check.
        line = line.rstrip("\r\n")

        if not line.strip():
            continue

        fields = line.split("\t")

        if len(fields) != 2:
            print("Skipping malformed line", repr(line))
            continue

        freq, term = fields

        # Hyphens become spaces. A leading or trailing hyphen therefore leaves
        # an empty word behind, which the "too short" rule below rejects.
        term = term.replace("-", " ").replace("  ", " ")
        lowered = term.lower()

        if lowered in frequent_1g:
            print("Excluding", term, "(too freq 1-gram)")

        elif lowered in frequent_2g:
            print("Excluding", term, "(too freq 2-gram)")

        elif any(len(word) < 4 for word in term.split(" ")):
            print("Excluding", term, "(too short)")

        elif not term.replace(" ", "").replace("'", "").isalpha():
            print("Excluding", term, "(strange symbols)")

        else:
            print("Adding", term)
            terms[term] = {"f": freq}

    # Capitalization differences are resolved by keeping the more frequent
    # variant; on a tie the lowercase variant is the one dropped.
    for term in list(terms):
        lowered = term.lower()

        if lowered == term or lowered not in terms:
            continue

        if int(terms[lowered]["f"]) > int(terms[term]["f"]):
            terms.pop(term)
            print("Excluding", term, "(duplicated and less frequent capitalization)")
        else:
            terms.pop(lowered)
            print("Excluding", lowered, "(duplicated and less frequent capitalization)")

    # Drop terms containing a named entity whose label is not listed here.
    valid_NE = ["EVENT", "FAC", "ORG", "WORK_OF_ART"]

    # Languages with no configured spaCy model skip this step instead of
    # failing inside spacy.load, like the frequency and lingua steps do.
    model_name = spacy_models.get(lang, {}).get("sm")

    if model_name:
        pipe = spacy.load(model_name)

        for term in list(terms):
            for ent in pipe(term).ents:
                print("Found NE: ", ent.text, ent.label_)

                if ent.label_ not in valid_NE and term in terms:
                    terms.pop(term)
    else:
        print("No spaCy model for", lang, "- skipping named-entity filter")

    # Drop terms that the detector does not assign to the requested language.
    # An undetected term compares unequal and is dropped too.
    expected_lang = lingua_langs.get(lang)

    for term in list(terms):
        detected = lang_detector.detect_language_of(term)
        print(term, detected)

        if expected_lang is not None and detected != expected_lang:
            terms.pop(term)

    return terms

In [None]:
def lemmatize_terms(terms, lang):
    """Collapse terms onto their lemmas, adding up the frequencies.

    Every term is run through the small spaCy pipeline for ``lang``; the
    lemmas of its tokens, joined by single spaces, become the new key. Terms
    sharing a lemma are merged and their frequencies summed.

    For ``lang == "eng"`` only, lemmas found (case-insensitively) among the
    first 25000 1-gram entries or the first 1000000 2-gram entries of the
    language's frequency lists are then removed.

    Args:
        terms: dict mapping term -> ``{"f": frequency}``. The frequency may be
            an int or a string of digits, as produced by ``filter_terms``.
        lang: language code used as key into ``spacy_models`` and
            ``freq_list`` (e.g. ``"eng"``).

    Returns:
        dict mapping lemma -> ``{"f": total_frequency}``, with the total
        stored as an ``int``.
    """
    pipe = spacy.load(spacy_models[lang]["sm"])

    lemmatized_terms = {}

    for term, info in list(terms.items()):
        lemma = " ".join(token.lemma_ for token in pipe(term))

        # Frequencies arrive as strings; convert before adding so that merged
        # lemmas are summed numerically rather than concatenated.
        count = int(info["f"])

        if lemma in lemmatized_terms:
            # Several words converge on the same lemma.
            lemmatized_terms[lemma]["f"] += count
        else:
            lemmatized_terms[lemma] = {"f": count}

    # The frequency filter is re-applied to the lemmas. It is only used for
    # "eng", because lemmatization in the other languages also changes other
    # inflections, which is something to avoid.
    if lang != "eng":
        return lemmatized_terms

    # NOTE: this duplicates the filtering logic of filter_terms and would be
    # better done another way.
    filter_deep_1g = 25000
    filter_deep_2g = 1000000

    lang_freqs = freq_list.get(lang, {})
    frequent_1g = {t.lower() for t in lang_freqs.get("1-gram", [])[:filter_deep_1g]}
    frequent_2g = {t.lower() for t in lang_freqs.get("2-gram", [])[:filter_deep_2g]}

    for lemma in list(lemmatized_terms):
        lowered = lemma.lower()

        if lowered in frequent_1g:
            print("Excluding", lemma, "(too freq 1-gram) lemma")
            lemmatized_terms.pop(lemma)

        elif lowered in frequent_2g:
            print("Excluding", lemma, "(too freq 2-gram) lemma")
            lemmatized_terms.pop(lemma)

    return lemmatized_terms

In [None]:
# Example run: filter the occupational-therapy candidate terms for English
# and then lemmatize the survivors, printing the result of each stage.
# The file is read inside a context manager so its handle is closed, with an
# explicit encoding (assumes the extract files are UTF-8 - TODO confirm).
with open("./extract/eng/occupational_therapy/terms.txt", encoding="utf-8") as terms_file:
    example = terms_file.readlines()

terms = filter_terms(example, "eng")

print(terms)

terms = lemmatize_terms(terms, "eng")

print(terms)