# Fases de entrenamiento de una red neuronal

Importamos los datos de un corpus en español.

Corpus de español:
- AnCora | Github: https://github.com/UniversalDependencies/UD_Spanish-AnCora

- usamos el conllu parser para leer el corpus: https://pypi.org/project/conllu/ . Es muy usado, se puede trabajar con pip install conllu

- Etiquetas Universal POS (Documentación): https://universaldependencies.org/u/pos/ . Hay referencias universales para clasificar palabras


In [1]:
!pip install conllu
!git clone https://github.com/UniversalDependencies/UD_Spanish-AnCora.git

Collecting conllu
  Downloading conllu-4.4-py2.py3-none-any.whl (15 kB)
Installing collected packages: conllu
Successfully installed conllu-4.4
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m
fatal: destination path 'UD_Spanish-AnCora' already exists and is not an empty directory.


In [2]:
#leer los datos que vienen de un archivo conllu
from conllu import parse_incr
wordlist = []#creo una lista vacía
data_file = open("UD_Spanish-AnCora/es_ancora-ud-dev.conllu", "r", encoding="utf-8") #abro el archivo que descargué al clonar el repositorio, agrego permisos de lectura con "r"
for tokenlist in parse_incr(data_file):
    print(tokenlist.serialize())#quiero ver los tokens
    #serialize es un atributo propio del formato conllu, sirve para poder ver bien los tokens


10	son	ser	AUX	AUX	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	11	cop	_	_
11	conceptos	concepto	NOUN	NOUN	Gender=Masc|Number=Plur	0	root	_	_
12	compatibles	compatible	ADJ	ADJ	Number=Plur	11	amod	_	SpaceAfter=No
13	?	?	PUNCT	PUNCT	PunctSide=Fin|PunctType=Qest	11	punct	_	_
14	-	-	PUNCT	PUNCT	PunctType=Dash	11	punct	_	_
15	-	-	PUNCT	PUNCT	PunctType=Dash	11	punct	_	SpaceAfter=No
16	.	.	PUNCT	PUNCT	PunctType=Peri	11	punct	_	_


# sent_id = dev-s1043
# text = Depende de cada adulto.
1	Depende	depender	VERB	VERB	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
2	de	de	ADP	ADP	AdpType=Prep	4	case	_	_
3	cada	cada	DET	DET	Number=Sing|PronType=Tot	4	det	_	_
4	adulto	adulto	NOUN	NOUN	Gender=Masc|Number=Sing	1	obj	_	SpaceAfter=No
5	.	.	PUNCT	PUNCT	PunctType=Peri	1	punct	_	_


# sent_id = dev-s1044
# text = A mí no me interesa hacer cosas ya hechas.
1	A	a	ADP	ADP	AdpType=Prep	2	case	_	_
2	mí	yo	PRON	PRON	Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs	5	obj	_	_
3	no

Veamos la estructura de los tokens

In [3]:
print(tokenlist)
print(tokenlist[1])

TokenList<Lo, cierto, es, que, a, mí, ,, me, da, un, poco, de, pena, .>
cierto


In [4]:
tokenlist[1]['form']+'|'+tokenlist[1]['upos']# el atributo form es una palabra, upos es la categoría gramatical.

'cierto|ADJ'

In [5]:
for i in range(10):
    print(tokenlist[i]['form']+'|'+tokenlist[i]['upos'])

Lo|PRON
cierto|ADJ
es|AUX
que|SCONJ
a|ADP
mí|PRON
,|PUNCT
me|PRON
da|VERB
un|DET


# Entrenamiento del modelo- Calculo de conteos
La primera etapa es el cálculo de conteos

- tags(tags) ``tagCountDict``: $C(tag)$
- emisiones(word|tag) ``emissionProbDict``: $C(word|tag)$
- transiciones(tag|prevtag) ``transitionDict``: $C(tag|prevtag)$

In [23]:
#tenemos tres diccionarios inicialmente vacíos

tagCountDict = {}
emissionDict = {}
transitionDict = {}

tagtype = 'upos'
data_file = open("UD_Spanish-AnCora/es_ancora-ud-dev.conllu", "r", encoding="utf-8") #abro el archivo que descargué al clonar el repositorio, agrego permisos de lectura con "r"
for tokenlist in parse_incr(data_file):
    prevtag=None
    for token in tokenlist:
        #C(tag) el conteo de las etiquetas
        tag = token[tagtype]
        if tag in tagCountDict.keys():
            tagCountDict[tag] +=1
        else:
            tagCountDict[tag] = 1

        #C(WORD|tag) -> probabilidades de emisión
        wordtag = token['form'].lower()+'|'+token[tagtype] #(palabra|etiqueta)
        if wordtag in emissionProbDict.keys():
            emissionDict[wordtag] +=1
        else:
            emissionDict[wordtag] = 1

        #Ahora vamos a hacer un conteo para las probabilidades de transición
        #C(tag|tag_previo)
        
        if prevtag is None:
            prevtag=tag
            continue
        transitiontags = tag+'|'+prevtag
        if transitiontags in transitionDict.keys():
            transitionDict[transitiontags] = transitionDict[transitiontags] + 1
        else:
            transitionDict[transitiontags] = 1
        prevtag = tag    


# Cálculo de probabilidades

- Probabilidades de transición
$$
P(tag|prevtag)=\frac{C(prevtag,tag)}{C(prevtag)}
$$


- Probabilidades de emisión

$$
P(word|tag)=\frac{C(word,tag)}{C(tag)}
$$

In [25]:
transitionProbDict = {} # matriz A
emissionProbDict = {} # matriz B

# transition Probabilities 
for key in transitionDict.keys():
    tag, prevtag = key.split('|')
    if tagCountDict[prevtag]>0:
        transitionProbDict[key] = transitionDict[key]/(tagCountDict[prevtag])
    else:
        print(key)

# emission Probabilities 
for key in emissionDict.keys():
    word, tag = key.split('|')
    if emissionDict[key]>0:
        emissionProbDict[key] = emissionDict[key]/tagCountDict[tag]
    else:
        print(key)

transitionProbDict


{'NOUN|DET': 0.734186313973548,
 'PUNCT|NOUN': 0.21731430357328888,
 'ADP|PUNCT': 0.11054475947754062,
 'ADJ|ADP': 0.020764899108399813,
 'NOUN|ADJ': 0.21553672316384181,
 'ADP|NOUN': 0.38295655797478906,
 'SCONJ|ADP': 0.017832003754106054,
 'VERB|SCONJ': 0.15933098591549297,
 'VERB|VERB': 0.028910802330793368,
 'NUM|VERB': 0.019497982967279247,
 'NOUN|NUM': 0.5508571428571428,
 'NOUN|ADP': 0.26079305490380106,
 'VERB|ADP': 0.05983106522759268,
 'ADP|VERB': 0.27319587628865977,
 'DET|NOUN': 0.014063964996353788,
 'CCONJ|NOUN': 0.049588498801958536,
 'ADJ|CCONJ': 0.08459422283356259,
 'ADP|ADJ': 0.27937853107344635,
 'DET|ADP': 0.38714218676677614,
 'ADJ|NOUN': 0.17033024273361808,
 'PUNCT|ADJ': 0.2480225988700565,
 'VERB|PUNCT': 0.07629818413507486,
 'PRON|DET': 0.018545140885566417,
 'DET|PRON': 0.03709810387469085,
 'PROPN|DET': 0.09272570442783208,
 'PROPN|PROPN': 0.22875494071146246,
 'ADP|PROPN': 0.14303359683794467,
 'PROPN|ADP': 0.1516893477240732,
 'PUNCT|PROPN': 0.367341897233

In [26]:
emissionProbDict

{'el|DET': 0.0001437607820586544,
 'gobernante|NOUN': 0.00010417751849150954,
 ',|PUNCT': 0.00015928639694170118,
 'con|ADP': 0.00011731581417175035,
 'ganada|ADJ': 0.0002824858757062147,
 'fama|NOUN': 0.00010417751849150954,
 'desde|ADP': 0.00011731581417175035,
 'que|SCONJ': 0.0008802816901408451,
 'llegó|VERB': 0.00022411474675033618,
 'hace|VERB': 0.00022411474675033618,
 '16|NUM': 0.001142857142857143,
 'meses|NOUN': 0.00010417751849150954,
 'al|ADP': 0.00011731581417175035,
 'poder|NOUN': 0.00010417751849150954,
 'de|ADP': 0.00011731581417175035,
 'explotar|VERB': 0.00022411474675033618,
 'máximo|NOUN': 0.00010417751849150954,
 'su|DET': 0.0001437607820586544,
 'oratoria|NOUN': 0.00010417751849150954,
 'y|CCONJ': 0.000687757909215956,
 'acusado|ADJ': 0.0002824858757062147,
 'por|ADP': 0.00011731581417175035,
 'sus|DET': 0.0001437607820586544,
 'detractores|NOUN': 0.00010417751849150954,
 'incontinencia|NOUN': 0.00010417751849150954,
 'verbal|ADJ': 0.0002824858757062147,
 'enmudec

In [27]:
#Ya tenemos el modelo, ahora tenemos que guardarlo
import numpy as np
np.save('transitionHMM.npy', transitionProbDict)
np.save('emissionHMM.npy', emissionProbDict)
transitionProbdict = np.load('transitionHMM.npy', allow_pickle='TRUE').item()
transitionProbDict['ADJ|ADJ']

0.030225988700564973

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=31a57299-7db0-417b-9a80-6e17fdb92497' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>