<a href="https://colab.research.google.com/github/castillosebastian/NLU_legal_domain/blob/master/NLU_Semantica_Vectoria_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semántica Vectorial para procesar sentencias

In [2]:
# Libraries, tools and thanks 
import bs4
import nltk
import numpy
import pandas as pd
import io
import requests
from google.colab import drive
import numpy as np 
import json

In [None]:
# procesar con GPU

In [3]:
# Mount Google Drive so the data files stored there can be read from this runtime.
from google.colab import drive # Colab helper used to mount Google Drive

ROOT = "/content/drive"     # mount point for the drive
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT)           # mount Google Drive at ROOT

/content/drive
Mounted at /content/drive


In [4]:
%cd  'drive/My Drive/Colab Notebooks/data'

/content/drive/My Drive/Colab Notebooks/data


In [5]:
%ls

corpus_fallosmetdat.json  tbdoctrina.json  tbmetdat.json
corpus_textosfallos.json  tbfallos.json


# Lectura de tablas de datos primarios

In [6]:
# Load the primary rulings table from JSON into a DataFrame.
# The encoding is explicit because the corpus is Spanish text with accented
# characters and the default encoding of open() is platform-dependent.
with open('tbfallos.json', 'r', encoding='utf-8') as myfile:
    data1 = myfile.read()
# Parse the raw JSON text, then build the DataFrame from the parsed object
# (a separate name avoids rebinding `tbfallos` from a dict to a DataFrame).
tbfallos_records = json.loads(data1)
tbfallos = pd.DataFrame.from_dict(tbfallos_records)

In [None]:
tbfallos.head()

# Instalar librería STANZA (Stanford NLP)

In [None]:
! pip install stanza

In [8]:
import stanza
# Fetch the default Spanish ('es') model package; the recorded log shows it is
# saved under /root/stanza_resources, so it must be re-downloaded on a fresh runtime.
stanza.download('es') # download Spanish model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 20.3MB/s]                    
2020-10-27 20:28:33 INFO: Downloading default packages for language: es (Spanish)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/es/default.zip: 100%|██████████| 583M/583M [03:03<00:00, 3.17MB/s]
2020-10-27 20:31:44 INFO: Finished downloading models and saved to /root/stanza_resources.


# Prueba de Tokenización de un sumario

In [9]:
# Pick the ruling stored under index label 1 and turn every '#' separator
# into a blank line (two newlines).
fallos = tbfallos['textos_fallo'][1].replace("#", "\n\n")

In [10]:
# Build the Spanish pipeline with tokenisation, NER, multi-word-token expansion,
# POS tagging, lemmatisation and dependency parsing, then annotate the ruling.
# tokenize_no_ssplit=True is passed through to Stanza unchanged.
# NOTE(review): presumably this makes the blank lines inserted above act as
# sentence boundaries — confirm against the Stanza tokenizer documentation.
pipeline_options = dict(
    lang='es',
    processors='tokenize,ner,mwt,pos,lemma,depparse',
    tokenize_no_ssplit=True,
)
nlp = stanza.Pipeline(**pipeline_options)
doc = nlp(fallos)

2020-10-27 20:33:30 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |
| depparse  | ancora  |
| ner       | conll02 |

2020-10-27 20:33:30 INFO: Use device: gpu
2020-10-27 20:33:30 INFO: Loading: tokenize
2020-10-27 20:33:40 INFO: Loading: mwt
2020-10-27 20:33:40 INFO: Loading: pos
2020-10-27 20:33:41 INFO: Loading: lemma
2020-10-27 20:33:41 INFO: Loading: depparse
2020-10-27 20:33:42 INFO: Loading: ner
2020-10-27 20:33:43 INFO: Done loading processors!


In [11]:
doc.sentences[0].text

'TEXTO COMPLETO'

In [None]:
# Show the raw text of every sentence in the document, one per line.
for sentence in doc.sentences:
    print(sentence.text)

In [None]:
# For each sentence print a numbered banner followed by one line per token
# showing the token id and its text.
for number, sentence in enumerate(doc.sentences, start=1):
    print(f'====== Sentence {number} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print(*token_lines, sep='\n')

In [None]:
dicts = doc.to_dict() # dicts is List[List[Dict]], representing each token / word in each sentence in the document

# Prueba de manipulación del objeto "doc" sobre un sumario

In [None]:
def print_doc_info(doc):
    """
    Print summary counts for an annotated document.

    - Parameters: doc (an object exposing `sentences`, `num_tokens`,
      `num_words` and `entities`, e.g. a Stanza Document)
    - Returns: None; four tab-separated "label: count" lines go to stdout.
    """
    sentence_count = len(doc.sentences)
    print(f"Num sentences:\t{sentence_count}")
    token_count = doc.num_tokens
    print(f"Num tokens:\t{token_count}")
    word_count = doc.num_words
    print(f"Num words:\t{word_count}")
    entity_count = len(doc.entities)
    print(f"Num entities:\t{entity_count}")

In [None]:
print_doc_info(doc)

Num sentences:	47
Num tokens:	2008
Num words:	2013
Num entities:	92


In [None]:
def word_info_df(doc):
    """
    Tabulate the words of an annotated document.

    - Parameters: doc (a Stanza Document object, or any object whose
      `sentences` each expose a `words` list)
    - Returns: A Pandas DataFrame object with one row for each *word* in
      doc (the function walks `sentence.words`, not `sentence.tokens`),
      and columns for text, lemma, upos, and xpos. An empty document
      yields an empty frame that still carries those four columns.
    """
    columns = ["text", "lemma", "upos", "xpos"]
    rows = [
        {
            "text": word.text,
            "lemma": word.lemma,
            "upos": word.upos,
            "xpos": word.xpos,
        }
        for sentence in doc.sentences
        for word in sentence.words
    ]
    # Passing columns explicitly keeps the schema stable when rows is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
word_info_df(doc)

In [None]:
doc.num_words

2013

In [None]:
doc.num_tokens

2008

# Prueba sobre entidades de un sumario

In [None]:
# List every named entity with its type, one per line.
# print() returns None, so the former trailing `[1]` raised a TypeError after
# the output was written; it has been removed. `doc.entities` is used for
# consistency with the other cells in this notebook.
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.entities], sep='\n')

In [15]:
# Keep only the entities tagged as persons.
def select_person_entities(doc):
    """
    - Parameters: doc (an object exposing an `entities` iterable whose
      items have a `type` attribute, e.g. a Stanza Document)
    - Returns: a list with the entities whose type equals "PER", in
      their original order.
    """
    persons = []
    for entity in doc.entities:
        if entity.type == "PER":
            persons.append(entity)
    return persons

In [16]:
def person_df(doc):
    """
    Tabulate the person entities of an annotated document.

    - Parameters: doc (a Stanza Document object)
    - Returns: A Pandas DataFrame with one row for each entity in doc
      that has the "PER" type (as selected by select_person_entities),
      and columns text, type, start_char, end_char. When no person
      entity is found the frame is empty but keeps those four columns.
    """
    columns = ["text", "type", "start_char", "end_char"]
    rows = [
        {
            "text": person.text,
            "type": person.type,
            "start_char": person.start_char,
            "end_char": person.end_char,
        }
        for person in select_person_entities(doc)
    ]
    # Passing columns explicitly keeps the schema stable when rows is empty.
    return pd.DataFrame(rows, columns=columns)


In [None]:
person_df(doc)

## Pruebas de manipulación

In [None]:
# Print the tokens of every sentence via the sentence's own print_tokens().
# The enumerate() index was never used, so the loop iterates sentences directly.
for sent in doc.sentences:
    sent.print_tokens()

In [None]:
# Iterate over all tokens in all sentences and print each token's text.
# The enumerate() index was never used, so the loop iterates sentences directly.
for sent in doc.sentences:
    for t in sent.tokens:
        print(t.text)

In [None]:
# Iterate over all words in all sentences and print each word's text.
# The enumerate() index was never used, so the loop iterates sentences directly.
for sent in doc.sentences:
    for w in sent.words:
        print(w.text)

In [None]:
# Iterate over all entities in all sentences and print each entity's text.
# The enumerate() index was never used, so the loop iterates sentences directly.
for sent in doc.sentences:
    for e in sent.entities:
        print(e.text)

In [None]:
# Iterate over all lemmas in all sentences and print each word's lemma.
# The enumerate() index was never used, so the loop iterates sentences directly.
for sent in doc.sentences:
    for w in sent.words:
        print(w.lemma)

# Prueba de Vectorización de un Sumario

In [18]:
def print_doc_info(doc):
    """
    Report how many sentences, tokens, words and entities `doc` holds.

    NOTE: this redefines a helper with the same name and output that
    appears earlier in the notebook.

    - Parameters: doc (an object exposing `sentences`, `num_tokens`,
      `num_words` and `entities`, e.g. a Stanza Document)
    - Returns: None; the four counts are printed, one per line.
    """
    n_sentences = len(doc.sentences)
    print(f"Num sentences:\t{n_sentences}")
    n_tokens = doc.num_tokens
    print(f"Num tokens:\t{n_tokens}")
    n_words = doc.num_words
    print(f"Num words:\t{n_words}")
    n_entities = len(doc.entities)
    print(f"Num entities:\t{n_entities}")

In [19]:
print_doc_info(doc)

Num sentences:	47
Num tokens:	2008
Num words:	2013
Num entities:	92


In [20]:
# Print the lemma of every word in the document on one line, space-separated.
print(' '.join(str(word.lemma) for sentence in doc.sentences for word in sentence.words))

texto completo a c U e R D o en el ciudad de el Plata , a 15 de mayo de 2019 , haber él establecer , de conformidad con él dispuesto en el Acuerdo 2078 , que deber observar él el siguiente orden de votación : doctor Soria , de Lázzari , Genoud , Kogan , él reunir el señor Jueces de el Suprema Corte de Justicia en acuerdo ordinario para pronunciar sentencia definitivo en el causa C. 122321 , " F. , R. . Determinación de el capacidad jurídico " . a N T e C e D e N T e S el Sala I de el Cámara de Apelación en él Civil y Comercial de San Isidro confirmar el fallo de primero instancia que , en el auto sobre determinación de el capacidad jurídico del señor R. F. , resolver aprobar el rendición de cuenta efectuado por el Curaduría Oficial de Alienados ( v. fs . 689 y 660 , respectivamente ) . él interponer , por el titular de el Asesoría de Incapaces n° 2 departamental , recurso extraordinario de inaplicabilidad de ley ( v. fs . 705/711 ) . oído el señor Procurador General , dictado el provid

In [None]:
# Collect the lemma of every word, sentence by sentence, into a flat list.
lemas = []
for sentence in doc.sentences:
    lemas.extend(word.lemma for word in sentence.words)

In [None]:
lemas

# Bibliografía

- https://medium.com/@severinperez/exploring-literature-with-the-stanza-nlp-package-927d5b6556bf 
- https://colab.research.google.com/github/stanfordnlp/stanza/blob/master/demo/Stanza_CoreNLP_Interface.ipynb#scrollTo=ezEjc9LeV2Xs
- https://scikit-learn.org/stable/modules/feature_extraction.html 



# Prueba de Trabajo en Corpus

In [None]:
fallos = tbfallos['textos_fallo']

In [None]:
fallos

In [None]:
# Turn every '#' separator inside each ruling into a blank line and collect
# the texts in a plain Python list.
# Series.replace("#", ...) only matches cells whose *entire* value is "#", so it
# left the texts untouched; Series.str.replace performs the intended substring
# replacement (regex=False treats '#' literally).
# Reading from tbfallos directly keeps the cell re-runnable, since `fallos`
# is rebound from a Series to a list here.
fallos = tbfallos['textos_fallo'].str.replace("#", "\n\n", regex=False)
fallos = fallos.values.tolist()

In [None]:
# Rebuild the Spanish pipeline with a reduced processor set (no NER, no
# dependency parsing) and annotate the corpus texts.
# NOTE(review): `fallos` is a list of strings at this point; the counts recorded
# below (47 sentences) match the single-ruling run, so the output looks stale —
# confirm how this Stanza version treats list input before relying on it.
pipeline_options = dict(
    lang='es',
    processors='tokenize,mwt,pos,lemma',
    tokenize_no_ssplit=True,
)
nlp = stanza.Pipeline(**pipeline_options)
doc = nlp(fallos)

2020-10-27 15:18:43 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| pos       | ancora  |
| lemma     | ancora  |

2020-10-27 15:18:43 INFO: Use device: gpu
2020-10-27 15:18:43 INFO: Loading: tokenize
2020-10-27 15:18:43 INFO: Loading: mwt
2020-10-27 15:18:43 INFO: Loading: pos
2020-10-27 15:18:44 INFO: Loading: lemma
2020-10-27 15:18:44 INFO: Done loading processors!


In [None]:
def print_doc_info(doc):
    """
    Print the sentence, token, word and entity counts of `doc`.

    NOTE: this redefines a helper with the same name and output that
    appears earlier in the notebook.

    - Parameters: doc (an object exposing `sentences`, `num_tokens`,
      `num_words` and `entities`, e.g. a Stanza Document)
    - Returns: None; output is written to stdout only.
    """
    total_sentences = len(doc.sentences)
    print(f"Num sentences:\t{total_sentences}")
    total_tokens = doc.num_tokens
    print(f"Num tokens:\t{total_tokens}")
    total_words = doc.num_words
    print(f"Num words:\t{total_words}")
    total_entities = len(doc.entities)
    print(f"Num entities:\t{total_entities}")

In [None]:
print_doc_info(doc)

Num sentences:	47
Num tokens:	2008
Num words:	2013
Num entities:	0


In [None]:
# Print the lemma of every word in the document on one line, space-separated.
print(' '.join(str(word.lemma) for sentence in doc.sentences for word in sentence.words))

In [None]:
# Flatten the lemmas of all words in all sentences into a single list.
lemas = []
for sentence in doc.sentences:
    for word in sentence.words:
        lemas.append(word.lemma)

In [None]:
lemas

In [None]:
import spacy
from collections import Counter

In [None]:
# Tally how often each lemma occurs, then keep the five most frequent
# (lemma, count) pairs.
word_freq = Counter()
word_freq.update(lemas)
common_words = word_freq.most_common(5)

In [None]:
common_words

[('el', 208), ('de', 140), (',', 121), ('.', 79), ('y', 49)]

# Continuar con armado de matriz docxword con STANZA

# Word count Matrix of documents with SKLEARN


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
tbfallos['textos_fallo']

0      #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
1      #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
2      #TEXTO COMPLETO#"VOSS SUSANA BEATRIZ C/ VILLAR...
3      #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
4      #TEXTO COMPLETO#"RICA ARIEL FEDERICO Y OTROS C...
                             ...                        
909    #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
910    #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
911    #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
912    #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
913    #TEXTO COMPLETO#A C U E R D O#En la ciudad de ...
Name: textos_fallo, Length: 914, dtype: object

In [None]:
df = tbfallos['textos_fallo']

In [None]:
# Learn the vocabulary and build the document-term count matrix in one pass.
# fit_transform() yields the same matrix as fit() followed by transform() but
# processes the corpus only once.
cv = CountVectorizer()
results = cv.fit_transform(df)

In [None]:
print(results.shape) # Sparse matrix

(914, 44226)


In [None]:
# Label the count matrix columns with the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    features = cv.get_feature_names_out()
else:
    features = cv.get_feature_names()
# NOTE: toarray() materialises the whole sparse matrix as a dense array in memory.
df_res = pd.DataFrame(results.toarray(), columns=features)

In [None]:
df_res.head()

Unnamed: 0,00,000,0000,00000016,00000018,00000235,00000261,00000262,00000365,0000056,00000565,00000566,00000570,00000602,00000603,00000622,00000673,00000770,00000771,00000833,00001,00002772,00003413,00003417,00003907,00003908,00008571,00009,0001,000156,00018462,0002,000325,0006,000663,000969,0009867,001,001026,001133,...,ítem,ítems,íter,ñamandú,ñata,óbice,óbices,óbito,ómnibus,óntico,óptica,óptima,óptimas,óptimo,órbita,órbitas,órdenes,órgano,órganos,ósea,óseas,óseo,ótico,óvulos,últ,última,últimamente,últimas,últimaáratioádel,último,últimos,única,únicamente,únicas,único,únicos,útero,útil,útiles,útilmente
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
