In [1]:
import nltk
import regex as re
import pandas as pd
import numpy as np
import string
from fuzzywuzzy import fuzz
import locale
locale.setlocale(locale.LC_ALL,'es_ES.UTF-8') # set to spanish
import spacy
from pathlib import Path

In [2]:
df = pd.read_csv("sectioned_text.csv")

In [3]:
df

Unnamed: 0,name,page,section_text,percent_error
0,ABADIA ARANGO SERGIO,7,.. Abogado. Especialidad: Minas y Petróleos. ...,0.000000
1,ABAUNZA JIMENEZ JOSE C,7,.. Abogado. Especialidad: Derecho Civil.. Naci...,0.000000
2,ACERO PIMENTEL ENRIQUE,7,.. Abogado. Especialidad: Derecho Administrati...,0.086505
3,AGUILERA CAMACHO ALBERTO,7,.. Abogado. Especialidad: Derecho del Trabajo ...,0.000000
4,ALBORNOZ ROSAS CARLOS ALBERTO,8,.. Abogado. Especialidad: Derecho Penal.. Nac...,0.000000
...,...,...,...,...
1182,REDER GRANIER CARLOS,363,".. Odontólogo.. Nació, Budapest, Hungría, feb...",0.136799
1183,RODRIGUEZ BOHORQUEZ RAFAEL MARIA,363,".. Odontólogo.. Nació. Guateque, Boyacá, ma...",0.000000
1184,ROZENTAL LERNER SALVADOR,363,".. Odontólogo. Médico.. Nació, Edinita, Besa...",0.172315
1185,SEPULVEDA C. ANTONIO M,364,.. Odontólogo. Especialidad: Cirugía Oral.. N...,0.000000


In [4]:
cat = ["Nació", "Esposa", "Hijos", "Ha sido","Miembro", "Bachillerato", "Estudios profesionales", 
        "Estudios de literatura", "Estudios secundarios", "Estudios de especialización", "Residencia", 
        "Oficina", "Gabinete", "Dirección"]

ori_cat = ["Nació", "Esposa","Esposo", "Hijos", "Ha sido", "Miembro:", "Bachillerato", 
            "Estudios profesionales", "Estudios de literatura", "Estudios secundarios", "Tesis", 
            "Residencia", "Oficina", "Gabinete", "Idiomas", "Autor de", "Ha publicado", "Ha visitado", 
            "Visitado", "Paises visitados", "Su afición", "Aficiónes" "Cables", "Condecoraciones","Telégrafo", 
            "Teléfono", "Es miembro", "Especialidad", "Deportes", "Dirección:", "Estudios de especialización", 
            "Residencia y Gabinete"]

df["info"] = ''
df["profession"]=''
for c in cat[:-3]:
    df[c] = ''
df["birthdate"] = ''
df["party"] = ''

In [5]:
# Load the saved model and predict
output_dir = Path('/content/')
nlp = spacy.load(output_dir)

In [6]:
# Load regular model
nlp_ori = spacy.load("es_core_news_sm")

In [7]:
# Load Colombian place names
colombia = pd.read_csv("cities_colombia.csv") 
cities = colombia["city"]
admin = np.unique(colombia["admin_name"])

In [8]:
def closest_match(text, word_list, threshold = 90):
    """ 
    Determine the word that most closely matches a word
    in the word list.

    Inputs:
    text (str): Text to search through
    word_list (list): List of words to compare against
    
    Returns:
    match (list): Words in word list that matches
    repl (list): Words from text that matched

    """
    search_list = re.findall("("+"|".join(word_list)+")"+'{s<=2:[\p{Lu}]}', text)
    match = [] 
    repl = []
    if search_list:
        for t in search_list:
            ratio = [fuzz.ratio(t, w) for w in word_list]
            if max(ratio) > threshold:
                match.append(word_list[np.argmax(np.array(ratio))])
                repl.append(t)
    return match, repl

In [9]:
to_strip = """'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~—“” """ # Remove trailing punctuation
months = ["enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", 
            "agosto", "septiembre", "octubre", "noviembre", "diciembre"]

for i, text in enumerate(df.section_text):
    # Identify categories
    category = re.findall("("+"|".join(ori_cat)+")" +'{i<=1,s<=2,e<=2}', text) 
    split_point = []
    for c in ori_cat:
        ratio = [fuzz.ratio(c,word) for word in category]
        if max(ratio)>=80:
            split_point.append(ratio.index(max(ratio)))
    split_point.sort()
    category = np.array(category)[split_point]
    
    # Remove duplicate categories
    _, idx = np.unique(category, return_index=True)
    category = category[np.sort(idx)]
    
    # Split text based on categories found
    t = []
    for word in category:
        split_text = text.split(word, 1)
        text = split_text[1]
        t.append(split_text[0])
    t.append(text)

    # Extract profession
    profession= nltk.sent_tokenize(t[0])
    profession = [word.strip(to_strip) for word in profession if len(word) >=1]
    df["profession"][i] = profession

    # Ensure category names match main
    selected_index = []
    selected_cat = []
    for c in cat:
        sim = [fuzz.ratio(c, word) for word in category]
        if max(sim) >=80:
            selected_index.append(sim.index(max(sim)))
            selected_cat.append(c)
    entry = [t[1:][k].strip(to_strip) for k in selected_index]
    info = dict(zip(selected_cat, entry))
    df["info"][i]= info

    if "Nació" in info:
        info_text = info["Nació"].replace("Soltero", "")
        mth_found = re.findall("("+"|".join(months)+")" +'{i<=1,s<=2,e<=2}', info_text)
        if not mth_found:
            mth = "MM"
        else:
            fuzz_ratio = []
            for m in mth_found:
                ratio = [fuzz.ratio(m.strip(to_strip).lower(), word) for word in months]
                fuzz_ratio.append([ratio.index(max(ratio)), max(ratio)])
            idx = fuzz_ratio[np.argmax(np.array(fuzz_ratio), axis = 0)[1]][0]
            mth = "0"+ str(idx+1) if idx <9 else str(idx+1)
            info_text.replace(mth_found[np.argmax(np.array(fuzz_ratio), axis = 0)[1]], "")
        try:
            date = re.search(r"[\d]{1,2}[,\ ]{1}", info_text).group(0)
            if len(date)<2:
                date = "0"+date
        except:
            date = "DD"
        try:
            year = re.search(r"[\d]{4}", info_text).group(0)
        except:
            year = "YYYY"
        df["birthdate"][i] = "".join([year, mth, date]).strip(", ")

        birth_city, repl = closest_match(info_text, cities)
        for r in repl:
            info_text.replace(r, "")

        birth_admin, repl = closest_match(info_text, admin)
        for r in repl:
            info_text.replace(r, "")
        
        bplace = nlp_ori(info_text)
        birthplace = [ent.text for ent in bplace.ents if ent.label_ == "LOC"]
        df["Nació"][i] = list(set(birth_city+birth_admin+birthplace))
        
    if "Hijos" in info:
        children = re.split(",|\ y\ ", info["Hijos"])
        children = [child.strip() for child in children]
        df["Hijos"][i] = children

    if "Esposa" in info:
        spouse = re.split("[.,]", info["Esposa"])
        df["Esposa"][i] = spouse[0]

    party = []
    
    for c in ["Ha sido", "Miembro", "Bachillerato", "Estudios profesionales", 
                "Estudios de literatura", "Estudios secundarios", 
                "Estudios de especialización"]:
        if c in info:
            party+= re.findall("(conservador|liberal)", info[c], flags = re.IGNORECASE)

            attr_nlp = nlp(info[c])
            attr_details = {
                "org": [ent.text for ent in attr_nlp.ents if ent.label_ == "ORGANISATION"],
                "loc": [ent.text for ent in attr_nlp.ents if ent.label_ == "LOCATION"],
                "position": [ent.text for ent in attr_nlp.ents if ent.label_ == "POSITION"],
                "date": [ent.text for ent in attr_nlp.ents if ent.label_ == "DATE"]
            }

            df[c][i] = attr_details

    if party:
        df["party"][i] = party[0].lower()
    else:
        df["party"][i] = "None"

    for attr in ["Residencia","Dirección", "Oficina", "Gabinete"]:
        if attr in info:
            loc = re.split("(Teléfono|Telégrafo|Cables){i<=1,s<=2,e<=2}", info[attr])[0]
            if len(loc)>5:
                df["Residencia"][i] = {"location": loc, "tag": attr}
                break
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["profession"][i] = profession
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["info"][i]= info
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["birthdate"][i] = "".join([year, mth, date]).strip(", ")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Nació"][i] = list(set(birth_city+birth_admin+birthp

In [10]:
df

Unnamed: 0,name,page,section_text,percent_error,info,profession,Nació,Esposa,Hijos,Ha sido,Miembro,Bachillerato,Estudios profesionales,Estudios de literatura,Estudios secundarios,Estudios de especialización,Residencia,birthdate,party
0,ABADIA ARANGO SERGIO,7,.. Abogado. Especialidad: Minas y Petróleos. ...,0.000000,"{'Nació': 'Istmina, Chocó, agosto 10, 1895', '...",[Abogado],"[Istmina, Chocó]",Regina Chamat,"[Paz Cecilia, Reginita, Alma Victoria, Sergio ...",{'org': ['The Bristish Platinum and Gold Corpo...,,{'org': ['Colegio Mayor de Nuestra Señora del ...,,,,,"{'location': 'Avenida 4 N 10-64', 'tag': 'Resi...",18950810,
1,ABAUNZA JIMENEZ JOSE C,7,.. Abogado. Especialidad: Derecho Civil.. Naci...,0.000000,"{'Nació': 'Santana, Boyacá, abril 11923', 'Esp...",[Abogado],"[Boyacá, Santana]",Rosa B Forero,"[Calya Dalila, Belkis Damaris. Carlos José, Na...","{'org': ['Tribunal Superior', 'Intervencionism...",,"{'org': ['Colegio Boyacá'], 'loc': ['Tunja'], ...",,,,,"{'location': 'Santa Rosa de Viterbo. Boyacá', ...",119204DD,
2,ACERO PIMENTEL ENRIQUE,7,.. Abogado. Especialidad: Derecho Administrati...,0.086505,"{'Nació': 'Bogotá, diciembre 1, 1904', 'Esposa...",[Abogado],[Bogotá],Paulina Rodríguez Duarte,"[Fernando Raúl, Gloria, Germán]","{'org': ['Gobierno Cundinamarca', 'Departament...","{'org': ['Academia de Jurisprudencia'], 'loc':...",,"{'org': ['Universidad Libre'], 'loc': [], 'pos...",,,,"{'location': 'y Jefe de Personal de ""El Tiempo...",1904121,
3,AGUILERA CAMACHO ALBERTO,7,.. Abogado. Especialidad: Derecho del Trabajo ...,0.000000,"{'Nació': 'Subachoque, Cundinamarca, noviembre...",[Abogado],"[Subachoque, Cundinamarca]",Esther Blanco,"[Jaime, Hernando]","{'org': ['Colegio de Abogados', 'Primer Congre...","{'org': ['Colegio de Abogados del Trabajo', 'A...",{'org': ['Colegio Mayor de Nuestra Señora del ...,"{'org': [], 'loc': ['Bogotá'], 'position': ['E...",,,,"{'location': 'Carrera 7 N 14-28', 'tag': 'Gabi...",19071121,
4,ALBORNOZ ROSAS CARLOS ALBERTO,8,.. Abogado. Especialidad: Derecho Penal.. Nac...,0.000000,"{'Nació': 'Pasto, marzo 5, 1915. Soltero', 'Ha...",[Abogado],[Pasto],,,"{'org': ['Universidad de Narino', 'Asamblea de...","{'org': ['Centro de Estudios Sociales', 'Coleg...","{'org': ['Colegio Javeriano', 'Colegio de la I...","{'org': ['Universidad de Nariño', 'Universidad...",,,,"{'location': 'Carrera 9 N 13-87', 'tag': 'Gabi...",1915035,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182,REDER GRANIER CARLOS,363,".. Odontólogo.. Nació, Budapest, Hungría, feb...",0.136799,"{'Nació': 'Budapest, Hungría, febrero 14, 1917...",[Odontólogo],[Budapest],Yda Lander,,"{'org': [], 'loc': [], 'position': [], 'date':...",,,"{'org': ['Universidad Nacional'], 'loc': [], '...",,"{'org': [], 'loc': [], 'position': ['Liceo Mod...",,"{'location': 'carrera 15 N 33-17', 'tag': 'Res...",19170214,
1183,RODRIGUEZ BOHORQUEZ RAFAEL MARIA,363,".. Odontólogo.. Nació. Guateque, Boyacá, ma...",0.000000,"{'Nació': 'Guateque, Boyacá, mayo 5, 1987', 'E...",[Odontólogo],"[Boyacá, Guateque]",Victoria Cristo Vélez,,"{'org': ['Asamblea Cundinamarca', 'Fundador Bo...",,"{'org': ['Colegio de Ramirez', 'Universidad Re...","{'org': ['Escuela Dental Nacional'], 'loc': []...",,,,"{'location': 'carrera 13 N 23-05', 'tag': 'Gab...",1987055,
1184,ROZENTAL LERNER SALVADOR,363,".. Odontólogo. Médico.. Nació, Edinita, Besa...",0.172315,"{'Nació': 'Edinita, Besarabia, febrero 25 1912...","[Odontólogo, Médico]",[Besarabia],Ruthy Klinger,"[Israel León, Manuel Eduardo]","{'org': [], 'loc': [], 'position': [], 'date':...",{'org': ['Universidad de Santiago de Compostel...,"{'org': ['Instituto La Coruña'], 'loc': [], 'p...","{'org': ['Santiago de Compostela'], 'loc': ['M...",,,,{'location': 'de Enlace de Israel ante el Gobi...,19120225,
1185,SEPULVEDA C. ANTONIO M,364,.. Odontólogo. Especialidad: Cirugía Oral.. N...,0.000000,"{'Nació': 'Piedecuesta, Santander, enero 11. 1...",[Odontólogo],"[Piedecuesta, Santander]",Beatriz Vanegas Narváez,,"{'org': [], 'loc': [], 'position': ['Secretari...",,"{'org': ['Liceo Comercial', 'Colegio Univercit...","{'org': ['Colegio Dental'], 'loc': ['Bogotá', ...",,,,"{'location': 'de ""El Tiempo"", 1932; Representa...",189501DD,


In [11]:
df.to_csv("results.csv", encoding = 'utf-8-sig', index=False)
df.to_pickle("results.pkl")