# Intro

This notebook generates the end-result labels (i.e. the addresses that will be sent to the Google Maps API). The generated dataset will be used to test/score the end-to-end pipeline. The inputs are the entities and relationships that are labeled in the criminal articles dataset.

In [17]:
import pandas as pd
import numpy as np
import ast 
import unicodedata

In [24]:
# Load the labeled criminal-articles dataset (path is relative to the notebook's working directory).
articles = pd.read_csv("../files/criminal_articles.csv")
# Preview the first rows.
articles.head()

Unnamed: 0,article_id,title,content,relationships
0,3,"[[u'Citan', u'none'], [u'a', u'none'], [u'11',...","[[[u'SAN', u'B-City'], [u'PEDRO', u'I-City'], ...","[[{u'tag': u'B-Col', u'word': u'lomas del carm..."
1,5,"[[u'DEI', u'none'], [u'pide', u'none'], [u'den...","[[[u'TEGUCIGALPA', u'B-City']], [[u'-', u'none...",[]
2,9,"[[u'Alcald\xeda', u'none'], [u'intensifica', u...","[[[u'TEGUCIGALPA', u'B-City']], [[u'-', u'none...","[[{u'tag': u'B-Col', u'word': u'kennedy'}, {u'..."
3,11,"[[u'Pasajeros', u'none'], [u'asaltantes', u'no...","[[[u'PUERTO', u'B-City'], [u'CORTES', u'I-City...","[[{u'tag': u'B-City', u'word': u'puerto cortes..."
4,12,"[[u'Fallece', u'none'], [u'comerciante', u'non...","[[[u'JUTICALPA', u'B-City'], [u',', u'none'], ...","[[{u'tag': u'B-City', u'word': u'juticalpa'}, ..."


# preprocess

In [25]:
# Helper functions that strip accents and other non-ASCII characters, so that
# spelling variants of the same name (with/without accents) compare equal.
def to_ascii(s):
    """Return `s` with accents and other non-ASCII characters removed.

    The text is first decomposed with NFKD so that an accented letter becomes
    its base letter followed by a combining mark; encoding to ASCII with the
    'ignore' error handler then drops every character that has no ASCII form.

    s -- a unicode string.
    Returns the value produced by `.encode('ascii', 'ignore')`, i.e. an
    ASCII-only byte string.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only

def sent_to_ascii(sentences):
    """ASCII-fold the word of every token, in place.

    sentences -- a list of sentences; each sentence is a list of mutable
                 [word, tag] items. Only item 0 (the word) is rewritten.
    Returns the same `sentences` object that was passed in.
    """
    for sentence in sentences:
        for token in sentence:
            # token is the very list stored in the sentence, so this updates the input.
            token[0] = to_ascii(token[0])
    return sentences
            
def rel_to_ascii(relationships):
    """ASCII-fold the "word" of both endpoints of every relationship, in place.

    relationships -- a list of relationships; items 0 and 1 of each one are
                     dicts that carry a "word" key.
    Returns the same `relationships` object that was passed in.
    """
    for rel in relationships:
        for position in (0, 1):
            endpoint = rel[position]
            endpoint["word"] = to_ascii(endpoint["word"])
    return relationships

In [26]:
# The three annotated columns are read from the CSV as strings holding Python
# literals; parse them back into nested lists/dicts.
articles.loc[:, "title"] = articles.title.apply(ast.literal_eval)
articles.loc[:, "content"] = articles.content.apply(ast.literal_eval)
articles.loc[:, "relationships"] = articles.relationships.apply(ast.literal_eval)

# Strip accents everywhere. A title is a single sentence, so it is wrapped in a
# list to give it the same list-of-sentences shape as the content.
articles.title = articles.title.apply(lambda x: x if x is None else sent_to_ascii([x]))
articles.content = articles.content.apply(lambda x: x if x is None else sent_to_ascii(x))
articles.relationships = articles.relationships.apply(lambda x: x if x is None else rel_to_ascii(x))

# Full text = title sentence followed by the content sentences (row-wise list concatenation).
articles.loc[:, "full_text"] = articles.title + articles.content

In [27]:
# format relationships
def format_relationship_tags(relationships):
    """Strip the BIO prefix ("B-" / "I-") from both endpoint tags, in place.

    Only tags that actually start with a BIO prefix are shortened, so the
    function is idempotent: calling it again (e.g. by re-running the notebook
    cell) leaves already-formatted tags such as "Col" untouched instead of
    slicing two more characters off them.

    relationships -- a list of relationships; items 0 and 1 of each one are
                     dicts that carry a "tag" key.
    Returns the same `relationships` object that was passed in.
    """
    bio_prefixes = ("B-", "I-")
    for rel in relationships:
        for position in (0, 1):
            endpoint = rel[position]
            tag = endpoint["tag"]
            if tag.startswith(bio_prefixes):
                endpoint["tag"] = tag[2:]
    return relationships

# format_relationship_tags mutates each relationship list in place, so the
# Series returned by apply() is discarded; the column itself already holds the result.
articles.relationships.apply(lambda x: format_relationship_tags(x))
articles.relationships.head()

0    [[{u'tag': u'Col', u'word': u'lomas del carmen...
1                                                   []
2    [[{u'tag': u'Col', u'word': u'kennedy'}, {u'ta...
3    [[{u'tag': u'City', u'word': u'puerto cortes'}...
4    [[{u'tag': u'City', u'word': u'juticalpa'}, {u...
Name: relationships, dtype: object

# Extract Entities

In [28]:
def getEntities(content):
    """Collect the unique tagged entities of a text.

    content -- a list of sentences; each sentence is a list of [word, tag]
               items whose tag is BIO-style ("B-City", "I-City", "none", ...).

    A tag starting with "B" opens a new entity; a tag starting with "I"
    appends its lower-cased word to the most recently opened entity. Any other
    tag (including an empty one) is ignored.

    Returns a list of {"word": ..., "tag": ...} dicts, with the BIO prefix
    removed from the tag, in order of first appearance and without duplicates.
    """
    entities = []
    for sent in content:
        for word in sent:
            text, tag = word[0], word[1]
            if tag.startswith("B"):
                entities.append({"word": text.lower(), "tag": tag[2:]})
            elif tag.startswith("I"):
                if entities:
                    entities[-1]["word"] += " " + text.lower()
                else:
                    # Continuation token with nothing to continue (the text starts
                    # on an "I-" tag): open a new entity instead of crashing.
                    entities.append({"word": text.lower(), "tag": tag[2:]})

    # Eliminate duplicates while keeping first-seen order. The dicts are not
    # hashable, so a (word, tag) tuple is used as the membership key.
    seen = set()
    unique_entities = []
    for itm in entities:
        key = (itm["word"], itm["tag"])
        if key not in seen:
            seen.add(key)
            unique_entities.append(itm)

    return unique_entities

In [29]:
# Extract the unique tagged entities of each article's full text (title + content) into an "entities" column.
articles.loc[:,"entities"] = articles.full_text.apply(lambda x: getEntities(x))

In [59]:
#change some entities that are equivalent
def entity_equivalencies(entities):
    """Replace entity names by their canonical equivalent, in place.

    entities -- a list whose items are either entity dicts as built by
                getEntities ({"word": ..., "tag": ...}) or plain name strings.

    For a dict only its "word" is rewritten, so the tag is preserved; a plain
    string item is replaced by the canonical string. Names without an entry in
    the equivalence table are left unchanged.

    Returns the same `entities` object that was passed in.
    """
    eq_map = {
        "comayaguela":"tegucigalpa",
        "distrito central":"tegucigalpa",
        "eeuu":"estados unidos",
        "ee.uu.":"estados unidos",
        "ee. uu.":"estados unidos",
    }
    for idx, entity in enumerate(entities):
        if isinstance(entity, dict):
            # A dict is unhashable, so it cannot be looked up in eq_map directly;
            # look up its name instead and keep the rest of the dict.
            canonical = eq_map.get(entity.get("word"))
            if canonical is not None:
                entity["word"] = canonical
        elif entity in eq_map:
            entities[idx] = eq_map[entity]

    return entities

def rel_equivalencies(relationships):
    """Replace the names used in relationships by their canonical equivalent, in place.

    relationships -- a list of relationships; items 0 and 1 of each one are
                     dicts that carry a "word" key.

    Words without an entry in the equivalence table are left unchanged.
    Returns the same `relationships` object that was passed in.
    """
    eq_map = {
        "comayaguela":"tegucigalpa",
        "distrito central":"tegucigalpa",
        "eeuu":"estados unidos",
        "ee.uu.":"estados unidos",
        "ee. uu.":"estados unidos",
    }

    for rel in relationships:
        for position in (0, 1):
            endpoint = rel[position]
            canonical = eq_map.get(endpoint["word"])
            if canonical is not None:
                endpoint["word"] = canonical

    return relationships
    
    
            


# NOTE(review): entity_equivalencies / rel_equivalencies are defined above but are
# never called in the visible part of this notebook. The commented-out attempt
# below refers to an eq_map that only exists as a local inside those functions
# and tests a whole entity list against it.
# TODO confirm whether the equivalences should be applied before singles/paths are computed.
# articles.loc[:,"entities"] = articles.entities.apply(lambda x: eq_map[x] if x in eq_map else x)
    
# Print the entities of the rows up to and including index label 10 (.loc slices by label, end inclusive).
for _, article  in articles.loc[:10].iterrows():
    print article.entities

[{'tag': u'City', 'word': 'san pedro sula'}, {'tag': u'Col', 'word': 'lomas del carmen'}, {'tag': u'Zone', 'word': 'monumento a la madre'}, {'tag': u'Zone', 'word': 'primera avenida'}, {'tag': u'Zone', 'word': 'parque central'}]
[{'tag': u'City', 'word': 'tegucigalpa'}]
[{'tag': u'City', 'word': 'tegucigalpa'}, {'tag': u'Col', 'word': 'kennedy'}, {'tag': u'City', 'word': 'distrito central'}]
[{'tag': u'City', 'word': 'puerto cortes'}, {'tag': u'State', 'word': 'cortes'}, {'tag': u'City', 'word': 'choloma'}, {'tag': u'Col', 'word': 'lopez arellano'}, {'tag': u'Zone', 'word': 'trincheras'}]
[{'tag': u'City', 'word': 'juticalpa'}, {'tag': u'State', 'word': 'olancho'}, {'tag': u'Country', 'word': 'honduras'}, {'tag': u'City', 'word': 'san esteban'}]
[{'tag': u'City', 'word': 'el negrito'}, {'tag': u'State', 'word': 'yoro'}, {'tag': u'Zone', 'word': 'la vuelta del sapo'}]
[{'tag': u'State', 'word': 'colon'}, {'tag': u'City', 'word': 'sonaguera'}]
[{'tag': u'City', 'word': 'valle de angeles'

# get Single Entities

In [32]:
def find_singles(df):
    """Add, per article, the entities that take part in no relationship.

    df -- a DataFrame with a "relationships" column (lists of
          [child, parent] items) and an "entities" column (lists of entities).

    An entity is a "single" when it is equal to neither the child nor the
    parent of any relationship of the same article. Two columns are written
    on `df` itself: "singles" (the list of such entities) and "singles_count"
    (its length).

    Returns the same `df` object that was passed in.
    """
    singles_per_article = []
    for _, row in df.iterrows():
        children = [rel[0] for rel in row.relationships]
        parents = [rel[1] for rel in row.relationships]
        linked = children + parents
        singles_per_article.append(
            [candidate for candidate in row.entities if candidate not in linked]
        )

    df.loc[:, "singles"] = singles_per_article
    df.loc[:, "singles_count"] = df.singles.apply(len)
    return df

# find_singles writes the "singles" and "singles_count" columns on `articles` itself and returns it.
# Only the last expression of a cell is displayed, so the two-column preview on the next line is not shown.
find_singles(articles)[["singles","singles_count" ]].head()
articles.head()

Unnamed: 0,article_id,title,content,relationships,full_text,entities,singles,singles_count
0,3,"[[[Citan, none], [a, none], [11, none], [perso...","[[[SAN, B-City], [PEDRO, I-City], [SULA, I-Cit...","[[{u'tag': u'Col', u'word': u'lomas del carmen...","[[[Citan, none], [a, none], [11, none], [perso...","[{u'tag': u'City', u'word': u'san pedro sula'}...",[],0
1,5,"[[[DEI, none], [pide, none], [denunciar, none]...","[[[TEGUCIGALPA, B-City]], [[-, none], [Autorid...",[],"[[[DEI, none], [pide, none], [denunciar, none]...","[{u'tag': u'City', u'word': u'tegucigalpa'}]","[{u'tag': u'City', u'word': u'tegucigalpa'}]",1
2,9,"[[[Alcaldia, none], [intensifica, none], [oper...","[[[TEGUCIGALPA, B-City]], [[-, none], [Pese, n...","[[{u'tag': u'Col', u'word': u'kennedy'}, {u'ta...","[[[Alcaldia, none], [intensifica, none], [oper...","[{u'tag': u'City', u'word': u'tegucigalpa'}, {...","[{u'tag': u'City', u'word': u'distrito central'}]",1
3,11,"[[[Pasajeros, none], [asaltantes, none], [acri...","[[[PUERTO, B-City], [CORTES, I-City], [,, none...","[[{u'tag': u'City', u'word': u'puerto cortes'}...","[[[Pasajeros, none], [asaltantes, none], [acri...","[{u'tag': u'City', u'word': u'puerto cortes'},...",[],0
4,12,"[[[Fallece, none], [comerciante, none], [olanc...","[[[JUTICALPA, B-City], [,, none], [Olancho, B-...","[[{u'tag': u'City', u'word': u'juticalpa'}, {u...","[[[Fallece, none], [comerciante, none], [olanc...","[{u'tag': u'City', u'word': u'juticalpa'}, {u'...",[],0


# get Paths

In [79]:
# get to the root
def getPath(ix, children, parents):
    """Follow the parent links from relationship `ix` up to the top of its chain.

    ix       -- index of the starting relationship.
    children -- children[i] is the child entity dict of relationship i.
    parents  -- parents[i] is the parent entity dict of relationship i.

    Returns a 5-tuple (tag_path, word_path, incomplete, missing_country,
    missing_city):
      tag_path        -- the tags along the chain joined with "-".
      word_path       -- the words along the chain joined with ", ".
      missing_country -- 1 when the top-most parent is not tagged "Country".
      missing_city    -- 1 when the top-most child is not a Country/State/City
                         and its parent is not a City.
      incomplete      -- 1 when either of the two flags above is 1.

    NOTE(review): the flags are computed only for the last hop of the chain and
    passed down unchanged, so e.g. a Col -> State -> Country chain comes back
    with missing_city == 0. TODO confirm this is intended.
    NOTE(review): the recursion has no cycle guard; relationships that form a
    loop would recurse until the interpreter's recursion limit is hit.
    """
    node = children[ix]
    tag_path = node["tag"]
    word_path = node["word"]
    parent = parents[ix]

    if parent in children:
        # The parent is itself the child of another relationship: keep climbing
        # (index() picks the first relationship in which it is the child).
        upper_tags, upper_words, incomplete, missing_country, missing_city = getPath(
            children.index(parent), children, parents)
        return (tag_path + "-" + upper_tags,
                word_path + ", " + upper_words,
                incomplete, missing_country, missing_city)

    # Top of the chain: close the path with the parent and evaluate completeness.
    tag_path = tag_path + "-" + parent["tag"]
    word_path = word_path + ", " + parent["word"]
    missing_country = 0 if parent["tag"] == "Country" else 1
    has_city_gap = (node["tag"] not in ["Country", "State", "City"]
                    and parent["tag"] != "City")
    missing_city = 1 if has_city_gap else 0
    incomplete = 1 if (missing_country or missing_city) else 0

    return tag_path, word_path, incomplete, missing_country, missing_city

# Build one row per expected address: (article id, tag path, word path, incomplete, missing_country, missing_city).
data = []
for _, article in articles.iterrows():
    # Split every [child, parent] relationship into two parallel lists.
    children = [ x[0] for x in article.relationships ]
    parents = [ x[1] for x in article.relationships ]

    for ix, entity in enumerate(children):        
        # Only start a path from a leaf, i.e. a child that is nobody's parent;
        # intermediate entities are already covered by the path of their leaf.
        if entity not in parents:
            data.append((article.article_id,) +getPath(ix, children, parents))
    
    # Singles: entities that appear in no relationship become one-element addresses.
    for entity in article.singles:
        # Start from "everything missing" and clear the flags the entity's own tag satisfies.
        missing_country = 1
        missing_city = 1
        incomplete = 1
        if entity["tag"] == "Country":
            # NOTE(review): a lone Country is marked complete while missing_city stays 1 — confirm intended.
            missing_country = 0
            incomplete = 0
        elif entity["tag"] == "City":
            missing_city = 0
        
        data.append((article.article_id,  entity["tag"], entity["word"], incomplete, missing_country, missing_city))

# "shape" holds the tag path and "address" the word path.
addresses = pd.DataFrame(data,columns=["art_id","shape","address","incomplete", "missing_country", "missing_city"])
addresses.head()

Unnamed: 0,art_id,shape,address,incomplete,missing_country,missing_city
0,3,Col-City,"lomas del carmen, san pedro sula",1,1,0
1,3,Zone-City,"monumento a la madre, san pedro sula",1,1,0
2,3,Zone-City,"primera avenida, san pedro sula",1,1,0
3,3,Zone-City,"parque central, san pedro sula",1,1,0
4,5,City,tegucigalpa,1,1,0


In [80]:
# Inspect the addresses flagged as lacking a country.
addresses[addresses["missing_country"]==1]

Unnamed: 0,art_id,shape,address,incomplete,missing_country,missing_city
0,3,Col-City,"lomas del carmen, san pedro sula",1,1,0
1,3,Zone-City,"monumento a la madre, san pedro sula",1,1,0
2,3,Zone-City,"primera avenida, san pedro sula",1,1,0
3,3,Zone-City,"parque central, san pedro sula",1,1,0
4,5,City,tegucigalpa,1,1,0
5,9,Col-City,"kennedy, tegucigalpa",1,1,0
6,9,City,distrito central,1,1,0
7,11,Col-City-State,"lopez arellano, puerto cortes, cortes",1,1,0
8,11,Zone-City-State,"trincheras, puerto cortes, cortes",1,1,0
9,11,City-State,"choloma, cortes",1,1,0


# Save the Addresses

In [84]:
# Write the expected addresses to disk; index=False leaves the row index out of the file.
addresses.to_csv("../files/expected_addresses.csv", index=False)

In [85]:
# Reload the file that was just written and preview it.
addr_2 = pd.read_csv("../files/expected_addresses.csv")
addr_2.head()

Unnamed: 0,art_id,shape,address,incomplete,missing_country,missing_city
0,3,Col-City,"lomas del carmen, san pedro sula",1,1,0
1,3,Zone-City,"monumento a la madre, san pedro sula",1,1,0
2,3,Zone-City,"primera avenida, san pedro sula",1,1,0
3,3,Zone-City,"parque central, san pedro sula",1,1,0
4,5,City,tegucigalpa,1,1,0
