# Introduction

This notebook explores the identified location entities and looks at how to extract addresses by joining them.

In [2]:
import pandas as pd

In [6]:
# Load the entities CSV (path is relative to the notebook's directory).
entities = pd.read_csv("../files/entities.csv")
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(entities.shape)
entities.head()

(997, 6)


Unnamed: 0,art_id,sent_id,cs_id,pos,entity,dataset
0,3,0,0,0,SAN PEDRO SULA,train
1,3,1,1,28,Sula,train
2,3,2,2,13,San Pedro Sula,train
3,3,3,3,38,Lomas del,train
4,3,5,5,21,la Madre,train


In [32]:
# Summarise the data: number of articles and number of distinct entity strings.
print("number of articles: %i" % len(entities.art_id.unique()))
print(entities.art_id.unique()[:10])

print("number of unique entities: %i" % len(entities.entity.unique()))

# Add a lower-cased copy of each entity in a new "lower" column.
# Only byte strings are decoded: on Python 2 read_csv yields byte strings that
# must be decoded before lower() handles accented letters; text strings
# (Python 3 str, Python 2 unicode) have no need for decoding.
entities.loc[:, "lower"] = entities.entity.apply(
    lambda x: (x.decode("utf-8") if isinstance(x, bytes) else x).lower()
)
print("number of unique lower entities: %i" % len(entities["lower"].unique()))

entities["lower"].sort_values().unique()

number of articles: 205
[ 3  5  9 11 12 13 20 24 30 32]
number of unique entities: 393
number of unique lower entities: 372


array([u'14 de enero', u'20 de noviembre', u'21 de octubre',
       u'24 de abril', u'3 de mayo', u'6', u'abajo', u'agua',
       u'agua caliente', u'alemania', u'almoloya de ju\xe1rez', u'altos',
       u'alvarado', u'amarateca', u'anillo perif\xe9rico', u'antioquia',
       u'apaguiz', u'atl\xe1ntico', u'atl\xe1ntida', u'auka', u'azacualpa',
       u'barandillas', u'bel\xe9n', u'bengala', u'bogot\xe1', u'boulai',
       u'brasilia', u'brisas', u'brisas del merend\xf3n de',
       u'buenos aires', u'caba\xf1as', u'caloto', u'calpules',
       u'campo cielo', u'cana\xe1n', u'cartagena', u'car\xedas',
       u'casa presidencial', u'catacamas', u'cauca', u'centro',
       u'centro de rehabilitaci\xf3n de menores',
       u'centro de rehabilitaci\xf3n integral telet\xf3n', u'centro penal',
       u'cerrito lindo', u'cerro grande', u'chamelec\xf3n', u'chiquila',
       u'chiquimula', u'choloma', u'choluteca', u'ciudad de m\xe9xico',
       u'coatzacoalcos', u'cofrad\xeda', u'cololaca', u'c

In [17]:
def getArticleEntities(df, art_ids):
    """Return the rows of ``df`` whose ``art_id`` is one of ``art_ids``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``art_id`` column.
    art_ids : list-like
        Article ids to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    belongs_to_selection = df["art_id"].isin(art_ids)
    return df.loc[belongs_to_selection]

In [33]:
# Inspect the entity rows belonging to the first ten article ids returned by unique().
getArticleEntities(entities, entities.art_id.unique()[:10] )

Unnamed: 0,art_id,sent_id,cs_id,pos,entity,dataset,lower
0,3,0,0,0,SAN PEDRO SULA,train,san pedro sula
1,3,1,1,28,Sula,train,sula
2,3,2,2,13,San Pedro Sula,train,san pedro sula
3,3,3,3,38,Lomas del,train,lomas del
4,3,5,5,21,la Madre,train,la madre
5,3,7,7,11,San Pedro Sula,train,san pedro sula
6,5,0,10,0,TEGUCIGALPA,train,tegucigalpa
7,9,0,21,0,TEGUCIGALPA,train,tegucigalpa
8,9,8,29,9,Kennedy,train,kennedy
9,11,0,40,0,PUERTO CORTES,train,puerto cortes


# Use a gazetteer

In [60]:
# Hand-written gazetteer of lower-cased place names.
# Entries are compared by exact list membership, so spelling and accents must
# agree with the lower-cased entity text.
countries = [
    u"honduras",
    u"el salvador",
    u"eeuu",
    u"mexico",
    u"colombia",
]

states = [
    u"francisco morazán",
    u"cortés",
    u"yoro",
    u"olancho",
]

# Note: "yoro" is listed both as a state and as a city.
cities = [
    u"tegucigalpa",
    u"san pedro sula",
    u"yoro",
    u"la ceiba",
    u"juticalpa",
    u"valle de ángeles",
    u"santa lucía",
    u"puerto cortes",
    u"choloma",
]

In [35]:
# Inspect the entities of a single article (id 3).
getArticleEntities(entities, [3] )

Unnamed: 0,art_id,sent_id,cs_id,pos,entity,dataset,lower
0,3,0,0,0,SAN PEDRO SULA,train,san pedro sula
1,3,1,1,28,Sula,train,sula
2,3,2,2,13,San Pedro Sula,train,san pedro sula
3,3,3,3,38,Lomas del,train,lomas del
4,3,5,5,21,la Madre,train,la madre
5,3,7,7,11,San Pedro Sula,train,san pedro sula


# Naive / simplistic Approach
**Assumptions:**
- Each article is related to only one City - Department - Country
- Every address is related to the main city-department-country



In [79]:

def getAddressesNaive(df, art_id, country_names=None, state_names=None, city_names=None):
    """Build naive address strings for one article.

    Every entity of the article is looked up in the gazetteer. Entities that
    are a known country, state or city set the article-wide context (the last
    match of each kind wins); every other entity is treated as an address and
    gets that context appended as ``"address, city, state, country"``.
    Context levels that were not found are skipped.

    A name is checked against countries first, then states, then cities, so a
    name present in several lists is classified by the first list that has it.

    Parameters
    ----------
    df : pandas.DataFrame
        Entities frame with at least the columns ``art_id`` and ``lower``.
    art_id :
        Id of the article to process.
    country_names, state_names, city_names : collection of str, optional
        Gazetteer entries to match against. When omitted, the notebook-level
        ``countries``, ``states`` and ``cities`` lists are used.

    Returns
    -------
    list of str
        One string per address entity. If the article has no address entity,
        a single string holding only the context (``""`` if nothing matched).

    Side effects
    ------------
    Prints how many country / state / city mentions were found.
    """
    # Fall back to the notebook-level gazetteer lists when none are passed in.
    if country_names is None:
        country_names = countries
    if state_names is None:
        state_names = states
    if city_names is None:
        city_names = cities

    art_entities = df[df["art_id"] == art_id]

    country = ""
    state = ""
    city = ""

    adds = []
    counts = {
        "country": 0,
        "state": 0,
        "city": 0
    }
    # Select the column by label: attribute access (``row.lower``) would
    # silently resolve to a method if pandas ever defined one with that name.
    for name in art_entities["lower"]:
        if name in country_names:
            country = name
            counts["country"] += 1
        elif name in state_names:
            state = name
            counts["state"] += 1
        elif name in city_names:
            # Store the city in ``city``; writing it to ``state`` would
            # overwrite any state found and leave the city level empty.
            city = name
            counts["city"] += 1
        else:
            adds.append(name)

    # Context ordered from most to least specific, without the missing levels.
    context = [part for part in (city, state, country) if part]

    if adds:
        adds = [", ".join([address] + context) for address in adds]
    else:
        # No address-like entity: return the bare context, with no leading
        # separator.
        adds = [", ".join(context)]

    print(counts)

    return adds
            
    

In [80]:
# Article ids to try (the first ten in the dataset): 3  5  9 11 12 13 20 24 30 32
getAddressesNaive(entities, 3)

{'country': 0, 'state': 0, 'city': 3}


[u'sula, san pedro sula',
 u'lomas del, san pedro sula',
 u'la madre, san pedro sula']

## some questions to ask

- What is the (raw) end result that I want to achieve?  => Answer: examples ... Art 123: [ "Kennedy, Tegucigalpa" ]
- How to get those results?  
   => Finding patterns? Using gazetteers? (What about new, unknown cities, or cities that share a name?)  
   => Changing the IOB tags? i.e.  B-Zone, B-City, B-State, B-Country
   => Research more