# 0. Intro

Analyze the new data and compare it with the old data.

In [63]:
import pandas as pd
import numpy as np
import unicodedata

# 1. Documents
Analyze the documents (articles) file 

In [2]:
# Load the documents (articles) table from its CSV file
documents = pd.read_csv("../files/documents.csv")

In [3]:
# Preview the first rows of the documents table
documents.head()

Unnamed: 0,id,title,content,category,date
0,0,Auditoría revela irregularidades en el Parlacen,GUATEMALA.- Una fiscalización de la Contralorí...,Other,"28 Dic, 2009 - 7:27 pm"
1,1,Suspendidas las citas en Hospital Escuela,TEGUCIGALPA.- Una misteriosa obstrucción del s...,Other,"28 Dic, 2009 - 7:32 pm"
2,2,Mariscos contaminados alarman a los “porteños”,"PUERTO CORTES, Cortés.- Alarmados se encuentra...",Other,"28 Dic, 2009 - 8:25 pm"
3,3,Citan a 11 personas por vender pólvora,SAN PEDRO SULA.- Hasta el momento ocho bodegas...,Criminal,"28 Dic, 2009 - 8:26 pm"
4,4,Con compra de granos se paliaría hambruna en e...,TEGUCIGALPA.- No llueve hace cuatro meses y la...,Other,"29 Dic, 2009 - 1:00 am"


## 1.1 Review the category distribution

In [15]:
def get_dist(df):
    """Print the number of rows in ``df`` and the percentage share of each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``category`` column.

    Returns
    -------
    None
        The row count and the per-category percentages are written to stdout.
    """
    total = df.shape[0]
    # Single-argument print(...) produces the same output under Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(total)
    print(df.category.value_counts() * 100.0 / total)
    
# Category distribution for the whole table, then for the old and new parts.
# .loc slices by label and includes the end label, so [:1999] covers labels 0..1999.
# print(...) with one argument gives the same output under Python 2 and 3;
# print("") replaces the bare `print` statement that emitted a blank line.
print("- Total")
get_dist(documents)
print("")
print("- Old")
get_dist(documents.loc[:1999])
print("")
print("- New")
get_dist(documents.loc[2000:])



- Total
2500
Other       87.64
Criminal    12.36
Name: category, dtype: float64

- Old
2000
Other       88.05
Criminal    11.95
Name: category, dtype: float64

- New
500
Other       86.0
Criminal    14.0
Name: category, dtype: float64


In [13]:
# Sanity check of the "old" slice size (.loc label slicing includes the end label)
documents.loc[:1999].shape

(2000, 5)

# 2. Criminal article addresses

In [66]:
import ast 
# Load the criminal-articles table from its CSV file and preview the first row
criminal = pd.read_csv("../files/criminal_articles.csv")
criminal.head(1)

Unnamed: 0,article_id,title,content,relationships
0,3,"[[u'Citan', u'none'], [u'a', u'none'], [u'11',...","[[[u'SAN', u'B-City'], [u'PEDRO', u'I-City'], ...","[[{u'tag': u'B-Col', u'word': u'lomas del carm..."


In [61]:
# Very useful function to avoid misspelling problems caused by accented characters.
def to_ascii(s):
    """Return ``s`` as an ASCII byte string with accents and other non-ASCII marks removed.

    The text is NFKD-normalized, which splits accented characters into a base
    character plus combining marks, and then encoded to ASCII while ignoring
    whatever cannot be encoded.

    Parameters
    ----------
    s : unicode text or byte string
        Byte strings are decoded as UTF-8 first (undecodable bytes are dropped).
        This lets the function accept its own output, so code that converts
        values in place can be run again without a TypeError.

    Returns
    -------
    An ASCII-only byte string (``str`` on Python 2, ``bytes`` on Python 3).
    """
    if isinstance(s, bytes):
        # unicodedata.normalize only accepts unicode text
        s = s.decode('utf-8', 'ignore')
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

In [67]:
# These columns are stored in the CSV as Python-literal strings; parse them back
# into Python objects. Values that are already lists are kept unchanged so the
# cell can be executed again without ast.literal_eval failing on parsed data.
for column in ("title", "content", "relationships"):
    criminal[column] = criminal[column].apply(
        lambda value: value if isinstance(value, list) else ast.literal_eval(value)
    )

def rel_to_ascii(relationships):
    """Convert the ``"word"`` of both nodes of every relationship pair with ``to_ascii``.

    The node dictionaries are modified in place; the same ``relationships``
    object is returned for convenience.
    """
    for pair in relationships:
        # Only the first two entries of a pair are touched
        for slot in (0, 1):
            node = pair[slot]
            node["word"] = to_ascii(node["word"])
    return relationships

# Run rel_to_ascii over the relationships of every article, then preview the table
criminal.relationships = criminal.relationships.apply(rel_to_ascii)

criminal.head()

Unnamed: 0,article_id,title,content,relationships
0,3,"[[Citan, none], [a, none], [11, none], [person...","[[[SAN, B-City], [PEDRO, I-City], [SULA, I-Cit...","[[{u'tag': u'B-Col', u'word': u'lomas del carm..."
1,5,"[[DEI, none], [pide, none], [denunciar, none],...","[[[TEGUCIGALPA, B-City]], [[-, none], [Autorid...",[]
2,9,"[[Alcaldía, none], [intensifica, none], [opera...","[[[TEGUCIGALPA, B-City]], [[-, none], [Pese, n...","[[{u'tag': u'B-Col', u'word': u'kennedy'}, {u'..."
3,11,"[[Pasajeros, none], [asaltantes, none], [acrib...","[[[PUERTO, B-City], [CORTES, I-City], [,, none...","[[{u'tag': u'B-City', u'word': u'puerto cortes..."
4,12,"[[Fallece, none], [comerciante, none], [olanch...","[[[JUTICALPA, B-City], [,, none], [Olancho, B-...","[[{u'tag': u'B-City', u'word': u'juticalpa'}, ..."


## 2.1 Extract addresses using relationships

In [64]:
def getEntities(content):
    """Collect the tagged entities of an article as lower-cased ASCII strings.

    Parameters
    ----------
    content : iterable of sentences
        Each sentence is an iterable of ``[token, tag]`` pairs. A tag starting
        with "B" opens a new entity and a tag starting with "I" extends the
        most recently opened one; every other tag is skipped.

    Returns
    -------
    list
        The entities in order of appearance (duplicates included), with the
        tokens of one entity joined by single spaces.
    """
    entities = []
    for sent in content:
        for word in sent:
            token, tag = word[0], word[1]
            # Slicing (instead of tag[0]) tolerates an empty tag string
            prefix = tag[:1]
            if prefix not in ("B", "I"):
                continue
            text = to_ascii(token.lower())
            if prefix == "B" or not entities:
                # An "I" tag with nothing to extend starts a new entity
                # instead of raising IndexError on entities[-1].
                entities.append(text)
            else:
                entities[-1] += " " + text
    return entities

def eliminate_duplicates(entities):
    """Return a new list with the first occurrence of every item, in original order.

    The input is not modified. Membership is checked against a list, so the
    items do not need to be hashable.
    """
    unique = []
    for candidate in entities:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique

# Try both helpers on the first article; `entities` keeps the list before de-duplication
entities = getEntities(criminal.loc[0].content)
eliminate_duplicates(entities)

['san pedro sula',
 'lomas del carmen',
 'monumento a la madre',
 'primera avenida',
 'parque central']

In [68]:
# Show every relationship pair of the first article.
# print(rel) with a single argument gives the same output under Python 2 and 3.
for rel in criminal.loc[0].relationships:
    print(rel)

[{u'tag': u'B-Col', u'word': 'lomas del carmen'}, {u'tag': u'B-City', u'word': 'san pedro sula'}]
[{u'tag': u'B-Zone', u'word': 'monumento a la madre'}, {u'tag': u'B-City', u'word': 'san pedro sula'}]
[{u'tag': u'B-Zone', u'word': 'primera avenida'}, {u'tag': u'B-City', u'word': 'san pedro sula'}]
[{u'tag': u'B-Zone', u'word': 'parque central'}, {u'tag': u'B-City', u'word': 'san pedro sula'}]


In [38]:
def get_singles(entities, relationships):
    """Return the entities that do not take part in any relationship.

    Parameters
    ----------
    entities : list
        Entity names to filter.
    relationships : list
        Pairs of nodes; the ``"word"`` of the first two nodes of each pair is
        treated as "in a relationship".

    Returns
    -------
    list
        The entities, in their original order, whose name matches no
        relationship word.
    """
    related_words = []
    for pair in relationships:
        related_words.extend([pair[0]["word"], pair[1]["word"]])

    return [name for name in entities if name not in related_words]

# Entities of the first article that appear in none of its relationships
get_singles(entities, criminal.loc[0].relationships)

[]

In [51]:
def sort_relationships(relationships):
    """Group relationship pairs by the tag of their first node.

    The result lists pairs whose first node is tagged "B-State" first, then
    "B-City", then "B-Col" / "B-Zone" / "B-Res" / "B-Bar". The input order is
    kept inside each group, and pairs whose first node carries any other tag
    are dropped.

    Parameters
    ----------
    relationships : list
        Pairs of node dictionaries; each first node has a ``"tag"`` key.

    Returns
    -------
    list
        A new list with the retained pairs in the order described above.
    """
    states, cities, neighbourhoods = [], [], []
    bucket_for_tag = {
        "B-State": states,
        "B-City": cities,
        "B-Col": neighbourhoods,
        "B-Zone": neighbourhoods,
        "B-Res": neighbourhoods,
        "B-Bar": neighbourhoods,
    }

    for pair in relationships:
        bucket = bucket_for_tag.get(pair[0]["tag"])
        # `is not None` rather than truthiness: an empty bucket is still a valid target
        if bucket is not None:
            bucket.append(pair)

    return states + cities + neighbourhoods
# First pair of the first article's relationships after reordering
sort_relationships(criminal.loc[0].relationships)[0]           

[{u'tag': u'B-Col', u'word': u'lomas del carmen'},
 {u'tag': u'B-City', u'word': u'san pedro sula'}]

In [44]:
def find_parent(node, relationships):
    """Look up the parent of ``node`` among the relationship pairs.

    Parameters
    ----------
    node : object
        Node to search for; it is compared by equality with the first element
        of each pair.
    relationships : iterable
        Pairs of ``[child, parent]`` nodes.

    Returns
    -------
    tuple
        ``(parent, index)`` for the first pair whose first element equals
        ``node``, or ``(None, -1)`` when there is no such pair.
    """
    matches = (
        (pair[1], position)
        for position, pair in enumerate(relationships)
        if node == pair[0]
    )
    return next(matches, (None, -1))

# Look up the parent of one node among the first article's relationships
find_parent( {u'tag': u'B-Col', u'word': u'lomas del carmen'}, criminal.loc[0].relationships )

({u'tag': u'B-City', u'word': u'san pedro sula'}, 0)

In [69]:
def extract_address(article):
    """Build the list of address strings of one article.

    Parameters
    ----------
    article : object with ``content`` and ``relationships`` attributes
        For example a row of the ``criminal`` table.

    Returns
    -------
    list
        First the entities that appear in no relationship, then one
        comma-separated address per relationship that was not already consumed
        as an ancestor of an earlier one. Each address starts with the pair's
        own two words and is extended with the ancestors found by following
        parents through ``find_parent``.
    """
    # Entities that appear in no relationship are addresses on their own
    entities = eliminate_duplicates(getEntities(article.content))
    addresses = get_singles(entities, article.relationships)

    # NOTE(review): the original comment said the sort makes the walk start at
    # level 4, but sort_relationships returns state-level pairs first, then city,
    # then neighbourhood-level ones. Confirm which order is intended.
    relationships = sort_relationships(article.relationships)

    max_hops = 5  # upper bound on ancestor look-ups per address
    visited = set()
    for idx, rel in enumerate(relationships):
        if idx in visited:
            continue
        visited.add(idx)

        parts = [rel[0]["word"], rel[1]["word"]]
        parent = rel[1]

        # Both conditions must hold. The original `or` only stopped once the
        # chain had ended AND the counter had expired, so a cyclic chain looped
        # forever and a finished chain kept doing pointless look-ups.
        hops = 0
        while parent is not None and hops < max_hops:
            parent, p_idx = find_parent(parent, relationships)
            if parent is not None:
                parts.append(parent["word"])
                # A pair consumed as an ancestor is not emitted again later
                visited.add(p_idx)
            hops += 1

        addresses.append(", ".join(parts))

    return addresses
                
        
# Try the extraction on the article at index label 3
extract_address(criminal.loc[3])   

['puerto cortes, cortes',
 'choloma, cortes',
 'lopez arellano, puerto cortes, cortes',
 'trincheras, puerto cortes, cortes']

In [73]:
# Extract the address list of every article, one row at a time
addresses = [extract_address(row) for _, row in criminal.iterrows()]

# Store the lists and their lengths as new columns, then preview the table
criminal.loc[:, "addresses"] = addresses
criminal.loc[:, "num_addresses"] = criminal.addresses.apply(len)
criminal.head()

Unnamed: 0,article_id,title,content,relationships,addresses,num_addresses
0,3,"[[Citan, none], [a, none], [11, none], [person...","[[[SAN, B-City], [PEDRO, I-City], [SULA, I-Cit...","[[{u'tag': u'B-Col', u'word': u'lomas del carm...","[lomas del carmen, san pedro sula, monumento a...",4
1,5,"[[DEI, none], [pide, none], [denunciar, none],...","[[[TEGUCIGALPA, B-City]], [[-, none], [Autorid...",[],[tegucigalpa],1
2,9,"[[Alcaldía, none], [intensifica, none], [opera...","[[[TEGUCIGALPA, B-City]], [[-, none], [Pese, n...","[[{u'tag': u'B-Col', u'word': u'kennedy'}, {u'...","[kennedy, tegucigalpa, kennedy, distrito central]",2
3,11,"[[Pasajeros, none], [asaltantes, none], [acrib...","[[[PUERTO, B-City], [CORTES, I-City], [,, none...","[[{u'tag': u'B-City', u'word': u'puerto cortes...","[puerto cortes, cortes, choloma, cortes, lopez...",4
4,12,"[[Fallece, none], [comerciante, none], [olanch...","[[[JUTICALPA, B-City], [,, none], [Olancho, B-...","[[{u'tag': u'B-City', u'word': u'juticalpa'}, ...","[olancho, honduras, juticalpa, olancho, hondur...",3


# 3. Analyze Old vs New data

In [74]:
# Last rows among the articles whose id is below 2000
criminal[criminal["article_id"] < 2000].tail()

Unnamed: 0,article_id,title,content,relationships,addresses,num_addresses
234,1969,"[[Después, none], [de, none], [20, none], [hor...","[[[La, none], [cuarta, none], [víctima, none],...","[[{u'tag': u'B-Zone', u'word': u'instituto jes...","[tegucigalpa, instituto jesus aguilar paz, com...",5
235,1970,"[[Peligrosos, none], [mareros, none], [en, non...","[[[A, none], [pesar, none], [que, none], [la, ...","[[{u'tag': u'B-Zone', u'word': u'centro penal'...","[tamara, el porvenir, francisco morazan, centr...",4
236,1975,"[[Madre, none], [de, none], [centralista, none...","[[[“, none], [Mi, none], [hija, none], [aspira...","[[{u'tag': u'B-Col', u'word': u'las vegas del ...","[comayaguela, honduras, las vegas del countryl...",4
237,1985,"[[Identificados, none], [vehículos, none], [y,...","[[[Las, none], [autoridades, none], [de, none]...",[],"[perpetuo socorro, hato de enmedio, colon]",3
238,1991,"[[Matan, none], [a, none], [regidor, none], [d...","[[[Un, none], [regidor, none], [municipal, non...","[[{u'tag': u'B-City', u'word': u'jocon'}, {u't...","[yoro, honduras, jocon, yoro, honduras, el ach...",3


In [85]:
# Split old from new by article id instead of hard-coded row labels (238 / 239),
# which would silently go stale if the CSV changed. The 2000 threshold is the
# one used to separate old and new rows elsewhere in this notebook.
is_old = criminal["article_id"] < 2000
n_total = criminal.shape[0]
n_old = int(is_old.sum())
n_new = n_total - n_old

print(n_total)
# format(...) keeps the "count percentage" layout of the original multi-item
# print statement; print(a, b) would print a tuple under Python 2.
print("{0} {1}".format(n_old, n_old * 100.0 / n_total))
print("{0} {1}".format(n_new, n_new * 100.0 / n_total))

309
239 77.3462783172
70 22.6537216828


In [97]:
def get_dist_addrs(df):
    """Print summary statistics of the ``num_addresses`` column of ``df``.

    Three things are written to stdout, in this order: the number of rows, the
    mean of ``num_addresses``, and the percentage of rows for each distinct
    ``num_addresses`` value (sorted by that value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``num_addresses`` column.

    Returns
    -------
    None
    """
    total = df.shape[0]
    # Single-argument print(...) produces the same output under Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(total)
    print(df.num_addresses.mean())
    print(df.num_addresses.value_counts().sort_index() * 100.0 / total)
    
# Address-count distribution for all criminal articles, then old vs new.
# The split is derived from the article id (below 2000 = old) instead of the
# hard-coded row labels 238 / 239, which depend on the current CSV contents.
is_old = criminal["article_id"] < 2000

# print(...) with one argument gives the same output under Python 2 and 3;
# print("") replaces the bare `print` statement that emitted a blank line.
print("- Total")
get_dist_addrs(criminal)
print("")
print("- Old")
get_dist_addrs(criminal[is_old])
print("")
print("- New")
get_dist_addrs(criminal[~is_old])

- Total
309
3.29449838188
0      6.148867
1     15.857605
2     21.035599
3     22.653722
4     13.915858
5      8.414239
6      2.912621
7      3.236246
8      1.294498
9      0.323625
10     1.294498
11     0.647249
12     0.970874
15     0.647249
17     0.647249
Name: num_addresses, dtype: float64

- Old
239
3.28451882845
0      7.112971
1     17.573222
2     19.665272
3     21.757322
4     12.552301
5      7.949791
6      3.765690
7      3.765690
8      1.255230
9      0.418410
10     1.255230
11     0.836820
12     0.418410
15     0.836820
17     0.836820
Name: num_addresses, dtype: float64

- New
70
3.32857142857
0      2.857143
1     10.000000
2     25.714286
3     25.714286
4     18.571429
5     10.000000
7      1.428571
8      1.428571
10     1.428571
12     2.857143
Name: num_addresses, dtype: float64
