In [1]:
import pandas as pd
import numpy as np
import ast 
import unicodedata
import pickle

from sklearn.metrics import precision_recall_fscore_support

In [2]:
documents = pd.read_csv("../../files/documents.csv", )
documents.head()

Unnamed: 0,id,title,content,category,date
0,0,Auditoría revela irregularidades en el Parlacen,GUATEMALA.- Una fiscalización de la Contralorí...,Other,"28 Dic, 2009 - 7:27 pm"
1,1,Suspendidas las citas en Hospital Escuela,TEGUCIGALPA.- Una misteriosa obstrucción del s...,Other,"28 Dic, 2009 - 7:32 pm"
2,2,Mariscos contaminados alarman a los “porteños”,"PUERTO CORTES, Cortés.- Alarmados se encuentra...",Other,"28 Dic, 2009 - 8:25 pm"
3,3,Citan a 11 personas por vender pólvora,SAN PEDRO SULA.- Hasta el momento ocho bodegas...,Criminal,"28 Dic, 2009 - 8:26 pm"
4,4,Con compra de granos se paliaría hambruna en e...,TEGUCIGALPA.- No llueve hace cuatro meses y la...,Other,"29 Dic, 2009 - 1:00 am"


In [3]:
# pre-process (clean the data)
documents = documents[(documents.content.notnull() ) 
                            & (documents["category"]!="Criminal-Other")]

In [4]:
# Split
train_documents = documents.loc[:1999]
test_documents = documents.loc[2000:]

# TOPIC

In [5]:
topic_clf = pickle.load( open( "../../models/topic_model.p", "rb" ) )

In [6]:
train_preds = topic_clf.predict(train_documents)
test_preds = topic_clf.predict(test_documents)

train_true = train_documents.category.apply(lambda x: int(x=="Criminal"))
test_true = test_documents.category.apply(lambda x: int(x=="Criminal"))

In [7]:
print len(train_preds)
print len(train_true)
print len(test_preds)
print len(test_true)

train_documents.loc[:,"pred"] = train_preds
test_documents.loc[:,"pred"] = test_preds

1710
1710
479
479


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
print precision_recall_fscore_support(train_true,train_preds, average="binary")
print precision_recall_fscore_support(test_true, test_preds, average="binary")

(0.876, 0.9776785714285714, 0.92405063291139233, None)
(0.68478260869565222, 0.92647058823529416, 0.78750000000000009, None)


# POS TAG

In [9]:
from nltk.tokenize import wordpunct_tokenize, sent_tokenize

def tokenize(base_df):
    data = []
    for art_index, article in base_df.iterrows():
        sentences = sent_tokenize(unicode(article.content,"utf8"))
        sentences = [wordpunct_tokenize(sent) for sent in sentences ]
        for index in range(len(sentences)):
            sentences[index] = [ [word,"none"] for word in sentences[index]]
        # Title
        title = wordpunct_tokenize(unicode(article.title,"utf8"))
        title = [ [word,"none"] for word in title ]

        #append
        data.append([
            art_index,
            title,
            sentences
        ])
        
    return pd.DataFrame(data,columns=["article_id","title","content"])

In [10]:
filtered_train = train_documents[train_documents["pred"]==1].copy()
filtered_test = test_documents[test_documents["pred"]==1].copy()

filtered_train = tokenize(filtered_train)
filtered_test = tokenize(filtered_test)

In [11]:
tagger = pickle.load( open( "../../files/pos_tagger.p", "rb" ) )

def tagContent(content):
        for index, sentence in enumerate(content):
            words = [ word[0] for word in sentence]
            #tag
            pos_sentence = tagger.tag(words)
            #add tag to sentence
            content[index] = [ [word[0],pos_tag[1], word[1]] for word, pos_tag in zip(sentence, pos_sentence) ]
        return content
    
def tagTitle(title):
        sentence = [ word[0] for word in title]
        pos_sentence = tagger.tag(sentence)

        tagged_title = [ [word[0],pos_tag[1],word[1]] for word, pos_tag in zip(title, pos_sentence) ]

        return tagged_title
      
# pos tagging
print "tagging train"
filtered_train.loc[:,"tagged_title"] = filtered_train.title.apply(lambda title: tagTitle(title))
filtered_train.loc[:,"tagged_content"] = filtered_train.content.apply(lambda content:  tagContent(content) )

print "tagging test"
filtered_test.loc[:,"tagged_title"] = filtered_test.title.apply(lambda title: tagTitle(title))
filtered_test.loc[:,"tagged_content"] = filtered_test.content.apply(lambda content:  tagContent(content) )

print "all datasets tagged"

tagging train
tagging test
all datasets tagged


In [12]:
filtered_train.head()

Unnamed: 0,article_id,title,content,tagged_title,tagged_content
0,3,"[[Citan, none], [a, none], [11, none], [person...","[[[SAN, AQ, none], [PEDRO, NC, none], [SULA, N...","[[Citan, NC, none], [a, SP, none], [11, Z, non...","[[[SAN, AQ, none], [PEDRO, NC, none], [SULA, N..."
1,5,"[[DEI, none], [pide, none], [denunciar, none],...","[[[TEGUCIGALPA, NC, none], [.-, NC, none], [Au...","[[DEI, NC, none], [pide, VMI, none], [denuncia...","[[[TEGUCIGALPA, NC, none], [.-, NC, none], [Au..."
2,9,"[[Alcaldía, none], [intensifica, none], [opera...","[[[TEGUCIGALPA, NC, none], [.-, NC, none], [Pe...","[[Alcaldía, NC, none], [intensifica, NC, none]...","[[[TEGUCIGALPA, NC, none], [.-, NC, none], [Pe..."
3,11,"[[Pasajeros, none], [asaltantes, none], [acrib...","[[[PUERTO, NC, none], [CORTES, NC, none], [,, ...","[[Pasajeros, NC, none], [asaltantes, NC, none]...","[[[PUERTO, NC, none], [CORTES, NC, none], [,, ..."
4,12,"[[Fallece, none], [comerciante, none], [olanch...","[[[JUTICALPA, NC, none], [,, Fc, none], [Olanc...","[[Fallece, NC, none], [comerciante, NC, none],...","[[[JUTICALPA, NC, none], [,, Fc, none], [Olanc..."


# NER

In [13]:
import sys
import imp

import location.preprocess as process
imp.reload(sys.modules['location.preprocess'])

<module 'location.preprocess' from 'location/preprocess.pyc'>

In [14]:
sentences_train = process.convertToSentences(filtered_train, toObject=False)
sentences_test = process.convertToSentences(filtered_test, toObject=False)

- Convert to Sentences
- Convert to Sentences


In [15]:
words_train = process.sentenecesToWords(sentences_train)
words_test = process.sentenecesToWords(sentences_test)

words_train = words_train[words_train["word"] != "."]
words_test = words_test[words_test["word"] != "."]

- Sentences to Words
- Sentences to Words


In [16]:
words_train.head(10)

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,iob,geo_type
0,3,0,0,0,SAN,AQ,none,O,none
1,3,0,0,1,PEDRO,NC,none,O,none
2,3,0,0,2,SULA,NC,none,O,none
3,3,0,0,3,.-,NC,none,O,none
4,3,0,0,4,Hasta,SP,none,O,none
5,3,0,0,5,el,DA,none,O,none
6,3,0,0,6,momento,NC,none,O,none
7,3,0,0,7,ocho,DN,none,O,none
8,3,0,0,8,bodegas,NC,none,O,none
9,3,0,0,9,han,VAI,none,O,none


In [36]:
final_model = pickle.load( open( "../../models/NER_model.p", "rb" ) )
final_model

{'le_iob': LabelEncoder(),
 'le_tag': LabelEncoder(),
 'lv1': [<location.classifiers.Classifier instance at 0x1115aa560>,
  <location.classifiers.Classifier instance at 0x112a445a8>,
  <location.classifiers.Classifier instance at 0x112a44d40>],
 'lv2': <location.classifiers.Classifier instance at 0x112a49ab8>,
 'lv3': <location.classifiers.Classifier instance at 0x11110bb90>}

In [18]:
features_train, _, _ = process.getFeatures(
    words_train.copy(), le_tag= final_model["le_tag"], le_iob= final_model["le_iob"], root_folder="../../")
features_test, _, _ = process.getFeatures(
    words_test.copy(), le_tag= final_model["le_tag"], le_iob= final_model["le_iob"], root_folder="../../")

- getting features
-- iob tag classes
['B' 'O']
- getting features
-- iob tag classes
['B' 'O']


In [19]:
features_test.isnull().any()

art_id                 False
sent_id                False
cs_id                  False
pos                    False
word                   False
pos_tag                False
iob_tag                False
iob                    False
geo_type               False
upper                  False
upper_prev1            False
upper_next1            False
first                  False
first_sent             False
size                   False
prev_1                 False
prev_2                 False
next_1                 False
next_2                 False
triggerstate1          False
triggerstate2          False
triggercity1           False
triggercity2           False
triggerzone1           False
triggerzone2           False
triggercolonia1        False
triggercolonia2        False
triggerbarrio1         False
triggerbarrio2         False
triggerresidencial1    False
triggerresidencial2    False
trigger_1              False
trigger_2              False
sent_size              False
in_Country    

In [20]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag","iob" ,"geo_type"]
X_train = features_train.drop(desc_columns, 1).values
X_test = features_test.drop(desc_columns, 1).values

#lvl 1 classifier
stack_features_train = features_train.copy()  #[desc_columns]
stack_features_test = features_test.copy() #[desc_columns]
for idx, clf in enumerate(final_model["lv1"]):
    preds_train = clf.predict(X_train)
    preds_test = clf.predict(X_test)
    
    stack_features_train.loc[:,"pred_"+str(idx)] = preds_train
    stack_features_test.loc[:,"pred_"+str(idx)] = preds_test

## LV2 classifier

In [26]:
import location.identifier as identifier
imp.reload(sys.modules['location.identifier'])

<module 'location.identifier' from 'location/identifier.pyc'>

In [22]:
stack_features_train = identifier.get_stacked_features(stack_features_train, final_model["lv1"])
stack_features_test = identifier.get_stacked_features(stack_features_test, final_model["lv1"])

X_train = stack_features_train.drop(desc_columns, 1).values
X_test = stack_features_test.drop(desc_columns, 1).values

In [23]:
preds_train = final_model["lv2"].predict(X_train)
preds_test = final_model["lv2"].predict(X_test)

In [25]:
preds_train[:40]

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

## LV3 classifier

In [37]:
# . Prepare to classify JUST the entities
stack_features_train["s_pred"] = preds_train
stack_features_test["s_pred"] = preds_test

cat_train = identifier.getOnlyB(stack_features_train)
cat_test = identifier.getOnlyB(stack_features_test)

X_train = cat_train.drop(desc_columns, 1).values
y_train = cat_train.iob_tag.apply(lambda x: x[2:]).values

X_test = cat_test.drop(desc_columns, 1).values
y_test = cat_test.iob_tag.apply(lambda x: x[2:]).values

In [38]:
final_preds_train = final_model["lv3"].predict(X_train)
final_preds_test = final_model["lv3"].predict(X_test)

In [39]:
final_preds_train

array([u'City', u'City', u'City', ..., u'City', u'Zone', u'Zone'], dtype=object)

## Extract entities

In [30]:
from  location.entities import extract_entities, score_entities
imp.reload(sys.modules['location.entities'])

<module 'location.entities' from 'location/entities.pyc'>

In [40]:
geo_preds_train = pd.Series(final_preds_train, index = cat_train.index )
geo_preds_test = pd.Series(final_preds_test, index = cat_test.index )

final_train = stack_features_train.copy()
final_test = stack_features_test.copy()

final_train.loc[:,"pred_geo"] = geo_preds_train
final_test.loc[:,"pred_geo"] = geo_preds_test

# --Doesn't Apply--
# true_ent_train = extract_entities(features_train,"iob", "geo_type")
# true_ent_test = extract_entities(features_test ,"iob", "geo_type")

train_entities = extract_entities(final_train,"s_pred", "pred_geo")
test_entities = extract_entities(final_test,"s_pred", "pred_geo")

In [42]:
test_entities

Unnamed: 0,art_id,sent_id,cs_id,pos,entity,geo_type
0,2023,0,0,0,San José,City
1,2023,0,0,38,Costa Rica,Country
2,2023,0,0,72,El Laurel,Zone
3,2023,0,0,90,Panamá,Country
4,2023,0,0,132,Morgue,Zone
5,2027,0,2,24,Jocón,City
6,2027,0,2,26,Yoro,State
7,2027,0,2,31,Honduras,Country
8,2027,0,2,63,El Achiote,City
9,2027,0,2,70,La Dalia,City


# RELATIONSHIPS

In [43]:
def to_ascii(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

train_entities.loc[:,"entity"] = train_entities.entity.apply(lambda x: to_ascii(unicode(x).lower() ) )
test_entities.loc[:,"entity"] = test_entities.entity.apply(lambda x: to_ascii(unicode(x).lower()) )

In [47]:
final_preds_train

array([u'City', u'City', u'City', ..., u'City', u'Zone', u'Zone'], dtype=object)

In [62]:
def add_results_to_words(words_df):
    new_words = words_df[["art_id","sent_id","cs_id","pos","word", "s_pred"]].copy()
    new_words.loc[:"word"] = new_words.word.apply(lambda x: to_ascii(x.lower()) )
    new_words.loc[:,"iob_tag"] = words_df.pred_geo.apply(lambda x: x if not pd.isnull(x) else "Other")
    new_words.loc[:,"iob"] = new_words.iob_tag.apply(lambda x: "B" if x!="Other" else "O")
    #correct by adding the respective I
    last_iob = "O"
    for idx, row in new_words.iterrows():
        if row.iob == "O" and row.s_pred == 0 and ( last_iob == "B" or last_iob == "I" ):
            new_words.loc[idx,"iob"] = "I"
        last_iob = new_words.loc[idx].iob
    

    return new_words
#tagged words
twords_train = add_results_to_words(final_train)
twords_test = add_results_to_words(final_test)

twords_train.head()  #[(twords_train["art_id"]==3)&(twords_train["iob"]=="B")]

ValueError: Must have equal len keys and value when setting with an iterable

In [56]:
twords_test

Unnamed: 0,art_id,sent_id,cs_id,pos,word,s_pred,iob_tag,iob
0,2023,0,0,0,San,0,City,B
1,2023,0,0,1,José,0,Other,I
2,2023,0,0,2,",",1,Other,O
3,2023,0,0,3,(,1,Other,O
4,2023,0,0,4,ACAN,1,Other,O
5,2023,0,0,5,-,1,Other,O
6,2023,0,0,6,EFE,1,Other,O
7,2023,0,0,7,).-,1,Other,O
8,2023,0,0,8,Un,1,Other,O
9,2023,0,0,9,hombre,1,Other,O


In [57]:
def get_distance_df(df):
    
    artid = -1
    data = []
    word_count = 0
    for _, word in df.iterrows():
        if word.iob == "B":
            data.append([word.word.lower(), word.iob_tag, word.sent_id, word.pos, 0])
        elif word.iob == "I":
            data[-1][0] += " " + word.word.lower()
        elif len(data) > 0:
            data[-1][-1] += 1
            
    entity_df = pd.DataFrame(data, columns=["entity", "tag", "sent_id", "pos", "next_words"])
    return entity_df 
            
            
get_distance_df(twords_train[twords_train["art_id"]==3])         

Unnamed: 0,entity,tag,sent_id,pos,next_words
0,san pedro sula,City,0,0,26
1,san pedro sula,City,0,29,16
2,san pedro sula,City,0,49,38
3,lomas,Col,0,91,1
4,carmen,City,0,93,58
5,madre,Zone,0,153,47
6,san pedro sula,City,1,26,48
7,del medio,Col,1,78,14


In [58]:
def getPairs(df):
    art_id = df.values[0][0]
    
    country_entities = df[df["geo_type"]=="Country"][["geo_type","entity"]].values.tolist()
    state_entities = df[df["geo_type"]=="State"][["geo_type","entity"]].values.tolist()
    city_entities = df[df["geo_type"]=="City"][["geo_type","entity"]].values.tolist()
    zone_entities = df[df["geo_type"].isin(["Zone","Col","Bar","Res"])][["geo_type","entity"]].values.tolist()

    pairs = []
    for child in zone_entities:
        for parent in city_entities + state_entities + country_entities:
            pairs.append([art_id,child[0], child[1].lower(), parent[0], parent[1].lower(), 
                         {"word":child[1],"tag":child[0]}])

    for child in city_entities:
        for parent in  state_entities + country_entities:
            pairs.append([art_id,child[0], child[1].lower(), parent[0], parent[1].lower(), 
                          {"word":child[1],"tag":child[0]}])

    for child in state_entities:
        for parent in country_entities:
            pairs.append([art_id,child[0], child[1].lower(), parent[0], parent[1].lower(), 
                         {"word":child[1],"tag":child[0]}])
      
    relations_df = pd.DataFrame(pairs,columns=["art_id","c_tag","c_word","p_tag","p_word","child"])
    return relations_df

getPairs(train_entities[train_entities["art_id"]==3].drop_duplicates(subset=["entity"]) ) 

Unnamed: 0,art_id,c_tag,c_word,p_tag,p_word,child
0,3,Col,lomas,City,san pedro sula,"{u'tag': u'Col', u'word': u'lomas'}"
1,3,Col,lomas,City,carmen,"{u'tag': u'Col', u'word': u'lomas'}"
2,3,Zone,madre,City,san pedro sula,"{u'tag': u'Zone', u'word': u'madre'}"
3,3,Zone,madre,City,carmen,"{u'tag': u'Zone', u'word': u'madre'}"
4,3,Col,del medio,City,san pedro sula,"{u'tag': u'Col', u'word': u'del medio'}"
5,3,Col,del medio,City,carmen,"{u'tag': u'Col', u'word': u'del medio'}"


## get features

In [59]:
import relationships.preprocess as relationship

In [60]:
def preprocess_rel(words_df, entities_df):
    relationships = pd.DataFrame()
    for art_id in words_df.art_id.unique():
        try:
            if words_df[words_df["art_id"] == art_id].shape[0] == 0 or \
                entities_df[entities_df["art_id"] == art_id].shape[0] == 0:
                continue

    #         print "---- %i ----" %art_id
            # get entities distance in the article
            distance_df = get_distance_df( words_df[words_df["art_id"] == art_id] )
    #         print distance_df

            # generate possible relationship pairs
            relations_df = getPairs( entities_df[entities_df["art_id"] == art_id].drop_duplicates(subset=["entity"]) )
    #         print relations_df

            # generate features
            features_df = relationship.getFeatures(relations_df, distance_df)
            features_df.art_id = art_id

            # append the relationships to the final dataframe
            relationships = pd.concat([relationships,features_df])
        except Exception as error:
            print "exception in %i"  %art_id
            print error
            
            print distance_df
            print relations_df
            
            return None
            
    
    print "relationships size %i,%i"%(relationships.shape)
    return relationships

preprocess_rel(twords_train, train_entities)

exception in 11
min() arg is an empty sequence
          entity    tag  sent_id  pos  next_words
0  puerto cortes   City        0    0           1
1         cortés  State        0    3          26
2        choloma   City        0   30           2
3  puerto cortés   City        0   33           1
4         cortés  State        0   36          32
5          lópez    Col        0   71         246
   art_id c_tag         c_word  p_tag         p_word  \
0      11   Col          lopez   City  puerto cortes   
1      11   Col          lopez   City        choloma   
2      11   Col          lopez  State         cortes   
3      11  City  puerto cortes  State         cortes   
4      11  City        choloma  State         cortes   

                                          child distances  
0           {u'tag': u'Col', u'word': u'lopez'}        []  
1           {u'tag': u'Col', u'word': u'lopez'}        []  
2           {u'tag': u'Col', u'word': u'lopez'}        []  
3  {u'tag': u'City', u'wor

In [None]:
twords_train[(twords_train["art_id"]==1269)&(twords_train["iob"].isin(["B","I"]))]

In [None]:
train_entities[train_entities["art_id"] == 1269]

In [None]:
train_preds_df[(train_preds_df["art_id"]>=1269)]

In [None]:
train_preds_df