# 0. Intro

This notebook outlines the training algorithm for Relationship Extraction.

In [9]:
import pandas as pd
import numpy as np
import ast 
import unicodedata

In [10]:
articles = pd.read_csv("../files/criminal_articles.csv")
# The list-valued columns are stored in the CSV as Python-literal strings;
# parse each one back into the nested list it represents.
articles.loc[:,"title"] = articles["title"].apply(ast.literal_eval)
articles.loc[:,"content"] = articles["content"].apply(ast.literal_eval)
articles.loc[:,"relationships"] = articles["relationships"].apply(ast.literal_eval)

In [11]:
#use only train data
# .copy() makes the filtered frame independent of the one read from disk, so
# the column assignments in the following cells do not write into a slice.
articles = articles[articles["article_id"]<2000].copy()
articles.tail()

Unnamed: 0,article_id,title,content,relationships
234,1969,"[[Después, none], [de, none], [20, none], [hor...","[[[La, none], [cuarta, none], [víctima, none],...","[[{u'tag': u'B-Zone', u'word': u'instituto jes..."
235,1970,"[[Peligrosos, none], [mareros, none], [en, non...","[[[A, none], [pesar, none], [que, none], [la, ...","[[{u'tag': u'B-Zone', u'word': u'centro penal'..."
236,1975,"[[Madre, none], [de, none], [centralista, none...","[[[“, none], [Mi, none], [hija, none], [aspira...","[[{u'tag': u'B-Col', u'word': u'las vegas del ..."
237,1985,"[[Identificados, none], [vehículos, none], [y,...","[[[Las, none], [autoridades, none], [de, none]...",[]
238,1991,"[[Matan, none], [a, none], [regidor, none], [d...","[[[Un, none], [regidor, none], [municipal, non...","[[{u'tag': u'B-City', u'word': u'jocón'}, {u't..."


# 1. Preprocess

## 1.1 convert to ascii

In [12]:
# Strips accents so that differently accented spellings of the same name
# compare equal; this avoids most misspelling mismatches later on.
def to_ascii(s):
    """Return `s` reduced to ASCII.

    The text is NFKD-normalised (accented letters are split into base letter
    plus combining mark) and then encoded as ASCII, silently dropping every
    character that has no ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed.encode('ascii', 'ignore')

def sent_to_ascii(sentences):
    """Convert the token text of every [token, tag] pair to ASCII.

    `sentences` is a list of sentences, each a list of [token, tag] pairs.
    The pairs are edited in place (only position 0 is replaced) and the same
    `sentences` object is returned.
    """
    for sentence in sentences:
        for token in sentence:
            token[0] = to_ascii(token[0])
    return sentences
            
def rel_to_ascii(relationships):
    """Convert the "word" of both entities of every relationship to ASCII.

    Each relationship is indexed at positions 0 and 1, and each of those is a
    dict with a "word" key. The dicts are edited in place and the same
    `relationships` object is returned.
    """
    for rel in relationships:
        for side in (0, 1):
            rel[side]["word"] = to_ascii(rel[side]["word"])
    return relationships
    

# Apply it to all the words; cells holding None are passed through untouched.
# The title is a single sentence, so it is wrapped in a list to look like `content`.
articles.title = articles.title.apply(lambda title: title if title is None else sent_to_ascii([title]))
articles.content = articles.content.apply(lambda body: body if body is None else sent_to_ascii(body))
articles.relationships = articles.relationships.apply(lambda rels: rels if rels is None else rel_to_ascii(rels))

## 1.2  Get article entities

In [13]:
def getEntities(content):
    """Collect the unique tagged entities found in `content`.

    `content` is an iterable of sentences, each a sequence of [token, tag]
    pairs. A tag starting with "B" opens a new entity; a tag starting with "I"
    appends its token to the most recently opened entity; any other tag is
    ignored. Tokens are lower-cased and the first two characters of the tag
    (the "B-" / "I-" prefix) are dropped.

    An "I" tag seen before any entity has been opened starts a new entity
    instead of failing on an empty list, and an empty tag is treated as
    "not an entity".

    Returns a list of {"word": ..., "tag": ...} dicts without duplicates, in
    order of first appearance.
    """
    entities = []
    for sent in content:
        for word in sent:
            prefix = word[1][:1]
            if prefix == "B" or (prefix == "I" and not entities):
                entities.append({"word": word[0].lower(), "tag": word[1][2:]})
            elif prefix == "I":
                entities[-1]["word"] += " " + word[0].lower()

    # eliminate duplicates: the dicts are not hashable, so track (word, tag)
    # keys in a set and keep the first occurrence of each
    seen = set()
    unique_entities = []
    for itm in entities:
        key = (itm["word"], itm["tag"])
        if key not in seen:
            seen.add(key)
            unique_entities.append(itm)

    return unique_entities

# full_text = the title sentence(s) followed by the body sentences (list concatenation per row)
articles.loc[:,"full_text"] = articles.title + articles.content
articles.loc[:,"entities"] = articles.full_text.apply(getEntities)

## 1.3 Format relationships tags
Eliminate the string "B-" from the tags

In [14]:
def format_relationship_tags(relationships):
    """Strip the "B-" / "I-" prefix from the tag of both entities of every relationship.

    Each relationship is indexed at positions 0 and 1, and each of those is a
    dict with a "tag" key. The dicts are edited in place and the same
    `relationships` object is returned.

    Tags that carry no prefix are left untouched, so running this more than
    once on the same data (e.g. re-executing the cell) does not eat further
    characters from an already clean tag.
    """
    for rel in relationships:
        for side in (0, 1):
            tag = rel[side]["tag"]
            if tag[:2] in ("B-", "I-"):
                rel[side]["tag"] = tag[2:]
    return relationships
                          
# The Series returned by apply is discarded on purpose: format_relationship_tags
# edits the relationship dicts in place, so `articles` already holds the new tags.
articles.relationships.apply(format_relationship_tags)
# show the relationships of the row labelled 3 (looked up among the first five rows)
articles.relationships.head()[3]

[[{u'tag': u'City', u'word': 'puerto cortes'},
  {u'tag': u'State', u'word': 'cortes'}],
 [{u'tag': u'Col', u'word': 'lopez arellano'},
  {u'tag': u'City', u'word': 'puerto cortes'}],
 [{u'tag': u'Zone', u'word': 'trincheras'},
  {u'tag': u'City', u'word': 'puerto cortes'}],
 [{u'tag': u'City', u'word': 'choloma'},
  {u'tag': u'State', u'word': 'cortes'}]]

## 1.4 Get the Pairs dataset

In [19]:
# gets entities with the distances in words between them. i.e. "next_words"
def getEntitydf(article):
    """Build one row per entity mention of `article.full_text`, in reading order.

    `article.full_text` is a list of sentences, each a list of [token, tag]
    pairs. A tag starting with "B" opens a mention and a tag starting with "I"
    appends its token to the latest mention. An "I" tag seen before any
    mention exists opens a mention itself instead of failing on an empty list.

    Returns a DataFrame with the columns
      entity     -- lower-cased mention text
      tag        -- tag without its first two characters (the "B-" / "I-" prefix)
      sent_id    -- index of the sentence where the mention starts
      pos        -- token position of the mention start inside that sentence
      next_words -- number of non-entity tokens between this mention and the
                    next opened mention (stays 0 for the last mention)
    """
    data = []
    word_count = 0
    for sent_id, sent in enumerate(article.full_text):
        for pos, word in enumerate(sent):
            prefix = word[1][:1]
            if prefix == "B" or (prefix == "I" and not data):
                # close the gap counter of the previous mention before opening a new one
                if data:
                    data[-1][-1] = word_count
                word_count = 0
                data.append([word[0].lower(), word[1][2:], sent_id, pos, 0])
            elif prefix == "I":
                data[-1][0] += " " + word[0].lower()
            else:
                word_count += 1

    entity_df = pd.DataFrame(data, columns=["entity", "tag", "sent_id", "pos", "next_words"])

    return entity_df

### 1.4.1 Get pairs function

In [89]:
def getPairs(article):
    """Enumerate every candidate (child, parent) pair of one article.

    Reads `article.entities` (list of {"word", "tag"} dicts),
    `article.relationships` (list of [child, parent] pairs) and
    `article.article_id`.

    Entities are grouped into four levels by tag: zone-like ("Zone", "Col",
    "Res", "Bar"), "City", "State" and "Country". A child is paired with every
    entity of each higher level: zones with cities, states and countries;
    cities with states and countries; states with countries.

    Returns a DataFrame with one row per pair and the columns art_id, c_tag,
    c_word, p_tag, p_word, child, parent, label and rel_type, where label is 1
    when [child, parent] appears in `article.relationships` (0 otherwise) and
    rel_type is "<c_tag>-<p_tag>".
    """
    def with_tags(tags):
        # entities whose tag is one of `tags`, in their original order
        return [ent for ent in article.entities if ent["tag"] in tags]

    country_entities = with_tags(["Country"])
    state_entities = with_tags(["State"])
    city_entities = with_tags(["City"])
    zone_entities = with_tags(["Zone", "Col", "Res", "Bar"])

    # (children, candidate parents) for each level, lowest level first
    levels = [
        (zone_entities, city_entities + state_entities + country_entities),
        (city_entities, state_entities + country_entities),
        (state_entities, country_entities),
    ]

    pairs = []
    for children, parents in levels:
        for child in children:
            for parent in parents:
                relation = int([child, parent] in article.relationships)
                pairs.append([article.article_id, child["tag"], child["word"],
                              parent["tag"], parent["word"], child, parent, relation])

    relations_df = pd.DataFrame(pairs, columns=["art_id", "c_tag", "c_word", "p_tag", "p_word", "child", "parent", "label"])
    relations_df.loc[:, "rel_type"] = relations_df.c_tag + "-" + relations_df.p_tag
    return relations_df

### 1.4.2 Get features

In [90]:
def getFeatures(relations_df,entity_df):
    """Add mention-based features to every candidate pair of one article.

    Parameters
    ----------
    relations_df : DataFrame
        Candidate pairs; the columns c_word, c_tag, p_word, p_tag and child
        are read. The frame is modified in place and also returned.
    entity_df : DataFrame
        One row per entity mention with the columns entity, tag, sent_id and
        next_words. Rows are addressed through their index labels, which are
        used arithmetically (label - 1, label + 1), so the index is expected
        to be consecutive integers starting at 0 in document order.

    Columns added, in this order: distances, minDistance, maxDistance,
    num_c_entity, num_p_entity, c_first, p_first, c_first_sent, p_first_sent,
    c_title, p_title, p_torf, c_torf, min_all, num_rels.

    Raises
    ------
    ValueError
        If some pair ends up with an empty `distances` list (min()/max() of an
        empty sequence).
    """
    distances = []
    num_c_entity = []
    num_p_entity = []
    c_first = []
    p_first = []
    c_first_sent = []
    p_first_sent = []
    c_title = []
    p_title = []
    for _, pair in relations_df.iterrows():
        # mentions of the child / parent: same text AND same tag (filtered once per pair)
        c_rows = entity_df[(entity_df["entity"] == pair.c_word) & (entity_df["tag"] == pair.c_tag)]
        p_rows = entity_df[(entity_df["entity"] == pair.p_word) & (entity_df["tag"] == pair.p_tag)]

        # Number of occurrences of the same entity
        num_c_entity.append(c_rows.shape[0])
        num_p_entity.append(p_rows.shape[0])

        # Is it the first entity in the whole document? (index label 0)
        c_first.append(0 in c_rows.index)
        p_first.append(0 in p_rows.index)

        c_sent_ids = c_rows.sent_id.values
        p_sent_ids = p_rows.sent_id.values

        # Is it in the title? (sentence 0)
        c_title.append(0 in c_sent_ids)
        p_title.append(0 in p_sent_ids)

        # Is it in the first sentence? (sentence 1)
        c_first_sent.append(1 in c_sent_ids)
        p_first_sent.append(1 in p_sent_ids)

        # every mention whose text is one of the two words and whose tag is one of the two tags
        indices = sorted(
            entity_df[(entity_df["entity"].isin([pair.c_word, pair.p_word]))
                      & (entity_df["tag"].isin([pair.c_tag, pair.p_tag]))].index
        )

        # distance between every two of those mentions that carry different tags
        pair_distances = []
        for i in range(len(indices) - 1):
            for j in range(i + 1, len(indices)):
                start, end = indices[i], indices[j]
                if entity_df.loc[start].tag == entity_df.loc[end].tag:
                    continue
                # plain words between the two mentions (.loc slices include both ends)
                distance = entity_df.loc[start:end - 1].next_words.sum()
                # counting the entities in between
                if end - 1 - start > 0:
                    distance += entity_df.loc[start + 1:end - 1].next_words.count()
                pair_distances.append(distance)

        distances.append(pair_distances)

    relations_df.loc[:,"distances"] = pd.Series(distances)

    # minimum / maximum distance of the pair
    relations_df.loc[:,"minDistance"] = relations_df.distances.apply(lambda x: min(x))
    relations_df.loc[:,"maxDistance"] = relations_df.distances.apply(lambda x: max(x))
    relations_df.loc[:,"num_c_entity"] = num_c_entity
    relations_df.loc[:,"num_p_entity"] = num_p_entity

    relations_df.loc[:,"c_first"] = c_first
    relations_df.loc[:,"p_first"] = p_first
    relations_df.loc[:,"c_first_sent"] = c_first_sent
    relations_df.loc[:,"p_first_sent"] = p_first_sent
    relations_df.loc[:,"c_title"] = c_title
    relations_df.loc[:,"p_title"] = p_title

    # title or first sentence
    relations_df.loc[:,"p_torf"] = relations_df.p_first_sent | relations_df.p_title
    relations_df.loc[:,"c_torf"] = relations_df.c_first_sent | relations_df.c_title

    # per child: smallest minDistance over all its candidate parents, and how many candidates it has
    relations_df.loc[:,"min_all"] = relations_df.child.apply(lambda x:  relations_df[relations_df["child"]==x].minDistance.min() )
    relations_df.loc[:,"num_rels"] = relations_df.child.apply(lambda x:  relations_df[relations_df["child"]==x].child.count() )

    return relations_df

In [91]:
# Execute the functions
# Collect the per-article frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop copies all previous rows every pass.
article_frames = []
for _, article in articles.iterrows():
    # articles without annotated relationships are skipped
    if len(article.relationships) == 0:
        continue
    entity_df = getEntitydf(article)
    relations_df = getPairs(article)
    features_df = getFeatures(relations_df,entity_df)
    features_df.loc[:,"art_id"] = article.article_id
    article_frames.append(features_df)
# pd.concat refuses an empty list, so fall back to an empty frame as before
relationships = pd.concat(article_frames) if article_frames else pd.DataFrame()
print("relatinionships size %i,%i" % relationships.shape)
relationships.head()

relatinionships size 1513,24


Unnamed: 0,art_id,c_tag,c_word,p_tag,p_word,child,parent,label,rel_type,distances,...,c_first,p_first,c_first_sent,p_first_sent,c_title,p_title,p_torf,c_torf,min_all,num_rels
0,3,Col,lomas del carmen,City,san pedro sula,"{u'tag': u'Col', u'word': u'lomas del carmen'}","{u'tag': u'City', u'word': u'san pedro sula'}",1,Col-City,"[82, 55, 38, 101]",...,False,True,False,True,False,False,True,False,38,1
1,3,Zone,monumento a la madre,City,san pedro sula,"{u'tag': u'Zone', u'word': u'monumento a la ma...","{u'tag': u'City', u'word': u'san pedro sula'}",1,Zone-City,"[138, 111, 94, 45]",...,False,True,False,True,False,False,True,False,45,1
2,3,Zone,primera avenida,City,san pedro sula,"{u'tag': u'Zone', u'word': u'primera avenida'}","{u'tag': u'City', u'word': u'san pedro sula'}",1,Zone-City,"[140, 113, 96, 43]",...,False,True,False,True,False,False,True,False,43,1
3,3,Zone,parque central,City,san pedro sula,"{u'tag': u'Zone', u'word': u'parque central'}","{u'tag': u'City', u'word': u'san pedro sula'}",1,Zone-City,"[142, 115, 98, 41]",...,False,True,False,True,False,False,True,False,41,1
0,9,Col,kennedy,City,tegucigalpa,"{u'tag': u'Col', u'word': u'kennedy'}","{u'tag': u'City', u'word': u'tegucigalpa'}",1,Col-City,[283],...,False,True,False,True,False,False,True,False,31,2


### 1.4.3 Add gazetteer features

In [92]:
dep_mun_df = pd.read_csv("../files/DepartamentosMunicipios.csv", encoding="utf-8")

# Two extra (Departamento, Municipio) rows are added by hand. pd.concat is used
# because DataFrame.append is deprecated and absent from newer pandas releases.
dep_mun_df = pd.concat([
    dep_mun_df,
    pd.DataFrame([
        [u"francisco morazan",u"comayaguela"],
        [u"francisco morazan",u"distrito central"],
    ], columns=dep_mun_df.columns),
], ignore_index=True)

# same normalisation as the article text: ASCII, lower-case
dep_mun_df.Departamento = dep_mun_df.Departamento.apply(lambda x: to_ascii(x).lower() )
dep_mun_df.Municipio = dep_mun_df.Municipio.apply(lambda x: to_ascii(x).lower() )
# lookup key "<municipio>-<departamento>"
dep_mun_df.loc[:,"combined"] = dep_mun_df.Municipio + "-" + dep_mun_df.Departamento

dep_mun_df.tail()

Unnamed: 0,Departamento,Municipio,combined
295,yoro,sulaco,sulaco-yoro
296,yoro,victoria,victoria-yoro
297,yoro,yorito,yorito-yoro
298,francisco morazan,comayaguela,comayaguela-francisco morazan
299,francisco morazan,distrito central,distrito central-francisco morazan


In [93]:
world_cities = pd.read_csv("../files/ciudades_mundo.csv", encoding="utf-8")
# normalise both name columns to lower-case ASCII
world_cities.country = world_cities.country.apply(lambda name: to_ascii(name).lower())
world_cities.city = world_cities.city.apply(lambda name: to_ascii(name).lower())
# lookup key "<city>-<country>"
world_cities.loc[:,"combined"] = world_cities.city + "-" + world_cities.country
world_cities.head()

Unnamed: 0.1,Unnamed: 0,city,country,combined
0,0,canton,china,canton-china
1,1,tokio,japon,tokio-japon
2,2,shangai,china,shangai-china
3,3,yakarta,indonesia,yakarta-indonesia
4,4,delhi,india,delhi-india


In [94]:
# One 0/1 gazetteer flag per candidate pair, for three relation types:
#   2-1 State -> Country, 3-2 City -> State, 3-1 City -> Country
preds_2_1 = []
preds_3_2 = []
preds_3_1 = []
preds = []

# [tag, word, art_id] of every city already matched to a state by the gazetteer
city_states = []
for ix, rel in relationships.iterrows():
    # 2-1: a known Honduran department whose candidate parent is "honduras"
    # (could be improved with state lists for USA, Mexico, Colombia and others)
    state_in_honduras = (
        rel.c_tag == "State"
        and rel.c_word in dep_mun_df.Departamento.values
        and rel.p_word == "honduras"
    )
    preds_2_1.append(1 if state_in_honduras else 0)

    # 3-2: the "<city>-<state>" key exists in the department/municipality table
    city_in_state = (
        rel.c_tag == "City"
        and rel.p_tag == "State"
        and rel.c_word + "-" + rel.p_word in dep_mun_df.combined.values
    )
    if city_in_state:
        city_states.append([rel.c_tag, rel.c_word, rel.art_id])
    preds_3_2.append(1 if city_in_state else 0)

    # 3-1
    if rel.c_tag == "City" and rel.p_tag == "Country":
        if [rel.c_tag, rel.c_word, rel.art_id] in city_states:
            # if the same entity has a relationship with a state then implicit relation
            city_in_country = 0
        elif rel.c_word in dep_mun_df.Municipio.values and rel.p_word == "honduras":
            city_in_country = 1
        elif rel.c_word + "-" + rel.p_word in world_cities.combined.values:
            city_in_country = 1
        else:
            city_in_country = 0
    else:
        city_in_country = 0
    preds_3_1.append(city_in_country)


relationships.loc[:,"state_country"] = preds_2_1
relationships.loc[:,"city_state"] = preds_3_2
relationships.loc[:,"city_country"] = preds_3_1


# 2. Train a Classifier

In [95]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [97]:
#descriptor columns
# Identifier / raw-value columns plus the target 'label'; get_X drops them so
# they are not part of the feature matrix.
desc_columns = [ 'art_id', 'c_word', 'p_word', 'child', 'parent', 'distances', 'label']
# list every column currently present in the frame
relationships.columns

Index([u'art_id', u'c_tag', u'c_word', u'p_tag', u'p_word', u'child',
       u'parent', u'label', u'rel_type', u'distances', u'minDistance',
       u'maxDistance', u'num_c_entity', u'num_p_entity', u'c_first',
       u'p_first', u'c_first_sent', u'p_first_sent', u'c_title', u'p_title',
       u'p_torf', u'c_torf', u'min_all', u'num_rels', u'state_country',
       u'city_state', u'city_country'],
      dtype='object')

In [98]:
#some extra pre-processing
def get_X(df, desc_columns):
    """Return the feature matrix of `df` as a NumPy array.

    The categorical columns c_tag, p_tag and rel_type are replaced by integer
    codes (LabelEncoder, refitted for each column) and the columns named in
    `desc_columns` are removed. `df` itself is not modified: the encoded
    columns are written to a new frame.
    """
    le = LabelEncoder()
    #encode c_tag, p_tag and rel_type
    encoded = df.assign(
        c_tag=le.fit_transform(df.c_tag),
        p_tag=le.fit_transform(df.p_tag),
        rel_type=le.fit_transform(df.rel_type),
    )
    # axis is given by keyword: the positional form drop(cols, 1) is not accepted by newer pandas
    return encoded.drop(desc_columns, axis=1).values
# Show the feature vector of the first pair; a copy is passed so that
# `relationships` itself is left untouched.
get_X(relationships.copy(), desc_columns)[0]

array([2, 0, 5, 38, 101, 1, 4, False, True, False, True, False, False,
       True, False, 38, 1, 0, 0, 0], dtype=object)

## 2.2 Split and Train

In [99]:
#trains a model and scores it
def train_score(model, params, X_train, y_train, X_test, y_test,labels=[1]):
    """Grid-search `model` over `params`, print diagnostics, return the fitted search.

    The search uses 5-fold cross-validation and a weighted F1 scorer restricted
    to `labels`. After fitting, the best score, the best parameters and the
    mean train / test CV scores are printed, followed by a classification
    report on the training data and on the test data.

    Returns
    -------
    (clf, preds_train, preds_test)
        The fitted GridSearchCV and its predictions for X_train and X_test.
    """
    # `labels` defaults to a list object shared between calls; it is only read here
    f_one_scorer = make_scorer(f1_score,labels=labels, average="weighted")
    clf = GridSearchCV(model, params, cv=5, scoring= f_one_scorer, verbose=1, n_jobs=4 )
    clf.fit(X_train, y_train)

    # print(...) with a single argument behaves the same on Python 2 and 3
    print(clf.best_score_)
    print(clf.best_params_)
    # NOTE(review): 'mean_train_score' is only present when the search keeps
    # train scores; confirm against the installed scikit-learn version.
    print(clf.cv_results_['mean_train_score'])
    print(clf.cv_results_['mean_test_score'])

    print("- Train Results -")
    preds_train = clf.predict(X_train)

    print(classification_report(y_train, preds_train ))

    print("- Test Results -")
    preds_test = clf.predict(X_test)
    print(classification_report(y_test, preds_test))

    return clf, preds_train, preds_test

In [85]:
X = get_X(relationships.copy(), desc_columns)
y = relationships.label.values

# Hold-out split: only the first of the five stratified folds is used
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state= 773 )
train_index, test_index = next(iter(skf.split( X, y)))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]


# Single-point grid; the trailing comments list the values tried before
parameters = {
    "n_estimators": [200],
    "max_depth": [8], # [3,4,5,8,10,20],
    "min_samples_split" : [ 4], #[4,8,10,20,40],
#     "max_features": [.3], # [.8,.5,.3,.1]
}
rfc = RandomForestClassifier(random_state= 233, n_jobs=4)
clf, preds_train, preds_test = train_score(rfc, parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.7s finished


0.824492145412
{'min_samples_split': 4, 'n_estimators': 200, 'max_depth': 8}
[ 0.94045895]
[ 0.82449215]
- Train Results -
             precision    recall  f1-score   support

          0       0.92      0.99      0.95       700
          1       0.98      0.88      0.93       509

avg / total       0.95      0.94      0.94      1209

- Test Results -
             precision    recall  f1-score   support

          0       0.89      0.95      0.92       176
          1       0.92      0.84      0.88       128

avg / total       0.90      0.90      0.90       304



- Train Results -
             precision    recall  f1-score   support

          0       0.91      0.99      0.95       700
          1       0.98      0.87      0.93       509

avg / total       0.94      0.94      0.94      1209

- Test Results -
             precision    recall  f1-score   support

          0       0.86      0.94      0.90       176
          1       0.91      0.80      0.85       128

avg / total       0.88      0.88      0.88       304

In [100]:
def rel_type_scores(df):
    """Print one classification report per relationship type.

    `df` needs the columns rel_type, label (true value) and pred (prediction).
    Reports are printed for "State-Country", "City-Country" and "City-State",
    in that order; nothing is returned.
    """
    # each entry is a list so it can be handed to isin() directly
    # (all types present in the data would be: [[r] for r in df.rel_type.unique()])
    rel_types = [
        ["State-Country"], ["City-Country"], ["City-State"],
    ]

    for rel_type in rel_types:
        # rows of this relationship type, filtered once
        subset = df[df["rel_type"].isin(rel_type)]
        true_values = subset.label.values
        preds = subset.pred.values

        print("-- %s --" % rel_type)
        print(classification_report(true_values, preds))
        
# After reset_index(drop=True) the row labels equal the row positions, so the
# fold indices can be applied positionally with .iloc (.ix is deprecated and
# absent from newer pandas). .copy() makes the frames safe to add a column to.
train_relationships = relationships.reset_index(drop=True).iloc[train_index].copy()
test_relationships = relationships.reset_index(drop=True).iloc[test_index].copy()


train_relationships.loc[:,"pred"] = preds_train
test_relationships.loc[:,"pred"] = preds_test


print("-Train")
rel_type_scores(train_relationships)
print("")
print("-Test")
rel_type_scores(test_relationships)


-Train
-- ['State-Country'] --
             precision    recall  f1-score   support

          0       0.98      1.00      0.99        40
          1       1.00      0.98      0.99        64

avg / total       0.99      0.99      0.99       104

-- ['City-Country'] --
             precision    recall  f1-score   support

          0       0.95      1.00      0.97       153
          1       1.00      0.90      0.95        82

avg / total       0.97      0.97      0.97       235

-- ['City-State'] --
             precision    recall  f1-score   support

          0       0.88      0.99      0.93       134
          1       0.99      0.87      0.93       149

avg / total       0.94      0.93      0.93       283


-Test
-- ['State-Country'] --
             precision    recall  f1-score   support

          0       1.00      0.88      0.93         8
          1       0.92      1.00      0.96        12

avg / total       0.95      0.95      0.95        20

-- ['City-Country'] --
           

# Next Steps

- **OK** Check (somehow) the precision and recall of each relation type
- **OK** Use gazetteers
- Review the causes of the false negatives and false positives





In [101]:
# FALSE NEGATIVES: labelled as a relationship but predicted as none
test_relationships.query("label == 1 and pred == 0")


Unnamed: 0,art_id,c_tag,c_word,p_tag,p_word,child,parent,label,rel_type,distances,...,c_title,p_title,p_torf,c_torf,min_all,num_rels,state_country,city_state,city_country,pred
5,9,Col,kennedy,City,distrito central,"{u'tag': u'Col', u'word': u'kennedy'}","{u'tag': u'City', u'word': u'distrito central'}",1,Col-City,[31],...,False,False,False,False,31,2,0,0,0,0
52,46,Zone,la gloria,City,jesus de otoro,"{u'tag': u'Zone', u'word': u'la gloria'}","{u'tag': u'City', u'word': u'jesus de otoro'}",1,Zone-City,[3],...,False,False,False,False,3,9,0,0,0,0
124,66,City,miami,Country,ee.uu.,"{u'tag': u'City', u'word': u'miami'}","{u'tag': u'Country', u'word': u'ee.uu.'}",1,City-Country,"[1, 94]",...,False,False,False,False,1,11,0,0,0,0
258,301,City,caloto,State,cauca,"{u'tag': u'City', u'word': u'caloto'}","{u'tag': u'State', u'word': u'cauca'}",1,City-State,[6],...,False,False,False,False,6,2,0,0,0,0
287,389,Zone,citrico,City,el chilar,"{u'tag': u'Zone', u'word': u'citrico'}","{u'tag': u'City', u'word': u'el chilar'}",1,Zone-City,[5],...,False,False,False,False,5,4,0,0,0,0
317,445,City,buri,Country,nigeria,"{u'tag': u'City', u'word': u'buri'}","{u'tag': u'Country', u'word': u'nigeria'}",1,City-Country,"[185, 104]",...,False,True,True,False,29,4,0,0,0,0
512,865,Zone,comunidad economica europea,City,tegucigalpa,"{u'tag': u'Zone', u'word': u'comunidad economi...","{u'tag': u'City', u'word': u'tegucigalpa'}",1,Zone-City,[464],...,False,False,False,True,464,3,0,0,0,0
672,1082,City,chanmagua,Country,guatemala,"{u'tag': u'City', u'word': u'chanmagua'}","{u'tag': u'Country', u'word': u'guatemala'}",1,City-Country,"[74, 73, 47, 40]",...,False,True,True,False,16,3,0,0,0,0
715,1119,Zone,penitenciaria nacional,City,tegucigalpa,"{u'tag': u'Zone', u'word': u'penitenciaria nac...","{u'tag': u'City', u'word': u'tegucigalpa'}",1,Zone-City,[55],...,False,False,False,False,55,3,0,0,0,0
731,1131,City,langon,State,gironda,"{u'tag': u'City', u'word': u'langon'}","{u'tag': u'State', u'word': u'gironda'}",1,City-State,[6],...,False,False,False,False,6,2,0,0,0,0


In [102]:
# FALSE POSITIVES: predicted as a relationship but labelled as none
test_relationships.query("label == 0 and pred == 1")

# Notes on individual rows:
# row 50 looks odd, because it should be labelled positive
# art_id 806: the comayaguela vs tegucigalpa issue
# art_id 1121: "eeuu" vs "ee.uu." spelling
# art_id 1356: labelling mistake, the relationship is true
# art_id 1925: labelling mistake, the relationship is true

# art_id 1354 could be avoided with an "is in world_cities" check

Unnamed: 0,art_id,c_tag,c_word,p_tag,p_word,child,parent,label,rel_type,distances,...,c_title,p_title,p_torf,c_torf,min_all,num_rels,state_country,city_state,city_country,pred
50,35,City,trujilo,State,colon,"{u'tag': u'City', u'word': u'trujilo'}","{u'tag': u'State', u'word': u'colon'}",0,City-State,"[295, 297, 245, 247]",...,False,False,True,False,245,1,0,0,0,1
294,394,City,santa rosa de copan,State,cortes,"{u'tag': u'City', u'word': u'santa rosa de cop...","{u'tag': u'State', u'word': u'cortes'}",0,City-State,[4],...,False,False,False,False,4,1,0,0,0,1
393,564,Zone,universidad nacional autonoma de honduras,City,suyapa,"{u'tag': u'Zone', u'word': u'universidad nacio...","{u'tag': u'City', u'word': u'suyapa'}",0,Zone-City,[1],...,False,False,False,False,1,3,0,0,0,1
492,806,Zone,centro,City,tegucigalpa,"{u'tag': u'Zone', u'word': u'centro'}","{u'tag': u'City', u'word': u'tegucigalpa'}",0,Zone-City,[37],...,False,False,True,False,37,2,0,0,0,1
680,1083,Zone,jardines,City,siguatepeque,"{u'tag': u'Zone', u'word': u'jardines'}","{u'tag': u'City', u'word': u'siguatepeque'}",0,Zone-City,[110],...,False,False,True,False,105,3,0,0,0,1
723,1121,State,colorado,Country,eeuu,"{u'tag': u'State', u'word': u'colorado'}","{u'tag': u'Country', u'word': u'eeuu'}",0,State-Country,"[2, 17]",...,False,True,True,True,1,2,0,0,0,1
847,1354,City,bogota,Country,mexico,"{u'tag': u'City', u'word': u'bogota'}","{u'tag': u'Country', u'word': u'mexico'}",0,City-Country,"[0, 10, 75, 24, 85, 14, 75, 49, 10]",...,False,True,True,True,0,7,0,0,0,1
879,1356,City,distrito central,State,francisco morazan,"{u'tag': u'City', u'word': u'distrito central'}","{u'tag': u'State', u'word': u'francisco morazan'}",0,City-State,[57],...,False,False,False,False,55,2,0,1,0,1
1476,1925,Col,mata,City,siguatepeque,"{u'tag': u'Col', u'word': u'mata'}","{u'tag': u'City', u'word': u'siguatepeque'}",0,Col-City,"[51, 50]",...,False,True,True,False,8,3,0,0,0,1
