In [26]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import f1_score, make_scorer, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import unicodedata
from sklearn.preprocessing import LabelEncoder

# PREPROCESS

In [19]:
articles_df = pd.read_csv("../files/pos_articles.csv")
articles_df.tagged_content = articles_df.tagged_content.apply(lambda x: ast.literal_eval(x))
articles_df.head()

Unnamed: 0,article_id,relationships,tagged_title,tagged_content
0,3,"[[{u'tag': u'B-Col', u'word': u'lomas del carm...","[[u'Citan', 'NC', u'none'], [u'a', u'SP', u'no...","[[[SAN, AQ, B-City], [PEDRO, NC, I-City], [SUL..."
1,5,[],"[[u'DEI', 'NC', u'none'], [u'pide', u'VMI', u'...","[[[TEGUCIGALPA, NC, B-City]], [[-, Fg, none], ..."
2,9,"[[{u'tag': u'B-Col', u'word': u'kennedy'}, {u'...","[[u'Alcald\xeda', 'NC', u'none'], [u'intensifi...","[[[TEGUCIGALPA, NC, B-City]], [[-, Fg, none], ..."
3,11,"[[{u'tag': u'B-City', u'word': u'puerto cortes...","[[u'Pasajeros', 'NC', u'none'], [u'asaltantes'...","[[[PUERTO, NC, B-City], [CORTES, NC, I-City], ..."
4,12,"[[{u'tag': u'B-City', u'word': u'juticalpa'}, ...","[[u'Fallece', 'NC', u'none'], [u'comerciante',...","[[[JUTICALPA, NC, B-City], [,, Fc, none], [Ola..."


## Convert to Sentences

In [60]:
def articlesToSentences(df):
    #convert to sentences
    sentences = []
    cs_id = -1 #corpus sentence id
    for artid, article in df.iterrows():
        for sentid, sentence in enumerate(article.tagged_content):
            if len(sentence) == 0:
                continue
            cs_id += 1
            num_entities = 0
            for word in sentence:
                if word[2][0] == "B":
                    num_entities += 1 
            sentences.append([article.article_id,sentid,cs_id,sentence, num_entities,len(sentence)])
    
    return pd.DataFrame(sentences,columns=["art_id","sent_id","cs_id","sentence","num_entities","num_words"])
        
sentences_df = articlesToSentences(articles_df)    
sentences_df.head()

Unnamed: 0,art_id,sent_id,cs_id,sentence,num_entities,num_words
0,3,0,0,"[[SAN, AQ, B-City], [PEDRO, NC, I-City], [SULA...",1,3
1,3,1,1,"[[-, Fg, none], [Hasta, SP, none], [el, DA, no...",1,32
2,3,2,2,"[[Los, DA, none], [operativos, AQ, none], [con...",1,16
3,3,3,3,"[[Resaltó, VMI, none], [que, CS, none], [graci...",1,42
4,3,4,4,"[[Indicó, VMI, none], [que, CS, none], [tienen...",0,35


In [59]:
sentences_df[sentences_df["num_words"]==0]

Unnamed: 0,art_id,sent_id,cs_id,sentence,num_entities,num_words
9,3,9,9,[],0,0
20,5,10,20,[],0,0
39,9,18,39,[],0,0
51,11,11,51,[],0,0
71,12,19,71,[],0,0
80,13,8,80,[],0,0
101,16,20,101,[],0,0
111,20,9,111,[],0,0
125,24,13,125,[],0,0
132,30,6,132,[],0,0


## Stratified Split 

In [64]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5,shuffle=True, random_state= 773 )  #233
for train_index, test_index in skf.split( sentences_df, sentences_df.num_entities.tolist()):
    sentences_train = sentences_df.loc[train_index]
    sentences_test = sentences_df.loc[test_index]
    break #we only need one iteration since it is a simple split

    

## Convert To Words


In [24]:
def convertToWords(sentences):
    data =[]
    for _, sent in sentences.iterrows():
        for pos, word in enumerate(sent.sentence):
            data.append([sent.art_id, sent.sent_id, sent.cs_id, pos, word[0],word[1],word[2]])
    return pd.DataFrame(data,columns=["art_id","sent_id","cs_id","pos","word","pos_tag","iob_tag"])
words_train = convertToWords(sentences_train)
words_test = convertToWords(sentences_test)
words_train.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag
0,3,1,1,0,-,Fg,none
1,3,1,1,1,Hasta,SP,none
2,3,1,1,2,el,DA,none
3,3,1,1,3,momento,NC,none
4,3,1,1,4,ocho,DN,none


In [25]:
words_test.head()

Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag
0,3,0,0,0,SAN,AQ,B-City
1,3,0,0,1,PEDRO,NC,I-City
2,3,0,0,2,SULA,NC,I-City
3,3,6,6,0,La,DA,none
4,3,6,6,1,Policía,NC,none


## Generate features

In [27]:
# very useful function to avoid mispellings problems.
def to_ascii(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

In [33]:
# finds if a word or a group of words are contained in a gazette
# words parameter is a list of 5 words where the 3rd word is the base word 
# and the others are the previous or next words
def entity_in_gazette(words,gazette):
    #use base word
    if words[2] in gazette:
        return 1
    # bigram
    elif " ".join(words[1:3]) in gazette \
        or " ".join(words[2:4]) in gazette:
        return 1
    # trigram
    elif  " ".join(words[0:3]) in gazette \
        or " ".join(words[1:4]) in gazette \
        or " ".join(words[2:5]) in gazette:
        return 1
    else:
        return 0

In [71]:
def getFeatures(df, le_iob=None, le_tag=None):
    print "- getting features"
    
    # iob
    df.loc[:,"iob_tag"] = df.iob_tag.apply(lambda x: x[0] if x!="none" else "O")
    if le_iob == None:
        le_iob = LabelEncoder()
        le_iob.fit(df.iob_tag)
    df.loc[:,"iob_tag"] = le_iob.transform(df.iob_tag)

    print "-- iob tag classes"
    print le_iob.classes_
    # tag
    if le_tag == None:
        le_tag = LabelEncoder()
        le_tag.fit(df.pos_tag)
    df.loc[:,"pos_tag"] = le_tag.transform(df.pos_tag)
    # print "-- pos tag classes"
    # print le_tag.classes_
    # If word first letter is uppercase
    df.loc[:,"upper"] = df.word.apply(lambda x: int(x[0].isupper()) )
    upper = df.upper.values.tolist()
    df.loc[:,"upper_prev1"] = [0] + upper[:-1]
    df.loc[:,"upper_next1"] =  upper[1:] + [0]
    # first word in sentence
    df.loc[:,"first"] = df.pos.apply(lambda x: int(x == 0) )
    # size
    df.loc[:,"size"] = df.word.apply(lambda x: len(x))
    # first sentence
    df.loc[:,"first_sent"] = df.sent_id.apply(lambda x: int(x==0)).values

    # TAGS
    # add tag features by shifting the tag list 
    tags = df.pos_tag.values.tolist()

    df.loc[:,"prev_1"] = tags[-1:] + tags[:-1]
    # df.at[df["pos"] < 1,'prev_1'] = -1
    df.loc[:,"prev_2"] = tags[-2:] + tags[:-2]
    df.at[df["pos"] < 2,'prev_2'] = -1

    df.loc[:,"next_1"] = tags[1:] + tags[:1]
    df.loc[:,"next_2"] = tags[2:] + tags[:2]

    # PREVIOUS WORDS (In spanish the type of location are written before the NE )
    loc_types = ["colonia", "barrio", "residencial","ciudad", 
    "aldea","zona","puente","mercado","bulevar","centro","estado"]
    words = df.word.apply(lambda x: x.lower() ).values.tolist()
    df.loc[:,"prev_prefix_1"] = words[-1:] + words[:-1]
    df.loc[:,"prev_prefix_1"]  = df.prev_prefix_1.apply(lambda x:  int( x in loc_types ))
    
    df.loc[:,"prev_prefix_2"] = words[-2:] + words[:-2]
    df.loc[:,"prev_prefix_2"]  = df.prev_prefix_2.apply(lambda x:  int( x in loc_types ))

    #Gazette features
    countries_df = pd.read_csv("../files/countries.csv", encoding="utf-8")
    dep_mun_df = pd.read_csv("../files/DepartamentosMunicipios.csv", encoding="utf-8")
    world_cities = pd.read_csv("../files/ciudades_mundo.csv", encoding="utf-8")
    #converting to ascii because of accents (tilde)
    countries_df.value = countries_df.value.apply(lambda x: to_ascii(x).lower() )
    dep_mun_df.Departamento = dep_mun_df.Departamento.apply(lambda x: to_ascii(x).lower() )
    dep_mun_df.Municipio = dep_mun_df.Municipio.apply(lambda x: to_ascii(x).lower() )
    world_cities.city = world_cities.city.apply(lambda x: to_ascii(x).lower() )

    country = []
    state = []
    city = []

    words = df.word.apply(lambda x: to_ascii(x).lower() ).values.tolist()
    prev_2 = ["",""] + words[:-2]
    prev_1 = [""] + words[:-1]
    next_1 = words[1:] + [""]
    next_2 = words[2:] + ["",""]

    #convert to dummies
    # columns = ["prev_1", "prev_2","next_1","next_2","pos_tag"]
    # df = hot_encode(df,columns)

    
    
    #Sentence Size
    sentence_size = pd.DataFrame({
        "cs_id":df.cs_id.value_counts().index.values,
        "sent_size": df.cs_id.value_counts().values
    })
    
    
    df = df.merge(sentence_size, left_on='cs_id', right_on='cs_id', how='left')

    cities_gazette = np.concatenate([dep_mun_df.Municipio.values, world_cities.city.values])

    for idx in range(len(words)):
        word_list = [prev_2[idx],prev_1[idx],words[idx],next_1[idx],next_2[idx]]
        
        country.append( entity_in_gazette(word_list,countries_df.value.values) )
        state.append( entity_in_gazette(word_list,dep_mun_df.Departamento.values) )
        city.append( entity_in_gazette(word_list, cities_gazette ) )

    df.loc[:,"in_Country"] = country
    df.loc[:,"in_State"] = state
    df.loc[:,"in_City"] = city
    
    return df, le_iob, le_tag

In [72]:
le_tag = LabelEncoder()
le_tag.fit(words_train.pos_tag.values.tolist() + words_test.pos_tag.values.tolist()  )

features_train, le_iob, _ = getFeatures(words_train.copy(), le_tag= le_tag)
features_test,_,_ = getFeatures(words_test.copy(), le_iob=le_iob, le_tag=le_tag)
features_train.head()

- getting features
-- iob tag classes
[u'B' u'I' 'O']
- getting features
-- iob tag classes
[u'B' u'I' 'O']


Unnamed: 0,art_id,sent_id,cs_id,pos,word,pos_tag,iob_tag,upper,upper_prev1,upper_next1,...,prev_1,prev_2,next_1,next_2,prev_prefix_1,prev_prefix_2,sent_size,in_Country,in_State,in_City
0,3,1,1,0,-,13,2,0,0,1,...,28,-1,34,4,0,0,32,0,0,0
1,3,1,1,1,Hasta,34,2,1,0,0,...,13,-1,4,22,0,0,32,0,0,0
2,3,1,1,2,el,4,2,0,1,0,...,34,13,22,7,0,0,32,0,0,0
3,3,1,1,3,momento,22,2,0,0,0,...,4,34,7,22,0,0,32,0,0,0
4,3,1,1,4,ocho,7,2,0,0,0,...,22,4,22,35,0,0,32,0,0,0


# Train HMM Classifier

In [109]:
def hot_encode(df,columns):
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
#         df = pd.concat([df, dummies], axis=1)

#         df = df.drop(column, axis=1)
    return dummies
dummy_train = hot_encode(features_train,["pos_tag","prev_1","prev_2","next_1","next_2"])
dummy_train.loc[:,"iob_tag"] = features_train.iob_tag
dummy_train.head()

Unnamed: 0,next_2_0,next_2_1,next_2_2,next_2_3,next_2_4,next_2_5,next_2_6,next_2_7,next_2_8,next_2_9,...,next_2_42,next_2_43,next_2_44,next_2_45,next_2_46,next_2_47,next_2_48,next_2_49,next_2_50,iob_tag
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [110]:
# formats in X and y
def getXY(df, dropColumns, includeColumns=None, names=False):
#     df = addNewFeatures(df)
    if includeColumns == None:
        X = df.drop(dropColumns, 1).values
    else:
        X = df[includeColumns].values
    y = df.iob_tag.values
    # print "-- iob counts --"
    # print df.iob.value_counts()
    if names:
        features_names = df.drop(dropColumns, 1).columns
        return X, y, features_names
    return X, y

In [111]:
desc_columns = ["art_id","sent_id", "cs_id","word", "iob_tag" ]
# X_train, y_train, features_names = getXY(features_train, desc_columns, names=True)
# X_test, y_test = getXY(features_test, desc_columns)
X_train, y_train, features_names = getXY(dummy_train, ["iob_tag"], names=True)
len_train = sentences_train["num_words"].values.tolist()
# len_test = sentences_test["num_words"].values.tolist()

In [112]:
# ----------------  AQUI   --------------------#
# You need to see why these numbers aren't matching before continuing
print sum(len_train)
print features_train.shape  

55371
(55517, 23)


In [113]:
from seqlearn.hmm import MultinomialHMM
clf = MultinomialHMM(decode="bestfirst")
clf.fit(X_train, y_train, len_train)
print "- Train Results -"
preds_train = clf.predict(X_train)
print classification_report(y_train, preds_train )

# print "- Test Results -"
# preds_test = clf.predict(X_test)
# print classification_report(y_test, preds_test)

- Train Results -
             precision    recall  f1-score   support

          0       0.00      0.01      0.00       971
          1       0.01      0.95      0.02       598
          2       0.99      0.11      0.20     53948

avg / total       0.96      0.12      0.20     55517



In [105]:
# len_train[-1]
# y_train[:30]
clf.classes_
len_train[0]
X_train[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2])

In [84]:
features_train.loc[0]

art_id            3
sent_id           1
cs_id             1
pos               0
word              -
pos_tag          13
iob_tag           2
upper             0
upper_prev1       0
upper_next1       1
first             1
size              1
first_sent        0
prev_1           28
prev_2           -1
next_1           34
next_2            4
prev_prefix_1     0
prev_prefix_2     0
sent_size        32
in_Country        0
in_State          0
in_City           0
Name: 0, dtype: object

# TEST

In [88]:
text = [w.split() for w in ["this DT",
                            "is V",
                            "a DT",
                            "test N",
                            "for IN",
                            "a DT",
                            "hidden Adj",
                            "Markov N",
                            "model N"]]
words, y = zip(*text)
lengths = [len(text)]

vocab, identities = np.unique(words, return_inverse=True)
X = (identities.reshape(-1, 1) == np.arange(len(vocab))).astype(int)

In [89]:
X

array([[0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0]])