## Country & City parser

### Loading and using the parser

In [6]:
def word2vec(ls):
    return [token.vector.tolist() for token in ls]

def feature(x):
    """
    This function takes in a list of word vectors and convert to input features (eg below)
    """
    for sent in x:
        for word in sent:
            for idx, feature in enumerate(word):
                word[idx] = 'feature%s=%s' %(idx,feature)    

In [8]:
from sklearn.externals import joblib
import spacy
import pandas as pd
from IPython.display import display
import os

nlp = spacy.load('en_core_web_lg')

# data is a single column dataframe
file = os.path.join(os.getcwd(),'test_data.csv')
df = pd.read_csv(file, encoding='latin1')
display(df.head())

# preprocess the data
df['sentence'] = df['sentence'].str.lower()
df['vector'] = df['sentence'].copy()
df['vector'] = df['vector'].apply(lambda x: nlp(str(x)))
df['vector'] = df['vector'].apply(word2vec)

# convert data to model inputs
feature(df['vector'].tolist())

# load the model from disk and run it
# each parser runs seperately as compared to one after another
country_parser = joblib.load('leo_country_parser.pkl')
country_result = country_parser.predict(df['vector'].tolist())
df['country_labels'] = pd.Series(country_result).to_frame('country_labels')

city_parser = joblib.load('leo_city_parser.pkl')
city_result = city_parser.predict(df['vector'].tolist())
df['city_labels'] = pd.Series(city_result).to_frame('city_labels')

df2 = df[['sentence', 'country_labels', 'city_labels']]
df2.head()

# 'C' indicates a country
# 'Ci' indicates a city

Unnamed: 0,sentence
0,Stay in Maldives with The St. Regis Hotels. Lo...
1,Relax in Style At Hilton's Barbados Beachfront...
2,Enjoy Being on the Shores of The Gulf of Mexic...
3,Tap to Book Your Room in Puerto Rico. Enjoy a ...
4,"Luxury in Port of Spain, Trinidad. Best Rates ..."


Unnamed: 0,sentence,country_labels,city_labels
0,stay in maldives with the st. regis hotels. lo...,"[I, I, C, I, I, I, I, I, I, I, I, I, I, I, I, I]","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I]"
1,relax in style at hilton's barbados beachfront...,"[I, I, I, I, I, I, C, I, I, I, I, I, I]","[I, I, I, I, I, I, I, I, I, I, I, I, Ci]"
2,enjoy being on the shores of the gulf of mexic...,"[I, I, I, I, I, I, I, I, I, C, I, I, I, I, I, ...","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
3,tap to book your room in puerto rico. enjoy a ...,"[I, I, I, I, I, I, C, C, I, I, I, I, I, I, I, ...","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
4,"luxury in port of spain, trinidad. best rates ...","[I, I, I, I, C, I, C, I, I, I, I, I, I, I, I]","[I, I, I, I, I, I, I, I, I, I, I, I, I, I, I]"


## Supplementay code on how the parser was created =================================================================

###  Country parser training steps

1) replace country name with label <br>
2) tokenize the sentence using spacy <br>
3) re-label sentence with "C" for COUNTRY and "I" for non-COUNTRY word <br>
4) create word vector using spacy.vector <br>
5) split test-train <br>
6) train the model using Conditional random fields <br>
7) check result with classification report <br>

In [5]:
import os
import pandas as pd
import spacy
import re

# have to use large model as it greatly affects the parser accuracy
nlp = spacy.load('en_core_web_lg')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', -1)

from sklearn.grid_search import RandomizedSearchCV

In [7]:
# read in raw training data(df) and country list(country)

file = os.path.join(os.getcwd(),'country_dataset.csv')
df = pd.read_csv(file, encoding='latin1')

country_file = os.path.join(os.getcwd(), 'country_list.txt')
df_country = pd.read_csv(country_file, sep='\t')

country = df_country['COUNTRY_NAME'].tolist()

df.head()

Unnamed: 0,sentence,labelled,vectors
0,Fantastic New Holiday Lodges Available To Buy In Scotland. Book A Viewing Now!,Fantastic New Holiday Lodges Available To Buy In COUNTRY. Book A Viewing Now!,Fantastic New Holiday Lodges Available To Buy In Scotland. Book A Viewing Now!
1,Aruba north of the coast of Venezuela,Aruba north of the coast of Venezuela,Aruba north of the coast of Venezuela
2,"Together with Bonaire and Curaçao, Aruba forms a group referred to as the ABC islands","Together with Bonaire and Curaçao, Aruba forms a group referred to as the ABC islands","Together with Bonaire and Curaçao, Aruba forms a group referred to as the ABC islands"
3,"Collectively, Aruba and the other Dutch islands in the Caribbean are often called the Dutch Caribbean","Collectively, Aruba and the other Dutch islands in the Caribbean are often called the Dutch Caribbean","Collectively, Aruba and the other Dutch islands in the Caribbean are often called the Dutch Caribbean"
4,"Aruba is one of the four countries that form the Kingdom of the Netherlands, along with the Netherlands, Curaçao, and Sint Maarten; the citizens of these countries are all Dutch nationals","Aruba is one of the four countries that form the Kingdom of the Netherlands, along with the Netherlands, Curaçao, and Sint Maarten; the citizens of these countries are all Dutch nationals","Aruba is one of the four countries that form the Kingdom of the Netherlands, along with the Netherlands, Curaçao, and Sint Maarten; the citizens of these countries are all Dutch nationals"


In [4]:
# lowercase everything

country = [word.lower() for word in country]
df['sentence'] = df['sentence'].str.lower()
df['labelled'] = df['labelled'].str.lower()
df['vectors'] = df['vectors'].str.lower()

In [5]:
# 1) convert country name to COUNTRY using regex
    # match when country,/ country / country/ country*

def replace_country(text):
    for item in country:
        match = '%s,|%s$|%s\s|%s[^A-Za-z]' %(item,item,item,item)
        if re.search(match, text, re.I|re.M):
            text = re.sub(item, 'COUNTRY '*len(item.split(' ')), text)
    return text
    
df['labelled'] = df['labelled'].apply(replace_country)
df['labelled'] = df['labelled'].str.replace('  ', ' ', 5)

In [6]:
# 2) Tokenize the sentence

def to_str(row):
    return [str(word) for word in row]

df['vectors'] = df['vectors'].apply(lambda x: nlp(str(x)))
df['labelled'] = df['labelled'].apply(lambda x: nlp(str(x)))
df['labelled'] = df['labelled'].apply(to_str)

In [58]:
# check for inconsistent tokenizing
def getlen(ls):
    return len(ls)

df['labelled_len'] = df['labelled'].apply(getlen)
df['vectors_len'] = df['vectors'].apply(getlen)
remov = df[df['labelled_len'] != df['vectors_len']]

# then remove them
df.drop(remov.index.values, inplace=True)
print(len(df))

# check token length again
df['labelled_len'] = df['labelled'].apply(getlen)
df['vectors_len'] = df['vectors'].apply(getlen)
df[df['labelled_len'] != df['vectors_len']]

4022


Unnamed: 0,sentence,labelled,vectors,labelled_len,vectors_len


In [39]:
# 3) re-label the words with "I"  and  "C"

def labelling1(row):
    row = [word.replace(word, 'I') if word != 'COUNTRY' else word.replace(word, 'C') for word in row]
    return row

df['labelled'] = df['labelled'].apply(labelling1)

In [40]:
# 4) create word vectors using spacy word embedding

df['vectors2'] = df['vectors']

# assume that the order of word embedding vector does not change
def word2vec(ls):
    return [token.vector.tolist() for token in ls]

df['vectors2'] = df['vectors2'].apply(word2vec)

### Cleaning dataset complete
### Proceed to train-test split, train, test, confusion matrix

In [11]:
from sklearn.model_selection import train_test_split

### train-test-split

In [12]:
def feature(x):
    for sent in x:
        for word in sent:
            for idx, feature in enumerate(word):
                word[idx] = 'feature%s=%s' %(idx,feature)

In [13]:
# reshuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

In [43]:
# 5) train-test split

train, test = train_test_split(df, test_size=0.1)

x_train = train['vectors2'].tolist()
y_train = train['labelled'].tolist()
x_test = test['vectors2'].tolist()
y_test = test['labelled'].tolist()
y_test_actual = test['vectors'].tolist()

# feature(x_train)
# feature(x_test)

print('no. of train = %s \nno.of test = %s \n ' % (len(x_train), len(x_test)))

no. of train = 3619 
no.of test = 403 
 


In [22]:
x_test[0][0][:5]

['feature0=0.2720400094985962',
 'feature1=-0.06202999874949455',
 'feature2=-0.188400000333786',
 'feature3=0.023225000128149986',
 'feature4=-0.018157999962568283']

### 6) Trainining the model

In [19]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
import scipy

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.14348520529810901,
    c2=0.083485913568447034,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(x_train, y_train)

In [20]:
labels = list(crf.classes_)
labels

y_pred = crf.predict(x_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3 ))

             precision    recall  f1-score   support

          C      0.959     0.952     0.955       585
          I      0.996     0.997     0.997      7629

avg / total      0.994     0.994     0.994      8214



In [49]:
# GridsearchCV

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 162.3min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_s...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7febf6ebd828>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7febf6ebd940>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, average=weighted, labels=['I', 'C']),
          verbose=1)

In [50]:
crf2 = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.14348520529810901, 'c2': 0.083485913568447034}
best CV score: 0.9306674015840242
model size: 11.57M


In [None]:
from sklearn.externals import joblib

# save the model to disk
filename = 'country_parser.pkl'
joblib.dump(crf, filename)

## error diagnostics

In [88]:
test['y_pred'] = ''

for table, result in zip(test.iterrows(), y_pred):
    test.set_value(table[0], 'y_pred', result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [89]:
def matchlabels(df):
    return df['labelled'] == df['y_pred']
    
test['match'] =''
test['match'] = test[:10].apply(matchlabels, axis=1)

mismatchdf = test[test['match'] == False]

mismatchdf.drop('vectors2', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,sentence,labelled,vectors,labelled_len,vectors_len,y_pred,match
1268,"articles, photos and video about bosnia and herzegovina from the new york times, with reader advice on where and when to go.","[I, I, I, I, I, I, C, C, C, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I]","(articles, ,, photos, and, video, about, bosnia, and, herzegovina, from, the, new, york, times, ,, with, reader, advice, on, where, and, when, to, go, .)",25,25,"[I, I, I, I, I, I, C, I, C, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I]",False


1) southern sicily  -->  tagged "southern" as "C" instead of "I" <br>
2) bosnia and herzegovina  --> tagged "and" as "I: instead of "C" 
