In [20]:
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler

In [21]:
def instantiate_string_matching(cleaned_text, tuples, threshold=0.8):
    """Build a Soft TF-IDF measure whose corpus is the tuples plus the text.

    The corpus holds one token list per ``prop``/``value`` tuple, followed by
    one token list per sentence of ``cleaned_text``.

    Args:
        cleaned_text: Text that is split into sentences with ``sent_tokenize``.
        tuples: Iterable of dicts with string entries under ``'prop'`` and
            ``'value'``.
        threshold: Forwarded to ``SoftTfIdf`` as its ``threshold`` argument.
            Defaults to 0.8, the value that was previously hard-coded here.

    Returns:
        A ``SoftTfIdf`` instance that uses ``JaroWinkler().get_raw_score`` as
        its ``sim_func``.
    """
    # Tuple documents come first, sentence documents second.
    corpus = [word_tokenize(tupl['prop'] + " " + tupl['value']) for tupl in tuples]
    corpus.extend(word_tokenize(sentence) for sentence in sent_tokenize(cleaned_text))

    return SoftTfIdf(corpus, sim_func=JaroWinkler().get_raw_score, threshold=threshold)

In [22]:
def token_sliding_window(tokens, size):
    """Yield each run of ``size`` consecutive items of ``tokens``, left to right.

    Consecutive windows are shifted by one position. Nothing is yielded when
    ``tokens`` holds fewer than ``size`` items.
    """
    start = 0
    last_start = len(tokens) - size
    while start <= last_start:
        yield tokens[start:start + size]
        start += 1

In [23]:
def score_sentences(cleaned_text, tupl, soft_tfidf, window_size=5):
    """Score every sentence of ``cleaned_text`` against one prop/value tuple.

    Each sentence is tokenized and compared, window by window, with the tokens
    of ``tupl['prop'] + " " + tupl['value']``. The sentence score is the
    highest window score, or 0.0 when no window scores above zero.

    A sentence with fewer than ``window_size`` tokens is scored as a single
    window made of all its tokens. Otherwise it would produce no window at all
    and could never receive a non-zero score.

    Args:
        cleaned_text: Text that is split into sentences with ``sent_tokenize``.
        tupl: Dict with string entries under ``'prop'`` and ``'value'``.
        soft_tfidf: Object whose ``get_raw_score(bag1, bag2)`` returns a number.
        window_size: Number of tokens per sliding window (default 5).

    Returns:
        A list with one ``{'sent': sentence, 'score': best_score}`` dict per
        sentence, in sentence order.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    tuple_tokens = word_tokenize(tupl['prop'] + " " + tupl['value'])

    scores = []
    for sentence in sent_tokenize(cleaned_text):
        sentence_tokens = word_tokenize(sentence)

        if len(sentence_tokens) < window_size:
            # Too short for a full window: compare the whole sentence once.
            windows = [sentence_tokens] if sentence_tokens else []
        else:
            windows = token_sliding_window(sentence_tokens, window_size)

        big_score = 0.0
        for window in windows:
            score = soft_tfidf.get_raw_score(window, tuple_tokens)
            if score > big_score:
                big_score = score

        scores.append({'sent': sentence, 'score': big_score})

    return scores
        

In [24]:
def _best_match_indices(query_tokens, sentence_tokens, soft_tfidf):
    """Return, for each query token, the index of its best-scoring sentence token.

    A query token that scores above 0.0 against no sentence token is reported
    as -1. On equal scores the earliest sentence position is kept.
    """
    best_indices = []
    for query_token in query_tokens:
        best_index, best_score = -1, 0.0
        for position, token in enumerate(sentence_tokens):
            score = soft_tfidf.get_raw_score([query_token], [token])
            if score > best_score:
                best_index, best_score = position, score
        best_indices.append(best_index)
    return best_indices


def score_sentence_and_set_ner(cleaned_text, tupl, soft_tfidf):
    """Pick the best-scoring sentence for ``tupl`` and tag its tokens.

    The sentence is chosen with ``score_sentences`` and
    ``select_bigger_score``. Every token of ``tupl['prop']`` and of
    ``tupl['value']`` is then aligned with the sentence token it scores
    highest against, and the sentence tokens are labelled accordingly.

    Args:
        cleaned_text: Text to search for a matching sentence.
        tupl: Dict with string entries under ``'prop'`` and ``'value'``.
        soft_tfidf: Object whose ``get_raw_score(bag1, bag2)`` returns a number.

    Returns:
        A ``(big_score, sentence_tokens, ner)`` tuple: the selected
        ``{'sent', 'score'}`` dict, the tokens of that sentence, and a
        parallel list of ``'PROP'``, ``'VALUE'`` or ``'O'`` labels.

    Raises:
        ValueError: If ``cleaned_text`` yields no sentence to score.
    """
    scores = score_sentences(cleaned_text, tupl, soft_tfidf)
    if not scores:
        # Same exception type max() would raise, with an actionable message.
        raise ValueError("cleaned_text contains no sentences to score")

    big_score = select_bigger_score(scores)
    sentence_tokens = word_tokenize(big_score['sent'])

    # Sets give constant-time membership; the -1 "no match" marker never
    # equals a real token index, so it is harmless inside them.
    prop_positions = set(
        _best_match_indices(word_tokenize(tupl['prop']), sentence_tokens, soft_tfidf))
    value_positions = set(
        _best_match_indices(word_tokenize(tupl['value']), sentence_tokens, soft_tfidf))

    ner = []
    for index in range(len(sentence_tokens)):
        # PROP wins when a token is the best match for both a prop and a value token.
        if index in prop_positions:
            ner.append('PROP')
        elif index in value_positions:
            ner.append('VALUE')
        else:
            ner.append('O')

    return big_score, sentence_tokens, ner

In [25]:
def read_files(text_dir, structured_dir, filename):
    """Load one document's free text and its prop/value tuples.

    The same ``filename`` is read from both directories.

    Args:
        text_dir: Directory holding the free-text file.
        structured_dir: Directory holding the structured file, one
            ``prop<TAB>:<TAB>value`` entry per line.
        filename: File name shared by both directories.

    Returns:
        A ``(text, tuples)`` pair. ``text`` is the file content with newlines
        replaced by spaces and surrounding whitespace stripped. ``tuples`` is
        a list of ``{'prop': ..., 'value': ...}`` dicts in which underscores
        have been replaced by spaces.
    """
    # os.path.join works whether or not the directories end with a separator.
    with open(os.path.join(text_dir, filename), 'r') as text_file:
        text = text_file.read().replace("\n", " ").strip()

    tuples = []
    with open(os.path.join(structured_dir, filename), 'r') as structured_file:
        for line in structured_file:
            items = line.replace("\n", "").replace("_", " ").split("\t:\t")
            if len(items) < 2:
                # Blank line or line without the separator: nothing to record.
                continue
            tuples.append({'prop': items[0], 'value': items[1]})

    return text, tuples

In [26]:
def select_bigger_score(scores):
    """Return the entry of ``scores`` with the largest ``'score'`` value.

    The first such entry is returned when several share the maximum.
    """
    def _score_of(entry):
        return entry['score']

    return max(scores, key=_score_of)

In [28]:
# Locations of the free-text documents and of their structured prop/value files.
text_dir = 'data/text/'
structured_dir = 'data/structured_data/'
filenames = [
    'Abbeville_County,_South_Carolina',
    'Acadia_Parish,_Louisiana',
    'Accomack_County,_Virginia',
]

# Only the first document in the list is processed here.
text, tuples = read_files(text_dir, structured_dir, filenames[0])

soft_tfidf = instantiate_string_matching(text, tuples)

for tupl in tuples:
    print(tupl)

    # Best-scoring sentence for this tuple plus one PROP/VALUE/O tag per token.
    # (score_sentences followed by select_bigger_score gives the sentence alone.)
    big_score, sentence_tokens, ner = score_sentence_and_set_ner(text, tupl, soft_tfidf)

    print(big_score['sent'], big_score['score'], sep='\n')
    print(list(zip(sentence_tokens, ner)))

    print()

{'value': 'Abbeville', 'prop': 'seat wl'}
History Both Abbeville County and the county seat, Abbeville, SC, get their name from the town of Abbeville, France.
0.4365824590665676
[('History', 'O'), ('Both', 'O'), ('Abbeville', 'VALUE'), ('County', 'O'), ('and', 'O'), ('the', 'O'), ('county', 'O'), ('seat', 'PROP'), (',', 'O'), ('Abbeville', 'O'), (',', 'O'), ('SC', 'O'), (',', 'O'), ('get', 'O'), ('their', 'O'), ('name', 'O'), ('from', 'O'), ('the', 'O'), ('town', 'O'), ('of', 'O'), ('Abbeville', 'O'), (',', 'O'), ('France', 'O'), ('.', 'O')]

{'value': '25417', 'prop': 'pop'}
2010 census As of the 2010 United States Census, there were 25,417 people, 9,990 households, and 6,939 families residing in the county.
0.8210561194911626
[('2010', 'O'), ('census', 'O'), ('As', 'O'), ('of', 'O'), ('the', 'O'), ('2010', 'O'), ('United', 'O'), ('States', 'O'), ('Census', 'O'), (',', 'O'), ('there', 'O'), ('were', 'O'), ('25,417', 'VALUE'), ('people', 'PROP'), (',', 'O'), ('9,990', 'O'), ('household

History Both Abbeville County and the county seat, Abbeville, SC, get their name from the town of Abbeville, France.
0.7663551144339781
[('History', 'O'), ('Both', 'O'), ('Abbeville', 'VALUE'), ('County', 'O'), ('and', 'O'), ('the', 'O'), ('county', 'O'), ('seat', 'O'), (',', 'VALUE'), ('Abbeville', 'O'), (',', 'O'), ('SC', 'O'), (',', 'O'), ('get', 'O'), ('their', 'O'), ('name', 'PROP'), ('from', 'PROP'), ('the', 'O'), ('town', 'O'), ('of', 'O'), ('Abbeville', 'O'), (',', 'O'), ('France', 'VALUE'), ('.', 'O')]

{'value': '4.0%', 'prop': 'area percentage'}
The other members and their districts are as following:  * Charlie Stone- District 1  * John Calhoun- District 2  * Claude Thomas- District 3  * William Norris- District 4  * Oscar Klugh- District 5  * Don Campbell- District 6  Geography According to the U.S. Census Bureau, the county has a total area of 511 square miles (1,320 km2), of which 490 square miles (1,300 km2) is land and 21 square miles (54 km2) (4.0%) is water.
0.4183855

The other members and their districts are as following:  * Charlie Stone- District 1  * John Calhoun- District 2  * Claude Thomas- District 3  * William Norris- District 4  * Oscar Klugh- District 5  * Don Campbell- District 6  Geography According to the U.S. Census Bureau, the county has a total area of 511 square miles (1,320 km2), of which 490 square miles (1,300 km2) is land and 21 square miles (54 km2) (4.0%) is water.
0.44932661784112105
[('The', 'O'), ('other', 'O'), ('members', 'O'), ('and', 'O'), ('their', 'O'), ('districts', 'PROP'), ('are', 'O'), ('as', 'O'), ('following', 'O'), (':', 'O'), ('*', 'O'), ('Charlie', 'O'), ('Stone-', 'O'), ('District', 'O'), ('1', 'O'), ('*', 'O'), ('John', 'O'), ('Calhoun-', 'O'), ('District', 'O'), ('2', 'O'), ('*', 'O'), ('Claude', 'O'), ('Thomas-', 'O'), ('District', 'O'), ('3', 'VALUE'), ('*', 'O'), ('William', 'O'), ('Norris-', 'O'), ('District', 'O'), ('4', 'O'), ('*', 'O'), ('Oscar', 'O'), ('Klugh-', 'O'), ('District', 'O'), ('5', 'O'),