In [7]:
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler

In [73]:
def instantiate_string_matching(cleaned_text, tuples):
    """Build a SoftTfIdf scorer whose corpus covers the tuples and the text.

    Each tuple contributes one tokenized document made of its 'prop' and
    'value' strings joined by a space; each sentence of ``cleaned_text``
    contributes one tokenized document after that.

    Args:
        cleaned_text: Text that is split into sentences with ``sent_tokenize``.
        tuples: Iterable of dicts holding strings under 'prop' and 'value'.

    Returns:
        A ``SoftTfIdf`` instance built over that corpus, using
        ``JaroWinkler().get_raw_score`` as ``sim_func`` and a threshold of 0.8.
    """
    # Tuple documents go first, sentence documents second.
    tuple_docs = [
        word_tokenize(entry['prop'] + " " + entry['value'])
        for entry in tuples
    ]
    sentence_docs = [
        word_tokenize(sent)
        for sent in sent_tokenize(cleaned_text)
    ]
    documents = tuple_docs + sentence_docs

    return SoftTfIdf(documents, sim_func=JaroWinkler().get_raw_score, threshold=0.8)

In [69]:
def token_sliding_window(tokens, size):
    """Yield consecutive windows of ``size`` tokens, advancing one token at a time.

    A non-empty ``tokens`` sequence shorter than ``size`` is yielded once as a
    single (shorter) window, so short inputs are not silently dropped. An
    empty sequence yields nothing.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of strings).
        size: Window length; must be at least 1.

    Yields:
        Slices of ``tokens`` with ``size`` items each, or one copy of the
        whole sequence when it has fewer than ``size`` items.

    Raises:
        ValueError: If ``size`` is smaller than 1. Because this is a
            generator, the error surfaces on first iteration.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    # Too short for a full window: emit what there is instead of nothing.
    if 0 < len(tokens) < size:
        yield tokens[:]
        return

    for start in range(len(tokens) - size + 1):
        yield tokens[start:start + size]

In [65]:
# Takes the cleaned text and a dict with keys 'prop' and 'value'.
def score_sentences(cleaned_text, tupl, soft_tfidf, window_size=5):
    """Score every sentence of the text against one property/value tuple.

    The tuple's 'prop' and 'value' strings are joined by a space and
    tokenized. Each sentence is tokenized and cut into token windows; every
    window is compared to the tuple tokens with ``soft_tfidf.get_raw_score``
    and the sentence keeps the highest window score.

    Args:
        cleaned_text: Text that is split into sentences with ``sent_tokenize``.
        tupl: Dict holding strings under 'prop' and 'value'.
        soft_tfidf: Object exposing ``get_raw_score(tokens_a, tokens_b)``.
        window_size: Number of tokens per window (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        A list with one dict per sentence, in text order, of the form
        ``{'sent': <sentence>, 'score': <best window score>}``. The score is
        0.0 when no window scores above zero, including when the sentence
        produces no windows.
    """
    tuple_tokens = word_tokenize(tupl['prop'] + " " + tupl['value'])

    scores = []
    for sentence in sent_tokenize(cleaned_text):
        sentence_tokens = word_tokenize(sentence)

        window_scores = [
            soft_tfidf.get_raw_score(window, tuple_tokens)
            for window in token_sliding_window(sentence_tokens, window_size)
        ]
        # Seed with 0.0 so a sentence without windows still gets a score.
        best_score = max([0.0] + window_scores)

        scores.append({'sent': sentence, 'score': best_score})

    return scores
        

In [66]:
def read_files(text_dir, structured_dir, filename):
    """Load the free text and the property/value tuples stored under one name.

    The same ``filename`` is read from both directories. The text file is
    flattened to a single line (newlines become spaces) and stripped of
    surrounding whitespace. The structured file is read line by line; each
    line has underscores replaced by spaces and is split on the
    ``"\\t:\\t"`` separator into a property and a value.

    Args:
        text_dir: Directory containing the text file.
        structured_dir: Directory containing the structured file.
        filename: File name present in both directories.

    Returns:
        A ``(text, tuples)`` pair where ``text`` is the flattened string and
        ``tuples`` is a list of ``{'prop': ..., 'value': ...}`` dicts in file
        order.
    """
    # Context managers guarantee both handles are closed, even on error.
    with open(os.path.join(text_dir, filename), 'r') as text_file:
        text = text_file.read().replace("\n", " ").strip()

    tuples = []
    with open(os.path.join(structured_dir, filename), 'r') as structured_file:
        for line in structured_file:
            items = line.replace("\n", "").replace("_", " ").split("\t:\t")
            if len(items) < 2:
                # Blank line or missing separator: no prop/value pair to keep.
                continue
            tuples.append({'prop': items[0], 'value': items[1]})

    return text, tuples

In [67]:
def select_bigger_score(scores):
    """Return the entry of ``scores`` with the highest 'score' value.

    Args:
        scores: Non-empty iterable of dicts that each hold a 'score' entry.

    Returns:
        The entry whose 'score' is largest; on ties, the first such entry.
    """
    def _score_of(entry):
        return entry['score']

    return max(scores, key=_score_of)

In [75]:
# Input locations: free text and the structured prop/value data share file names.
text_dir = 'data/text/'
structured_dir = 'data/structured_data/'
filenames = [
    'Abbeville_County,_South_Carolina',
    'Acadia_Parish,_Louisiana',
    'Accomack_County,_Virginia',
]

# Only the first file in the list is processed in this cell.
text, tuples = read_files(text_dir, structured_dir, filenames[0])
soft_tfidf = instantiate_string_matching(text, tuples)

# For every tuple, print the tuple, then its best-scoring sentence, then a blank line.
for tupl in tuples:
    print(tupl)
    scores = score_sentences(text, tupl, soft_tfidf)
    print(select_bigger_score(scores))
    print()

{'prop': 'seat wl', 'value': 'Abbeville'}
{'score': 0.4365824590665676, 'sent': 'History Both Abbeville County and the county seat, Abbeville, SC, get their name from the town of Abbeville, France.'}

{'prop': 'pop', 'value': '25417'}
{'score': 0.8210561194911629, 'sent': '2010 census As of the 2010 United States Census, there were 25,417 people, 9,990 households, and 6,939 families residing in the county.'}

{'prop': 'web', 'value': 'www.abbevillecountysc.com'}
{'score': 0.0, 'sent': 'Abbeville County is a county located in the U.S. state of South Carolina.'}

{'prop': 'time zone', 'value': 'Eastern'}
{'score': 0.2310048554112885, 'sent': "See also  * National Register of Historic Places listings in Abbeville County, South Carolina  * Lake Russell  References   External links  *  16x16px Media related to Abbeville County, South Carolina at Wikimedia Commons  *  16x16px Geographic data related to Abbeville County, South Carolina at OpenStreetMap  *  Abbeville County's Official Website 

{'score': 0.33294892697514794, 'sent': 'History Both Abbeville County and the county seat, Abbeville, SC, get their name from the town of Abbeville, France.'}

{'prop': 'county', 'value': 'Abbeville County'}
{'score': 0.9084842179798058, 'sent': 'History Both Abbeville County and the county seat, Abbeville, SC, get their name from the town of Abbeville, France.'}

