# setup
imports and examples

In [1]:
import query_processing.query_processing_spacy as qp

In [2]:
from itertools import chain
import re

In [3]:
# Sample query exposed by the qp module; the cell displays the entities found in it.
example = qp.example # Have Apple stocks risen 40 million and what about Google stock?
qp.get_ents(example)

['Apple', '40 million', 'Google']

In [4]:
# A "generated" statement and a "retrieved" statement that disagree with each other;
# both are reused as inputs by the demo cells further down.
generation_ex = 'Apple stocks have risen 35 million following the dissolution of Northwestern Mongolia'
retrieved_ex = 'Apple stocks have not changed significantly and are not related to Northwestern Mongolia.'

# general
General-purpose helper functions.

In [5]:
# returns commonalities and differences between two lists

def compare(gen, retr):
    """Split the items of two lists into shared and differing ones.

    Args:
        gen: items of the first (generated) list.
        retr: items of the second (retrieved) list.

    Returns:
        A ``(shared, diff)`` tuple. ``shared`` holds the items of ``gen``
        that also occur in ``retr``. ``diff`` holds the items of ``gen``
        missing from ``retr`` followed by the items of ``retr`` missing
        from ``gen``. Order and duplicates are preserved.
    """
    shared = []
    only_in_gen = []
    for item in gen:
        # Membership relies on ==, so unhashable items are fine too.
        if item in retr:
            shared.append(item)
        else:
            only_in_gen.append(item)

    only_in_retr = [item for item in retr if item not in gen]
    return shared, only_in_gen + only_in_retr

In [6]:
# groups items in a list together if they are in the groups list

def group_list(l, groups):
    """Regroup the items of ``l`` according to ``groups``.

    The items are joined with single spaces and the resulting string is
    handed to ``qp.split_preserve`` together with ``groups``; whatever that
    helper returns is passed back unchanged.
    """
    joined = ' '.join(l)
    return qp.split_preserve(joined, groups)

In [7]:
# Demo: lemmas of the generated example, regrouped using the entities found in the same sentence.
group_list(qp.get_lemmas(generation_ex), qp.get_ents(generation_ex))

['apple',
 'stock',
 'have',
 'rise',
 '35 million',
 'follow',
 'the',
 'dissolution',
 'of',
 'Northwestern Mongolia']

In [8]:
def clean_list(l):
    """Return a new list with ``qp.clean`` applied to every item of ``l``."""
    cleaned = []
    for item in l:
        cleaned.append(qp.clean(item))
    return cleaned

# entity and word comparison

In [9]:
def compare_ents(gen, retr):
    """Compare the entities that ``qp.get_ents`` finds in two texts.

    Args:
        gen: the generated text.
        retr: the retrieved text.

    Returns:
        The ``(shared, diff)`` pair produced by ``compare`` for the two
        entity lists.
    """
    gen_ents = qp.get_ents(gen)
    retr_ents = qp.get_ents(retr)
    return compare(gen_ents, retr_ents)

In [10]:
# Demo: entities shared by, and differing between, the two example sentences.
compare_ents(generation_ex, retrieved_ex)

(['Apple', 'Northwestern Mongolia'], ['35 million'])

In [11]:
# compares query transforms of two strings
# aka compares main points

def compare_transform(gen, retr):
    """Compare the query transforms ("main points") of two texts.

    Each text is split with ``qp.query_split``; every part is passed through
    ``qp.query_extract`` and the results are flattened into one list. The
    flattened items are then regrouped with ``group_list`` using the cleaned
    entities of the same text, and the two regrouped lists are compared.

    Args:
        gen: the generated text.
        retr: the retrieved text.

    Returns:
        The ``(shared, diff)`` pair produced by ``compare``.
    """
    def main_points(text):
        # Flatten the per-part extractions into a single list.
        points = []
        for part in qp.query_split(text):
            points.extend(qp.query_extract(part))
        return points

    gen_points = main_points(gen)
    retr_points = main_points(retr)

    # Group lemmas into entities prior to comparison.
    gen_grouped = group_list(gen_points, clean_list(qp.get_ents(gen)))
    retr_grouped = group_list(retr_points, clean_list(qp.get_ents(retr)))

    return compare(gen_grouped, retr_grouped)

In [12]:
# Demo: shared vs. differing main points of the two example sentences.
compare_transform(generation_ex, retrieved_ex)

(['apple', 'stock', 'northwestern mongolia'],
 ['rise',
  '35 million',
  'follow',
  'dissolution',
  'not',
  'change',
  'significantly',
  'not',
  'relate'])

# numbers comparison

In [13]:
# gets numbers + units mentioned in a text

def get_figures(text):
    """Extract the numbers mentioned in a text together with their units.

    The text is first reduced to the items returned by
    ``qp.filter_include(text, [], ['NOUN', 'PROPN', 'NUM'])`` (presumably the
    tokens with those part-of-speech tags), joined with single spaces. Two
    regular expressions are then applied to the reduced text.

    Returns:
        A ``(short, full)`` tuple of lists of ``(number, unit)`` string
        pairs. In ``short`` the unit is only the run of letters, ``%`` or
        ``°`` right after the number; in ``full`` it is everything up to the
        next number or the end of the text. Callers must index the result
        to pick the variant they need.
    """
    short_pattern = r'\b([-+]?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*([a-zA-Z%°]+)'
    full_pattern = r'(\d+\.?\d*(?:e[+-]?\d+)?)\s*([^\d].*?)(?=\s*\d+|$)'

    kept = qp.filter_include(text, [], ['NOUN', 'PROPN', 'NUM'])
    reduced = ' '.join(kept)

    short_pairs = re.findall(short_pattern, reduced)
    full_pairs = re.findall(full_pattern, reduced)
    return short_pairs, full_pairs

In [14]:
# Demo: short and full unit variants for a sentence mixing integers, a decimal and exponent notation.
get_figures('12 cows went to 18 grocery stores to buy 4.8 pounds of beef and 3e12 bottles of milk.')

([('12', 'cows'), ('18', 'grocery'), ('4.8', 'pounds'), ('3e12', 'bottles')],
 [('12', 'cows'),
  ('18', 'grocery stores'),
  ('4.8', 'pounds beef'),
  ('3e12', 'bottles milk')])

In [15]:
def compare_numbers(gen, retr):
    """Compare the numeric values of two lists of ``(number, unit)`` pairs.

    Args:
        gen: figures from the generated text, e.g. one of the lists
            returned by ``get_figures``.
        retr: figures from the retrieved text, in the same format.

    Returns:
        The ``(shared, diff)`` pair produced by ``compare`` on the numbers
        converted to ``float``.

    Raises:
        ValueError: if a number cannot be converted to ``float``.
    """
    def to_float(figure):
        raw = figure[0]
        # The short pattern of get_figures accepts thousands separators
        # ("1,000"), which float() rejects, so drop them before converting.
        if isinstance(raw, str):
            raw = raw.replace(',', '')
        return float(raw)

    gen_nums = [to_float(figure) for figure in gen]
    retr_nums = [to_float(figure) for figure in retr]
    return compare(gen_nums, retr_nums)

In [16]:
# Demo: compare only the numeric parts; [0] selects the short-unit list of get_figures.
compare_numbers(get_figures('apple loses 45 billion')[0],
                get_figures('google shrinks 12.2%')[0])

([], [45.0, 12.2])

In [17]:
def compare_units(gen, retr):
    """Compare the units of two lists of ``(number, unit)`` pairs.

    Args:
        gen: figures from the generated text.
        retr: figures from the retrieved text.

    Returns:
        The ``(shared, diff)`` pair produced by ``compare`` on the second
        element of every pair.
    """
    def units_of(figures):
        return [figure[1] for figure in figures]

    gen_units = units_of(gen)
    retr_units = units_of(retr)
    return compare(gen_units, retr_units)

In [18]:
# Demo: compare only the unit parts; [0] selects the short-unit list of get_figures.
compare_units(get_figures('apple loses 45 billion')[0],
              get_figures('google shrinks 12.2%')[0])

([], ['billion', '%'])

In [19]:
# Demo: compare whole (number, unit) tuples from the short-unit lists.
compare(get_figures('apple loses 45 billion')[0],
        get_figures('google shrinks 12.2%')[0])

([], [('45', 'billion'), ('12.2', '%')])

# metrics
output interpretation

In [20]:
# Keep the main-point comparison of the two example sentences for the metric cells below.
compare_ex = compare_transform(generation_ex, retrieved_ex)

In [21]:
def print_comp(comp):
    """Print a ``(shared, diff)`` comparison, one labelled line per part.

    Args:
        comp: a sequence whose first item holds the shared elements and
            whose second item holds the differing ones.
    """
    labels = ('shared:', 'different:')
    for position, label in enumerate(labels):
        print(label, comp[position])

In [22]:
# TODO: placeholder - replace with a real scoring rule

def score(shared_pct):
    """Return a verdict for a shared-percentage value.

    Placeholder implementation: ``shared_pct`` is ignored and the constant
    string ``'idk'`` is returned.
    """
    verdict = 'idk'
    return verdict

In [23]:
def numerical_comp(comp):
    """Summarise a ``(shared, diff)`` comparison as counts and percentages.

    Args:
        comp: a sequence whose first item holds the shared elements and
            whose second item holds the differing ones.

    Returns:
        A dict with the keys ``'shared'`` and ``'different'`` (item counts),
        ``'proportion'`` (shared / different, or 1 when nothing differs),
        ``'shared %'`` and ``'different %'`` (shares of the total, or 0 when
        both parts are empty) and ``'score'`` (``score`` applied to the
        shared percentage).
    """
    shared_n = len(comp[0])
    diff_n = len(comp[1])
    total = shared_n + diff_n

    # Guard against division by zero when both parts are empty.
    if total:
        shared_pct = shared_n / total * 100
        diff_pct = diff_n / total * 100
    else:
        shared_pct = 0
        diff_pct = 0

    # The ratio falls back to 1 when there are no differing items.
    if diff_n:
        ratio = shared_n / diff_n
    else:
        ratio = 1

    return {
        'shared': shared_n,
        'different': diff_n,
        'proportion': ratio,
        'shared %': shared_pct,
        'different %': diff_pct,
        'score': score(shared_pct),
    }

In [24]:
# Demo: summary metrics for the stored example comparison.
numerical_comp(compare_ex)

{'shared': 3,
 'different': 9,
 'proportion': 0.3333333333333333,
 'shared %': 25.0,
 'different %': 75.0,
 'score': 'idk'}