# Terminological enrichment
The goal of terminological enrichment is to associate a set of known terms with new terms that are semantically similar. This can be used for expanding the queries.

## The IR system with relevance feedback

In [63]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [64]:
import pymongo
from twitter import TwitterDataset, ENTITY, DOMAIN
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation

In [65]:
db = pymongo.MongoClient()['twitter']['tweets']
tdata = TwitterDataset(db)

In [66]:
estats = tdata.entity_queries_stats
dstats = tdata.domain_queries_stats

In [90]:
estats.head(20)

Unnamed: 0_level_0,domain,tweet,query
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
781974596148793345,4427,4427,Business & finance
1220701888179359745,3985,3985,COVID-19
1066114840832356353,3074,3074,The Telegraph
1113138554496942080,3071,3071,The Daily Telegraph
1113097508572426242,2983,2983,Daily Mirror
1066122339568386048,2819,2819,The Guardian
857212166100754432,1964,1964,Boris Johnson
825047692124442624,1661,1661,Food
864931126132985856,1198,1198,Vladimir Putin
826817907946450944,714,714,Food Blogs


In [67]:
stopw = set(stopwords.words('english'))
tokenizer = TweetTokenizer()
tokenize = lambda text: [word for word in tokenizer.tokenize(text.lower()) 
                         if word not in punctuation and word not in stopw and not word.startswith('http')] 

### Naive search engine

In [68]:
from search import TfIdfSearchEngine

In [69]:
E = TfIdfSearchEngine(tdata.search_base, tokenize)

In [76]:
query = 'COVID-19'
ground_truth = tdata.ground_truth(query, query_type=ENTITY)
ranking = E.search('COVID-19 omicron vaccine pandemic')

In [87]:
tp, fp, fn, tn = TfIdfSearchEngine.feedback(ranking, 
                                            ground_truth, top_k=4500)

In [88]:
print(len(tp), len(fp), len(fn), len(tn))
print('precision', len(tp) / (len(tp) + len(fp)))
print('recall', len(tp) / (len(tp) + len(fn)))

3209 1291 771 17079
precision 0.7131111111111111
recall 0.8062814070351759


## Global methods
Do not use releance feedback but global external sources

### Wordnet

In [2]:
from nltk.corpus import wordnet as wn

In [8]:
syns = wn.synsets('football', pos=wn.NOUN)
for syn in syns:
    lemmas = syn.lemmas()
    print(syn.definition())
    print(syn.examples())
    for lemma in lemmas:
        print(lemma.name())

any of various games played with a ball (round or oval) in which two teams try to kick or carry or propel the ball into each other's goal
[]
football
football_game
the inflated oblong ball used in playing American football
[]
football


### Wikidata
Check also [ConceptNet](https://conceptnet.io/)

In [9]:
import requests

In [10]:
endpoint = 'https://query.wikidata.org/sparql'

In [13]:
sparql = """
select ?lexemeId ?lemma WHERE {
  ?lexemeId dct:language wd:Q1860;
            wikibase:lemma ?lemma.
  FILTER (regex(?lemma, '^pandemic.*'))
}
"""

In [14]:
r = requests.get(endpoint, params={'query' : sparql}, 
                 headers={'Accept' : 'application/sparql-results+json'})
data = r.json()
statements = data['results']['bindings']

In [15]:
statements

[{'lexemeId': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/L196546'},
  'lemma': {'xml:lang': 'en', 'type': 'literal', 'value': 'pandemically'}},
 {'lexemeId': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/L227058'},
  'lemma': {'xml:lang': 'en', 'type': 'literal', 'value': 'pandemic'}},
 {'lexemeId': {'type': 'uri',
   'value': 'http://www.wikidata.org/entity/L299289'},
  'lemma': {'xml:lang': 'en', 'type': 'literal', 'value': 'pandemic'}}]

## Local methods
Exploits relevance feedback in order to learn new terminology

### Specificity score
$$
\delta(w) = p(w) \log \frac{p(w)}{q(w)}
$$

In [93]:
import nltk
from collections import defaultdict

In [94]:
def ngram(tokens):
    ng = list(nltk.ngrams(tokens, n=2))
    skip = []
    for i, (a, b) in enumerate(ng):
        skip.append((a, b))
        if i < len(ng) - 1:
            skip.append((a, ng[i+1][1]))
    return skip

In [95]:
tweets = list(db.find())

In [96]:
G = defaultdict(lambda: 0)
D = defaultdict(lambda: defaultdict(lambda: 0))
E = defaultdict(lambda: defaultdict(lambda: 0))

for tweet in tqdm(tweets):
    try:
        text = tweet['text']
        annotations = tweet['context_annotations']
        tokens = tokenize(text)
        ngr = ngram(tokens)
        for token in tokens:
            G[token] += 1
            for annotation in annotations:
                domain_name = annotation['domain']['name']
                D[domain_name][token] += 1
                entity_name = annotation['entity']['name']
                E[entity_name][token] += 1
        for (a, b) in ngr:
            bgram_key = "{}_{}".format(a, b)
            G[bgram_key] += 1
            for annotation in annotations:
                domain_name = annotation['domain']['name']
                D[domain_name][bgram_key] += 1
                entity_name = annotation['entity']['name']
                E[entity_name][bgram_key] += 1
    except KeyError:
        pass


  0%|          | 0/33053 [00:00<?, ?it/s]

In [101]:
dstats.head(20)

Unnamed: 0_level_0,tweet,query
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
47,10721,Brand
10,6368,Person
65,5525,Interests and Hobbies Vertical
123,3980,Ongoing News Story
45,3331,Brand Vertical
35,2961,Politician
66,2681,Interests and Hobbies Category
67,2573,Interests and Hobbies
3,2042,TV Shows
46,1988,Brand Category


In [102]:
entity = 'Joe Biden'
domain = 'Sport'

In [103]:
Etot = sum(E[entity].values())
Dtot = sum(D[domain].values())
Gtot = sum(G.values())
terms = {}
for word, count in D[domain].items():
    p_w = count / Dtot
    q_w = G[word] / Gtot
    terms[word] = p_w * np.log(p_w / q_w)

In [104]:
R = pd.Series(terms)

In [105]:
R.sort_values(ascending=False).head(20)

league            0.021281
premier_league    0.013681
premier           0.013654
hamilton          0.010031
lewis             0.009412
lewis_hamilton    0.009336
v                 0.007966
liverpool         0.007001
f1                0.005917
rugby             0.005907
max               0.005632
–_live            0.005488
rt                0.005228
league_–          0.005218
league_live       0.005139
manchester        0.005111
tottenham         0.004991
verstappen        0.004958
klopp             0.004860
champions         0.004683
dtype: float64

### Rocchio