# Probabilistic retrieval model
Compare the TF-IDF vector space model with Okapi BM25, with and without enriching the queries with additional terms.

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
import pymongo
from twitter import TwitterDataset, ENTITY, DOMAIN
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation

In [None]:
# Open the `tweets` collection of the `twitter` database (MongoClient is
# created with its default connection settings) and wrap it in the dataset
# helper used by the rest of the notebook.
db = pymongo.MongoClient()['twitter']['tweets']
tdata = TwitterDataset(db)

In [None]:
stopw = set(stopwords.words('english'))
tokenizer = TweetTokenizer()


def tokenize(text):
    """Return the lowercase tokens of ``text`` with noise tokens removed.

    A token is dropped when it is an English stop word, starts with
    ``http``, or is contained in ``string.punctuation``.
    """
    kept = []
    for word in tokenizer.tokenize(text.lower()):
        # `punctuation` is a str, so this is a substring test: single
        # punctuation characters are dropped, longer runs such as '...' are not.
        # NOTE(review): confirm that keeping multi-character runs is intended.
        if word in punctuation:
            continue
        if word in stopw or word.startswith('http'):
            continue
        kept.append(word)
    return kept

## Specificity score
$$
\delta(w) = p(w) \log \frac{p(w)}{q(w)}
$$

In [None]:
from collections import defaultdict

In [None]:
tweets = list(db.find())
# G: token -> number of occurrences over all tweets that carry annotations.
# Q: annotation name (domain or entity) -> token -> number of occurrences.
G = defaultdict(int)
Q = defaultdict(lambda: defaultdict(int))

for tweet in tqdm(tweets):
    try:
        text = tweet['text']
        annotations = tweet['context_annotations']
        tokens = tokenize(text)
        for token in tokens:
            G[token] += 1
            for annotation in annotations:
                # Domain and entity names share the same table; the domain
                # is counted first, then the entity.
                for level in ('domain', 'entity'):
                    Q[annotation[level]['name']][token] += 1
    except KeyError:
        # Tweets missing the text or the annotation fields are ignored.
        pass

In [None]:
def extend_query(query, top_k=5):
    """Return the ``top_k`` most specific terms for an annotation name.

    Each word seen in tweets annotated with ``query`` is scored with the
    specificity score ``p(w) * log(p(w) / q(w))``, where ``p(w)`` is the
    relative frequency of the word in ``Q[query]`` and ``q(w)`` its relative
    frequency in the background counts ``G``.

    Args:
        query: domain or entity name used as a key of ``Q``.
        top_k: maximum number of expansion terms to return.

    Returns:
        The words with the highest score, best first. An empty list when no
        counts exist for ``query``.
    """
    # Q is a defaultdict: indexing it with an unknown query would insert an
    # empty entry as a side effect, so read it without mutating it.
    query_counts = Q.get(query)
    if not query_counts:
        return []

    Qtot = sum(query_counts.values())
    Gtot = sum(G.values())
    terms = {}
    for word, count in query_counts.items():
        p_w = count / Qtot
        q_w = G[word] / Gtot
        terms[word] = p_w * np.log(p_w / q_w)
    # Stable sort: words with equal scores keep their insertion order.
    ranked = sorted(terms, key=terms.get, reverse=True)
    return ranked[:top_k]

## Binary Independence Model

In [None]:
tweets = list(db.find())
# Per-token document frequencies among relevant (R) and non-relevant (N)
# tweets. Every count starts at 1 (add-one smoothing), so a token never seen
# in one of the classes still gets a non-zero probability.
R = defaultdict(lambda: 1)
N = defaultdict(lambda: 1)

query = 'COVID-19'
# Class sizes used as denominators by p() and q(). Each tweet is counted in
# exactly one class. They start at 2 to match the add-one smoothing above:
# (df + 1) / (size + 2) always lies strictly between 0 and 1, which keeps the
# log-odds computed in bmi() finite.
NumR, NumN = 2, 2

for tweet in tqdm(tweets):
    try:
        text = tweet['text']
        annotations = tweet['context_annotations']
        # A tweet is relevant when the query names one of its annotated
        # domains or entities; any() stops at the first match.
        relevant = any(
            query in (annotation['domain']['name'], annotation['entity']['name'])
            for annotation in annotations
        )
    except KeyError:
        # Tweets without text or usable annotations cannot be labelled.
        continue

    # Binary model: a token counts once per tweet, hence the set.
    tokens = set(tokenize(text))
    if relevant:
        NumR += 1
        for token in tokens:
            R[token] += 1
    else:
        NumN += 1
        for token in tokens:
            N[token] += 1


In [None]:
def p(word):
    """Return the share of relevant tweets containing ``word`` (R[word] / NumR)."""
    return R[word] / NumR


def q(word):
    """Return the share of non-relevant tweets containing ``word`` (N[word] / NumN)."""
    return N[word] / NumN

In [None]:
def bmi(query_text, doc_text):
    """Score a document against a query with the Binary Independence Model.

    Only the terms shared by the tokenized query and document contribute.
    Each shared term adds the log odds ratio
    ``log(p * (1 - q) / (q * (1 - p)))``, with ``p`` and ``q`` taken from the
    module-level estimators ``p()`` and ``q()``.

    Returns:
        The sum of the per-term log odds ratios; ``0`` when the query and the
        document share no term.
    """
    shared_terms = set(tokenize(query_text)) & set(tokenize(doc_text))
    total = 0
    for term in shared_terms:
        rel, nonrel = p(term), q(term)
        odds_ratio = (rel * (1 - nonrel)) / (nonrel * (1 - rel))
        total += np.log(odds_ratio)
    return total

In [None]:
query_text = 'vaccine and pandemic'
# Score every tweet of the search base against the free-text query.
ranking = {
    tweet_id: bmi(query_text, tweet_text)
    for tweet_id, tweet_text in tdata.search_base
}

In [None]:
# Lookup table from tweet id to tweet text.
sb = dict(tdata.search_base)
# Keep the two best-scoring tweets (ties keep their original order).
answers = sorted(ranking.items(), key=lambda pair: pair[1], reverse=True)[:2]
for k, score in answers:
    print(sb[k])

## The Okapi bm25 system

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
# BM25 works on token lists: tokenize each text of the search base (the first
# element of each pair is not needed here) and build the index.
tokenized_corpus = [tokenize(doc_text) for _, doc_text in tdata.search_base]
bm25 = BM25Okapi(tokenized_corpus)

## The TfIdf IR system

In [None]:
from search import TfIdfSearchEngine

In [None]:
# Build the TF-IDF engine from the same search base and `tokenize` function
# that are used for the BM25 index.
E = TfIdfSearchEngine(tdata.search_base, tokenize)

## Queries

In [None]:
def get_query_environment(query, query_type=ENTITY, top_k=5):
    """Collect what is needed to evaluate one query.

    Args:
        query: annotation name to evaluate.
        query_type: passed through to ``tdata.ground_truth``.
        top_k: number of expansion terms requested from ``extend_query``.

    Returns:
        ``(extension, ground_truth)``: the expansion terms and whatever
        ``tdata.ground_truth`` returns for the query.
    """
    truth = tdata.ground_truth(query, query_type=query_type)
    return extend_query(query, top_k=top_k), truth

In [None]:
# Annotation names used as evaluation queries, grouped by annotation level.
entity_queries = [
    'Business & finance',
    'COVID-19',
    'Boris Johnson',
    'Food',
]
domain_queries = [
    'Politician',
    'TV Shows',
    'Athlete',
    'Sports Event',
]

## Evaluate

In [None]:
# Evaluate the last entity query with ten expansion terms.
query_type = ENTITY
query = entity_queries[-1]
extension, ground_truth = get_query_environment(query, query_type=query_type, top_k=10)
# Binary relevance label per search-base entry, in search-base order.
y_true = [int(i in ground_truth) for i, _ in tdata.search_base]

In [None]:
# Show the expansion terms and the size of the ground truth.
print(extension, len(ground_truth))

In [None]:
# Run the plain query and the query followed by its expansion terms.
tfidf_base = E.search(query)
tfidf_ext = E.search(f"{query} {' '.join(extension)}")

In [None]:
# One score slot per search-base entry; entries the engine did not return
# keep a score of 0. `i` is used as the index into the vector (presumably the
# document position - confirm against TfIdfSearchEngine.search).
tfidf_base_scores = np.zeros(len(tdata.search_base))
tfidf_ext_scores = np.zeros(len(tdata.search_base))
for hits, scores in ((tfidf_base, tfidf_base_scores), (tfidf_ext, tfidf_ext_scores)):
    for i, ti, score in hits:
        scores[i] = score

In [None]:
# BM25 scores for the plain query and for the query plus its expansion terms.
okapi_base_scores = bm25.get_scores(tokenize(query))
okapi_ext_scores = bm25.get_scores(tokenize(f"{query} {' '.join(extension)}"))

## Precision and recall curve

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plots(y_true, y_scores):
    """Return the interpolated precision-recall curve of one ranking.

    Args:
        y_true: binary relevance labels, one per document.
        y_scores: retrieval scores aligned with ``y_true``.

    Returns:
        ``(rec, iprec)``: two lists of equal length. ``rec`` holds the recall
        levels in ascending order and ``iprec[i]`` is the interpolated
        precision at ``rec[i]``, i.e. the highest precision reached at any
        recall level greater than or equal to ``rec[i]``.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    # precision[i] and recall[i] describe the same operating point, so the
    # two arrays must never be sorted independently of each other.
    # scikit-learn returns the points by decreasing recall; a running maximum
    # over precision therefore yields, for each point, the best precision
    # among all points with at least that recall.
    interpolated = np.maximum.accumulate(precision)
    # Reverse both arrays together so that recall runs from low to high.
    return recall[::-1].tolist(), interpolated[::-1].tolist()

In [None]:
# (label, score vector) pairs, one per system/query variant to compare.
experiments = list(zip(
    ('TfIdf Base', 'TfIdf Ext', 'Okapi Base', 'Okapi Ext'),
    (tfidf_base_scores, tfidf_ext_scores, okapi_base_scores, okapi_ext_scores),
))

In [None]:
# Draw one precision-recall curve per experiment on a single set of axes.
fig, ax = plt.subplots(figsize=(6, 6))
for elabel, escores in experiments:
    ax.plot(*plots(y_true, escores), label=elabel)
fig.tight_layout()
ax.legend()
plt.show()

## Maximum Likelihood for bigrams

In [None]:
import nltk

In [None]:
# U: token -> unigram count; B: first token -> second token -> bigram count.
U = defaultdict(int)
B = defaultdict(lambda: defaultdict(int))
for tweet_id, text in tdata.search_base:
    # Sentinel tokens mark the boundaries of each tweet.
    tokens = ['#START', *tokenize(text), '#END']
    for token in tokens:
        U[token] += 1
    # Consecutive token pairs.
    for a, b in zip(tokens, tokens[1:]):
        B[a][b] += 1

In [None]:
# Maximum-likelihood estimate of P('biden' | 'joe'): the bigram count divided
# by the count of the first word. U defaults to 0, so this raises
# ZeroDivisionError if 'joe' never occurs in the search base.
B['joe']['biden'] / U['joe']

In [None]:
# Number of times 'biden' is the last token of a tweet (followed by '#END').
B['biden']['#END']

In [None]:
# Maximum-likelihood distribution of the token that follows 'biden',
# printed from most to least likely.
probs = {
    second_word: freq / U['biden']
    for second_word, freq in B['biden'].items()
}
print(sorted(probs.items(), key=lambda item: item[1], reverse=True))