# Text retrieval

In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
import json

In [3]:
# Load the query/document collection from disk.
# JSON text is UTF-8, so the encoding is pinned instead of being left to the
# platform default (which is not UTF-8 on every system).
with open('data/queries.json', 'r', encoding='utf-8') as q_file:
    q = json.load(q_file)

In [4]:
# Text of the first query, and the id -> text mapping of every document.
query = q['queries'][0]['query']
docs = q['docs']
# Parallel lists in the mapping's iteration order:
# corpus[i] is the text of the document whose id is doc_ids[i].
doc_ids = list(docs.keys())
corpus = list(docs.values())

In [5]:
# Show the selected query text and the number of documents in the collection.
print(query, len(docs))

World War II 7281


In [6]:
# First ten document ids listed under the first query
# (the Evaluation section uses this list as the expected answers).
q['queries'][0]['docs'][:10]

[29597751,
 25628716,
 352387,
 58343348,
 12816539,
 47104122,
 979204,
 55238997,
 19067213,
 20844834]

## Corpus pre-processing

In [7]:
import spacy

In [8]:
# Load spaCy's "en_core_web_sm" pipeline; `tokenize` below reads its
# sentence boundaries and part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [9]:
def tokenize(text):
    """Return the lower-cased content words of ``text``.

    The text is run through the module-level spaCy pipeline ``nlp`` and walked
    sentence by sentence. A token is kept only when its coarse part-of-speech
    tag is NOUN, ADJ, ADV, VERB or PROPN; kept tokens appear in document order.
    """
    content_tags = ['NOUN', 'ADJ', 'ADV', 'VERB', 'PROPN']
    return [
        word.text.lower()
        for sent in nlp(text).sents
        for word in sent
        if word.pos_ in content_tags
    ]

In [10]:
# Tokenize every document with spaCy (slow: one pipeline call per document).
# The result is kept under its own name so that the count matrix `C` built in
# the Indexing section does not overwrite it.
# NOTE(review): the vectorizer in the Indexing section is fit on the raw
# `corpus` strings, so these tokens are not consumed by the cells visible here.
corpus_tokens = [tokenize(text) for text in tqdm_notebook(corpus)]

HBox(children=(IntProgress(value=0, max=7281), HTML(value='')))




## Indexing

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from collections import defaultdict

In [12]:
# Number of documents about to be indexed.
len(corpus)

7281

In [13]:
# Build the document-term count matrix: one row per entry of `corpus`,
# one column per vocabulary term learned by the vectorizer.
# NOTE(review): the vectorizer is fit on the raw `corpus` strings with its
# default tokenization; the output of `tokenize` from the pre-processing
# section is not used here.
counter = CountVectorizer()
C = counter.fit_transform(corpus)

In [None]:
# Shape of the count matrix: (documents, vocabulary terms).
C.shape

In [None]:
# Peek at ten (term, column index) pairs of the fitted vocabulary.
list(counter.vocabulary_.items())[:10]

In [None]:
# Inverse vocabulary: column index -> term.
rev_index = {pos: word for word, pos in counter.vocabulary_.items()}

In [None]:
# Column index assigned to the term 'war'.
counter.vocabulary_['war']

In [None]:
# Term stored at column 48129.
# NOTE(review): hard-coded column index; it only identifies the intended term
# for the vocabulary fitted on this exact corpus. Consider deriving it from
# counter.vocabulary_ instead.
rev_index[48129]

In [None]:
# Raw counts of column 48062 in every document, as a dense column vector.
# NOTE(review): hard-coded vocabulary index (valid only for this fitted
# vocabulary), and the displayed array has one row per document.
C[:,48062].toarray()

In [None]:
# Re-weight the raw counts with TF-IDF; X has the same shape as C.
tfidf = TfidfTransformer(use_idf=True)
X = tfidf.fit_transform(C)

In [None]:
# Shape of the TF-IDF matrix: (documents, vocabulary terms).
X.shape

In [None]:
# TF-IDF weights of columns 48129 and 48062 for every document, densified.
# NOTE(review): hard-coded vocabulary indices, valid only for this fitted vocabulary.
X[:,[48129, 48062]].toarray()

### Bag of words

In [None]:
# Row position of document '979204' in doc_ids (and therefore in corpus, C and X),
# followed by its raw text.
doc = doc_ids.index('979204')
print(corpus[doc])

In [None]:
# Sparse count row of the selected document.
print(C[doc])

In [None]:
# For each term with a non-zero TF-IDF weight in the selected document, record
# its raw count ('tf') and its TF-IDF weight ('tfidf'). In the resulting frame
# the terms are columns and 'tf' / 'tfidf' are the two rows.
bow = {
    rev_index[col]: {'tf': C[doc,col], 'tfidf': X[doc, col]}
    for col in np.nonzero(X[doc,:])[1]
}
BOW = pd.DataFrame(bow)

In [None]:
# Ten terms of the selected document with the highest TF-IDF weight
# (the transpose puts terms on the rows).
BOW.T.sort_values(by='tfidf', ascending=False).head(10)

## Visual example

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Fit a 2-component PCA on the densified TF-IDF matrix, used only for plotting.
# random_state is pinned because PCA's automatic solver selection can choose a
# randomized SVD for large inputs, which would otherwise let the projection
# (and the saved figure) differ from run to run.
V = PCA(n_components=2, random_state=0).fit(X.toarray())

In [None]:
# Project every document onto the two fitted components.
# NOTE(review): X.toarray() builds a full dense copy of the TF-IDF matrix here,
# as the fitting cell also does.
Xv = V.transform(X.toarray())

In [None]:
# Row positions of the documents listed for the first query (`wwii`) and of
# all remaining documents (`other`).
# A position table and a set replace the repeated linear scans of `doc_ids`
# (list.index) and of `wwii` (`in` on a list), which made this cell quadratic.
doc_position = {doc_id: i for i, doc_id in enumerate(doc_ids)}
wwii = [doc_position[str(x)] for x in q['queries'][0]['docs']]
wwii_rows = set(wwii)
other = [i for i in range(len(doc_ids)) if i not in wwii_rows]

In [None]:
# Free-text query for the demo, vectorized with the same fitted counter and
# TF-IDF transformer as the corpus, then projected with the fitted PCA.
# Note: this rebinds `query`, which earlier held the first query of the collection.
query = 'Imperial Japanese Army gun aircraft'
q_vector = tfidf.transform(counter.transform([query]))
qv = V.transform(q_vector.toarray())
qv

In [None]:
# 2-D PCA view of the collection: documents listed for the first query versus
# all other documents, with the projected query drawn as a large black marker.
fig, ax = plt.subplots(figsize=(16, 6))
ax.scatter(Xv[other,0], Xv[other,1], label='Other docs', alpha=0.4, c='#cccccc')
ax.scatter(Xv[wwii,0], Xv[wwii,1], label='WWII docs', alpha=0.4)
ax.scatter(qv[:,0], qv[:,1], label='Query: {}'.format(query), s=200, c='#000000')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
plt.tight_layout()
plt.legend()
# Save into a folder relative to the working directory rather than a
# user-specific absolute path, so the cell also runs on other machines;
# the folder is created on first use.
os.makedirs('imgs', exist_ok=True)
plt.savefig(os.path.join('imgs', 'wwii.png'))
plt.show()

## Matching

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between the query vector and every document row of X.
match = cosine_similarity(q_vector, X)

In [None]:
# One row (the single query) by one column per document.
match.shape

In [None]:
# (document id, similarity) pairs ordered from the most to the least similar
# document; documents with equal scores keep their doc_ids order.
ranking = [
    (doc_ids[pos], score)
    for pos, score in sorted(enumerate(match[0]), key=lambda pair: pair[1], reverse=True)
]

In [None]:
# Top-10 hits: id, similarity rounded to two decimals, first 80 characters of the text.
# The loop variable is deliberately not named `doc`: that name holds the integer
# row index chosen in the Bag-of-words section, and overwriting it with a string
# id would break those cells if they were re-run afterwards.
for hit_id, cos in ranking[:10]:
    print(hit_id, '\t', round(cos, 2), '\t', docs[hit_id][:80])

## Evaluation

In [None]:
# Turn the similarity scores into a binary retrieval decision and line it up
# with the documents listed for the first query.
#   answers   - ids of the documents scoring above the threshold, best first
#   doc_score - id -> similarity for those same documents
#   E, R, S   - one entry per document, in doc_ids order: expected label,
#               retrieved label, and similarity (0 when not retrieved)
min_score = 0.03  # a document counts as retrieved only above this similarity
# Sort and filter once; both `answers` and `doc_score` derive from this list.
retrieved = [
    (doc_ids[i], x)
    for i, x in sorted(enumerate(match[0]), key=lambda x: -x[1])
    if x > min_score
]
answers = [doc_id for doc_id, score in retrieved]
doc_score = dict(retrieved)
# Set membership is O(1); the original scanned a list for every document.
expected_ids = set(q['queries'][0]['docs'])
E, R, S = [], [], []
for doc_id in doc_ids:
    if doc_id in doc_score:
        R.append(1)
        S.append(doc_score[doc_id])
    else:
        R.append(0)
        S.append(0)
    E.append(1 if int(doc_id) in expected_ids else 0)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, precision_recall_curve, roc_curve

In [None]:
# Per-class precision / recall / F1, with E as the true labels and R as the predictions.
print(classification_report(E, R))

In [None]:
# Confusion matrix with E as the true labels and R as the predicted labels.
cm = confusion_matrix(E, R)

In [None]:
# Wrap the matrix for plotting; the two classes are labelled 0 and 1.
d = ConfusionMatrixDisplay(cm, display_labels=[0, 1])

In [None]:
# Draw the confusion matrix with integer cell values and save it as a PDF.
fig, ax = plt.subplots(figsize=(8, 8))
d.plot(ax=ax, cmap='Blues', values_format='d')
plt.tight_layout()
# Save into a folder relative to the working directory rather than a
# user-specific absolute path, so the cell also runs on other machines;
# the folder is created on first use.
os.makedirs('imgs', exist_ok=True)
plt.savefig(os.path.join('imgs', 'cm.pdf'))
plt.show()

In [None]:
# Precision/recall pairs obtained by sweeping a threshold over the scores S,
# with E as the true labels.
# NOTE(review): S is 0 for every document below the retrieval threshold, so
# those documents are all tied at the lowest score.
precision, recall, thresholds = precision_recall_curve(E, S)

In [None]:
# False/true positive rates obtained by sweeping a threshold over the scores S,
# with E as the true labels.
# Note: this rebinds `thresholds`, replacing the values returned by
# precision_recall_curve.
fpr, tpr, thresholds = roc_curve(E, S)

In [None]:
# Precision-recall curve (left panel) and ROC curve (right panel), saved as a PDF.
fig, ax = plt.subplots(figsize=(12, 6), ncols=2)
ax[0].plot(recall, precision)
ax[1].plot(fpr, tpr)
ax[0].set_xlabel('Recall')
ax[0].set_ylabel('Precision')
ax[0].set_title('Precision-Recall curve')
ax[1].set_xlabel('False positive rate')
ax[1].set_ylabel('True positive rate')
ax[1].set_title('ROC curve')
plt.tight_layout()
# Save into a folder relative to the working directory rather than a
# user-specific absolute path, so the cell also runs on other machines;
# the folder is created on first use.
os.makedirs('imgs', exist_ok=True)
plt.savefig(os.path.join('imgs', 'prcurve.pdf'))
plt.show()