# A primer on text processing for Information Retrieval

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Build up a simple dataset to work with

In [None]:
import pymongo
from IPython.display import display

In [None]:
# Open a client with the driver's default connection settings, then select
# the 'nyt' database and its 'nyt_key' collection used by the cells below.
db = pymongo.MongoClient().get_database('nyt')
data = db.get_collection('nyt_key')

### Use keywords as queries

In [None]:
# Aggregation stages used to list the most frequent keyword values.
# p: expose `keywords.value` as `keyword` next to abstract/text/text_id.
p = {'$project': dict(keyword='$keywords.value', abstract=1, text=1, text_id=1)}
# g: one group per distinct keyword, counting one unit per record.
g = {'$group': dict(_id='$keyword', count={'$sum': 1})}
# s: largest counts first.
s = {'$sort': dict(count=-1)}
# l: keep the top ten groups only.
l = {'$limit': 10}

pipeline = [p, g, s, l]
for record in data.aggregate(pipeline):
    print(record)

In [None]:
# Render up to ten raw records to inspect their structure.
preview = data.find().limit(10)
for record in preview:
    display(record)

# Inverted index

```
w1: [list of docs],
w2: [list of docs]
```

$q$ (e.g., `CIVIL SERVICE`)

`q: [list of words] [civil, service]`

Query strategy: $\sigma(q, d) \rightarrow Q \cap D$

### issues
1. what is a word?
2. do words have the same relevance for docs?
3. words are too diverse

### Notes to further develop
1. tokenization
2. normalization (lemmatization, stemming, bi-/tri-grams and, in general, compound terms)
3. weighting

In [None]:
from collections import defaultdict

In [None]:
# Inverted index: maps a token to the list of ids of the documents that
# contain it. Using `list` as the factory is the idiomatic spelling of
# `lambda: []` and, unlike a lambda, keeps the index picklable.
I = defaultdict(list)

In [None]:
# Populate the inverted index from a sample of up to ten records.
sample = data.find().limit(10)
for record in sample:
    # A document is its body text followed by its abstract.
    text = ". ".join((record['text'], record['abstract']))
    # Naive tokenization: lowercase, then split on whitespace.
    tokens = text.lower().split()
    doc_id = record['text_id']
    # Deduplicate so each document is posted at most once per token.
    for token in set(tokens):
        I[token].append(doc_id)

In [None]:
# Example query; its tokens are the lowercased, whitespace-separated words.
q = 'THE TAMMANY HALL'
Q = [term.lower() for term in q.split()]

In [None]:
# Display the tokenized query.
Q

In [None]:
# Score accumulator: document id -> number of postings matched by the query
# (one point each time a query token's posting list contains the document).
A = defaultdict(int)
for qtoken in Q:
    # Look the token up with .get(): indexing a defaultdict with a missing
    # key would insert an empty posting list, so merely running a query
    # would add every unknown query token to the index.
    docs = I.get(qtoken, [])
    for d in docs:
        A[d] += 1

In [None]:
# Display the per-document match counts for the query.
A