# A primer on text processing for Information Retrieval

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Build up a simple dataset to work with

In [2]:
import pymongo
from IPython.display import display

In [3]:
db = pymongo.MongoClient()['nyt']
data = db['nyt_key']

### Use keywords as queries

In [4]:
for record in data.find().limit(10):
    display(record)

{'_id': ObjectId('5b000b6635f44bd810080ac6'),
 'text_id': '4fc0279345c1498b0d18ccaf',
 'headline': {'main': 'DISFRANCHISE ALL UNFIT'},
 'text': 'DISFRANCHISE ALL UNFIT',
 'abstract': 'Voters; Proposition to Disfranchise All Unfit; "Post Natus" Letter to Editor',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9504EFD9133EE333A25750C2A9649D946197D6CF',
 'keywords': {'name': 'subject', 'value': 'POLITICAL'},
 'pub_date': datetime.datetime(1900, 12, 23, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080ac7'),
 'text_id': '4fc0279345c1498b0d18cce8',
 'headline': {'main': "CITY'S REGISTERED VOTERS; The Enrollment Books Show a Total of 642,034, Decreases in Tammany Strongholds -- Democrats Say These Losses Will Be Made Good Up-Town."},
 'text': "CITY'S REGISTERED VOTERS; The Enrollment Books Show a Total of 642,034, Decreases in Tammany Strongholds -- Democrats Say These Losses Will Be Made Good Up-Town.",
 'abstract': 'Registration',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9802E5D6173FE433A25751C2A9669D946197D6CF',
 'keywords': {'name': 'subject', 'value': 'BROOKLYN (N. Y.) POLITICS'},
 'pub_date': datetime.datetime(1900, 10, 22, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080ac8'),
 'text_id': '4fc0279345c1498b0d18cce8',
 'headline': {'main': "CITY'S REGISTERED VOTERS; The Enrollment Books Show a Total of 642,034, Decreases in Tammany Strongholds -- Democrats Say These Losses Will Be Made Good Up-Town."},
 'text': "CITY'S REGISTERED VOTERS; The Enrollment Books Show a Total of 642,034, Decreases in Tammany Strongholds -- Democrats Say These Losses Will Be Made Good Up-Town.",
 'abstract': 'Registration',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9802E5D6173FE433A25751C2A9669D946197D6CF',
 'keywords': {'name': 'subject', 'value': 'NEW YORK CITY POLITICS'},
 'pub_date': datetime.datetime(1900, 10, 22, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080ac9'),
 'text_id': '4fc0279345c1498b0d18ccfc',
 'headline': {'main': 'SENAT0R HANNA PLEASED; Comments on China News and the Anti-Imperialists. Attempt to Establish Connection Between Philippine Troubles and the Boxers He Calls Idiocy.'},
 'text': 'SENAT0R HANNA PLEASED; Comments on China News and the Anti-Imperialists. Attempt to Establish Connection Between Philippine Troubles and the Boxers He Calls Idiocy.',
 'abstract': 'United States Commission; President John R. Proctor Declares that the Public Indorsement Grows in Strength',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9E06E0DE173FE433A25752C2A96E9C946197D6CF',
 'keywords': {'name': 'subject', 'value': 'CIVIL SERVICE'},
 'pub_date': datetime.datetime(1900, 8, 21, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080aca'),
 'text_id': '4fc0279345c1498b0d18cd0d',
 'headline': {'main': "CHAIRMAN HANNA'S TALKS; Gov. Roosevelt and Senator Platt Confer with Him. Denies that He Criticised the Gover- nor's St. Paul Speech -- Senator Platt on Advisory Board."},
 'text': "CHAIRMAN HANNA'S TALKS; Gov. Roosevelt and Senator Platt Confer with Him. Denies that He Criticised the Gover- nor's St. Paul Speech -- Senator Platt on Advisory Board.",
 'abstract': 'Conference with Republican Leaders',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9903E4D6113BEE33A25752C3A9619C946197D6CF',
 'keywords': {'name': 'persons', 'value': 'HANNA, SENATOR MARK A.'},
 'pub_date': datetime.datetime(1900, 7, 31, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080acb'),
 'text_id': '4fc0279345c1498b0d18cd2d',
 'headline': {'main': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans."},
 'text': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans.",
 'abstract': 'State and Congressional Candidates',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9401E6D91039E733A25750C0A96F9C946197D6CF',
 'keywords': {'name': 'organizations', 'value': 'TAMMANY HALL'},
 'pub_date': datetime.datetime(1900, 9, 3, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080acc'),
 'text_id': '4fc0279345c1498b0d18cd2d',
 'headline': {'main': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans."},
 'text': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans.",
 'abstract': 'State and Congressional Candidates',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9401E6D91039E733A25750C0A96F9C946197D6CF',
 'keywords': {'name': 'subject', 'value': 'NEW YORK CITY POLITICS'},
 'pub_date': datetime.datetime(1900, 9, 3, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080acd'),
 'text_id': '4fc0279345c1498b0d18cd2d',
 'headline': {'main': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans."},
 'text': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans.",
 'abstract': 'State and Congressional Candidates',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9401E6D91039E733A25750C0A96F9C946197D6CF',
 'keywords': {'name': 'subject', 'value': 'NEW YORK STATE POLITICS'},
 'pub_date': datetime.datetime(1900, 9, 3, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080ace'),
 'text_id': '4fc0279345c1498b0d18cd2d',
 'headline': {'main': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans."},
 'text': "THE PARTY NOMINATIONS; Leaders' Choice of Congressmen and State Legislators. THE EDICT OF MR. CROKER He Will Have None But Men Who Are Debaters -- Hopes of the Republicans.",
 'abstract': 'State and Congressional Candidates',
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9401E6D91039E733A25750C0A96F9C946197D6CF',
 'keywords': {'name': 'subject', 'value': 'REPUBLICAN'},
 'pub_date': datetime.datetime(1900, 9, 3, 0, 0)}

{'_id': ObjectId('5b000b6635f44bd810080acf'),
 'text_id': '4fc0279445c1498b0d18cd61',
 'headline': {'main': "AMBASSADOR WHITE ON GERMANY'S ATTITUDE; Has No Idea, He Thinks, of Assisting in Partition of China. BETTER FEELING FOR AMERICA Some Distrust of Russia -- The Peace Conference, Mr. White Says, Accomplished More than Was Expected."},
 'text': "AMBASSADOR WHITE ON GERMANY'S ATTITUDE; Has No Idea, He Thinks, of Assisting in Partition of China. BETTER FEELING FOR AMERICA Some Distrust of Russia -- The Peace Conference, Mr. White Says, Accomplished More than Was Expected.",
 'abstract': "Arrival from Europe; Interview on China Situation and Kaiser's Speech",
 'web_url': 'http://query.nytimes.com/gst/abstract.html?res=9B05EEDA143DE433A25754C0A96E9C946197D6CF',
 'keywords': {'name': 'persons', 'value': 'WHITE, ANDREW D.'},
 'pub_date': datetime.datetime(1900, 8, 7, 0, 0)}

# Inverted index

```
w1: [list of docs],
w2: [list of docs]
```

$q$ (e.g., `CIVIL SERVICE`)

`q: [list of words] [civil, service]`

Query strategy: $\sigma(q, d) \rightarrow Q \cap D$

### issues
1. what is a word?
2. do words have the same relevance for docs?
3. words are too much diverse

### Notes to further develop
1. tokenization
2. normalization (lemmatization, stemming, bi-/tri-grams and in general compund terms)
3. weighting

In [18]:
from collections import defaultdict

In [19]:
I = defaultdict(lambda: [])

In [20]:
for record in data.find().limit(10):
    text = ". ".join([record['text'], record['abstract']])
    tokens = text.lower().split()
    for token in set(tokens):
        I[token].append(record['text_id'])

In [28]:
q = 'THE TAMMANY HALL'
Q = q.lower().split()

In [29]:
Q

['the', 'tammany', 'hall']

In [30]:
A = defaultdict(lambda: 0)
for qtoken in Q:
    docs = I[qtoken]
    for d in docs:
        A[d] += 1

In [31]:
A

defaultdict(<function __main__.<lambda>()>,
            {'4fc0279345c1498b0d18cce8': 4,
             '4fc0279345c1498b0d18ccfc': 1,
             '4fc0279345c1498b0d18cd0d': 1,
             '4fc0279345c1498b0d18cd2d': 4,
             '4fc0279445c1498b0d18cd61': 1})