# Introduction to n-grams

The CISI dataset can be downloaded from the following address: [CISI dataset](https://www.kaggle.com/datasets/dmaso01dsta/cisi-a-dataset-for-information-retrieval/code?select=CISI.REL)

In this example, we access a local, parsed version of CISI stored in MongoDB.

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
from pymongo import MongoClient

In [3]:
# Connect with the client's default settings and select the parsed CISI database.
db = MongoClient().get_database('cisi')

## Get documents, queries and ground truth

In [4]:
from collections import defaultdict

In [5]:
# Pair each document id with its '.T' and '.W' fields joined into one string
# (presumably title and abstract -- confirm against the parsed CISI schema),
# and each query id with its '.W' text.
documents = [
    (record['id'], ". ".join((record['.T'], record['.W'])))
    for record in db['documents'].find()
]
queries = [
    (record['id'], record['.W'])
    for record in db['queries'].find()
]

In [6]:
# Group the 'rel' records by query: query id -> list of the 'doc' ids
# recorded for that query.
ground_truth = defaultdict(list)

for rel in db['rel'].find():
    docs_for_query = ground_truth[rel['query']]
    docs_for_query.append(rel['doc'])

## Tokenization and Normalization using spaCy

In [13]:
import spacy
from spacy.displacy import render
# Load the 'en_core_web_lg' spaCy pipeline once; it is reused by tokenizer() below.
# The model package has to be installed in the kernel's environment beforehand.
nlp = spacy.load('en_core_web_lg')

In [65]:
def tokenizer(document):
    """Return the lowercased lemmas of the content words in ``document``.

    The text is run through the module-level spaCy pipeline ``nlp``. Only
    tokens tagged as proper noun, noun, verb, adjective or adverb are kept,
    in their original order; every other token is discarded.
    """
    content_pos = {'PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV'}
    return [
        word.lemma_.lower()
        for word in nlp(document)
        if word.pos_ in content_pos
    ]

In [66]:
# Sanity check: show the first ten tokens produced for the first document.
doc_id, doc = documents[0]
tokenizer(doc)[:10]

['editions',
 'dewey',
 'decimal',
 'classifications',
 'present',
 'study',
 'history',
 'dewey',
 'decimal',
 'classification']

## N-Grams

In [67]:
import nltk
from collections import defaultdict, Counter

In [68]:
def ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as a list of tuples, with boundary markers.

    Exactly one '<SOS>' is placed before and one '<EOS>' after the tokens
    (independently of ``n``), so n-grams touching the start and the end of
    the text are produced as well.

    Parameters
    ----------
    tokens : iterable of str
        Any iterable (list, tuple, generator, ...); it is copied, never mutated.
    n : int
        Size of the n-grams.

    Returns
    -------
    list of tuple
        The n-grams of the padded sequence, in order.
    """
    # Unpacking (instead of list concatenation) accepts any iterable, not only lists.
    padded = ['<SOS>', *tokens, '<EOS>']
    return list(nltk.ngrams(padded, n=n))

In [69]:
# Count every bigram (including the <SOS>/<EOS> boundary bigrams) over the
# whole collection. Counter offers the same mapping interface as the former
# defaultdict(lambda: 0) and still reports 0 for unseen bigrams, but it is
# picklable and does not insert a key when one is merely looked up.
bi_index = Counter()
for doc_id, doc in documents:
    tokens = tokenizer(doc)
    bi_index.update(ngrams(tokens, n=2))

In [70]:
# One row per distinct bigram: first word (w1), second word (w2), count (c).
# Naming the columns explicitly keeps the frame well-formed even when
# bi_index is empty, so later column lookups do not fail with a KeyError.
Bi = pd.DataFrame(
    [(first, second, count) for (first, second), count in bi_index.items()],
    columns=['w1', 'w2', 'c'],
)

In [71]:
# Relative frequency of each bigram among all counted bigrams, i.e. an
# estimate of the joint probability P(w1, w2) -- not the conditional P(w2 | w1).
Bi['prob'] = Bi['c'].div(Bi['c'].sum())

In [72]:
# Inspect the bigram table: w1, w2, count c and joint probability prob.
Bi

Unnamed: 0,w1,w2,c,prob
0,<SOS>,editions,1,0.000009
1,editions,dewey,1,0.000009
2,dewey,decimal,9,0.000085
3,decimal,classifications,1,0.000009
4,classifications,present,1,0.000009
...,...,...,...,...
78831,monograph,last,1,0.000009
78832,average,%,1,0.000009
78833,year,scientific,1,0.000009
78834,compound,publish,1,0.000009


In [73]:
# Unigram counts derived from the bigram table: summing the bigram counts per
# first word gives how often each word opens a bigram. '<EOS>' never appears
# as w1, so it has no entry here.
# Select 'c' *before* aggregating so that only the count column is summed
# (the string column w2 and the prob column are not aggregated needlessly).
uni_count = Bi.groupby('w1')['c'].sum()
# Unigram probability estimate P(w).
Wp = uni_count / uni_count.sum()

In [74]:
# Number of bigrams that start with 'new'.
uni_count.loc['new']

339

In [75]:
# Probability of the pair 'new york' expected if the two words were independent:
# the product of their unigram probabilities.
Wp.loc['new'] * Wp.loc['york']

3.950263385890812e-07

In [76]:
# Observed joint probability of the bigram ('new', 'york'), for comparison
# with the product of the two unigram probabilities.
Bi.loc[(Bi['w1'] == 'new') & (Bi['w2'] == 'york'), 'prob']

23470    0.000114
Name: prob, dtype: float64

In [77]:
# Pointwise mutual information of each bigram:
#     PMI(w1, w2) = log( P(w1, w2) / (P(w1) * P(w2)) )
# Only bigrams whose two words each have a unigram count above
# MIN_UNIGRAM_COUNT are scored.
MIN_UNIGRAM_COUNT = 10

word1 = Bi.w1.values
word2 = Bi.w2.values
prob = Bi.prob

# Plain dicts make the per-row lookups cheap, and .get() handles words without
# a unigram entry ('<EOS>' never occurs as w1) explicitly instead of relying
# on a swallowed KeyError.
unigram_counts = uni_count.to_dict()
unigram_probs = Wp.to_dict()

mi = {}
# Iterate positionally so the result does not depend on Bi having a default
# 0..n-1 index.
for w1, w2, p in zip(word1, word2, prob.to_numpy()):
    if (unigram_counts.get(w1, 0) > MIN_UNIGRAM_COUNT
            and unigram_counts.get(w2, 0) > MIN_UNIGRAM_COUNT):
        mi[(w1, w2)] = np.log(p / (unigram_probs[w1] * unigram_probs[w2]))
S = pd.Series(mi)

In [78]:
# The twenty bigrams with the highest scores in S.
S.sort_values(ascending=False).iloc[:20]

cable         television     8.597217
gross         gross          7.984112
r             d              7.976565
numerical     taxonomy       7.916973
recon         pilot          7.916973
interlibrary  loan           7.738990
ca            condensates    7.698626
scientists    engineers      7.696430
batch         mode           7.655608
aslib         cranfield      7.635806
at            all            7.606818
departmental  prestige       7.606818
dewey         decimal        7.562320
january       june           7.560298
top           priority       7.464988
universal     decimal        7.462237
above         mention        7.408748
hard          copy           7.408748
put           forward        7.319136
campus        planner        7.297934
dtype: float64