In [2]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [171]:
import time
import math
import pandas as pd

import numpy as np
from pyserini.index import IndexReader
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [29]:
index_reader = IndexReader('indexes/lucene-index-msmarco-doc')

In [4]:
def is_good(x: str):
    conditions = []
    conditions.append((not nlp.vocab[x].is_stop))  # Remove stopwords
    conditions.append((not nlp.vocab[x].is_oov))  # Remove words without vectors if any
    conditions.append((not nlp.vocab[x].is_punct)) # Remove punctuation signs
    conditions.append(not nlp.vocab[x].is_space) # Remove spaces
    conditions.append(nlp.vocab[x].is_alpha)  # Keep only alphabetical
    conditions.append(nlp.vocab[x].is_lower) # Only keep lowercase words
    return all(conditions)


vocabulary = list(x for x in nlp.vocab.strings if is_good(x))
print(len(vocabulary))

247208


In [99]:
vocabulary

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaa',
 'aaaaaaa',
 'aaaaaaaa',
 'aaaaaaaaa',
 'aaaaaaaaaa',
 'aaaaaaaaaaa',
 'aaaaaaaaaaaa',
 'aaaaaaaaaaaaa',
 'aaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaaaaaaaa',
 'aaaaaaaaaaaah',
 'aaaaaaaaaaah',
 'aaaaaaaaaah',
 'aaaaaaaaah',
 'aaaaaaaaand',
 'aaaaaaaah',
 'aaaaaaaand',
 'aaaaaaah',
 'aaaaaaand',
 'aaaaaah',
 'aaaaaall',
 'aaaaaand',
 'aaaaages',
 'aaaaah',
 'aaaaahh',
 'aaaaahhh',
 'aaaaahhhh',
 'aaaaahhhhh',
 'aaaaahhhhhh',
 'aaaaall',
 'aaaaand',
 'aaaaargh',
 'aaaaaw',
 'aaaages',
 'aaaagh',
 'aaaah',
 'aaaahh',
 'aaaahhh',
 'aaaahhhh',
 'aaaahhhhh',
 'aaaahhhhhh',
 'aaaall',
 'aaaand',
 'aaaargh',
 'aaaaw',
 'aaaawww',
 'aaaawwww',
 'aaages',
 'aaagh',
 'aaah',
 'aaahh',
 'aaahhh',
 'aaahhhh',
 'aaahhhhh',
 'aaahhhhhh',
 'aaahs',
 'aaai',
 'aaall',
 'aaand',
 'aaargh',
 'aaas',
 'aaaw',
 'aaaww',
 'aaawww',
 'aaawwww',
 'aaawwwww',
 'aaay',
 'aab',
 'aabb',
 'aabout',
 'aac',
 'aacc',
 '

In [107]:
analyzer = Analyzer(get_lucene_analyzer())

In [96]:
# Gets rid of words like "aaaa" or "aaaaaaah"
def is_worth(s: str):
    df, cf = index_reader.get_term_counts(s)
    return df > 1000 and cf > 10000

filtered_vocabulary = [x for x in vocabulary if is_worth(x)]
len(filtered_vocabulary)

49143

In [97]:
filtered_vocabulary

['aa',
 'aaa',
 'aaas',
 'aaron',
 'aarons',
 'aas',
 'ab',
 'aba',
 'abandon',
 'abandone',
 'abandoned',
 'abandoner',
 'abandoners',
 'abandones',
 'abandoning',
 'abandonment',
 'abandonments',
 'abandonned',
 'abandonning',
 'abandons',
 'abas',
 'abate',
 'abated',
 'abatement',
 'abatements',
 'abates',
 'abating',
 'abbey',
 'abbeys',
 'abbi',
 'abbie',
 'abbott',
 'abbotts',
 'abbreviate',
 'abbreviated',
 'abbreviates',
 'abbreviating',
 'abbreviation',
 'abbreviations',
 'abby',
 'abbys',
 'abc',
 'abcs',
 'abdomen',
 'abdomens',
 'abdomin',
 'abdominal',
 'abdominals',
 'abduct',
 'abducted',
 'abducting',
 'abduction',
 'abductions',
 'abductive',
 'abducts',
 'abdul',
 'abe',
 'abed',
 'aberdeen',
 'aberrancy',
 'aberrant',
 'aberration',
 'aberrational',
 'aberrations',
 'abes',
 'abid',
 'abidal',
 'abidance',
 'abide',
 'abided',
 'abiders',
 'abides',
 'abiding',
 'abigail',
 'abil',
 'abilities',
 'ability',
 'abilitys',
 'abillities',
 'abillity',
 'abl',
 'ablate',

In [115]:
stemmed_vocabulary = []
stemmed_set = set()
for x in filtered_vocabulary:
    stemmed_x = analyzer.analyze(x)[0]
    if stemmed_x in stemmed_set:
        continue
    stemmed_vocabulary.append(x)
    stemmed_set.add(stemmed_x)
len(stemmed_vocabulary)

13567

In [116]:
stemmed_vocabulary

['aa',
 'aaa',
 'aaron',
 'ab',
 'aba',
 'abandon',
 'abate',
 'abbey',
 'abbi',
 'abbott',
 'abbreviate',
 'abc',
 'abdomen',
 'abdomin',
 'abduct',
 'abdul',
 'aberdeen',
 'aberrancy',
 'abid',
 'abigail',
 'abil',
 'abl',
 'ablate',
 'abnormal',
 'aboard',
 'abolish',
 'abolition',
 'abolitionist',
 'aboriginal',
 'abort',
 'abound',
 'aboute',
 'abov',
 'abraham',
 'abram',
 'abrasion',
 'abroad',
 'abrupt',
 'abruptly',
 'abscess',
 'absence',
 'absent',
 'absolut',
 'absorb',
 'absorption',
 'abstinance',
 'abstract',
 'absurd',
 'abt',
 'abu',
 'abundance',
 'abusable',
 'ac',
 'aca',
 'acad',
 'academe',
 'academi',
 'acc',
 'accelerant',
 'accent',
 'accept',
 'access',
 'accessorie',
 'accident',
 'accidental',
 'acclaim',
 'accommodate',
 'accompanied',
 'accomplish',
 'accord',
 'accordingly',
 'account',
 'accredit',
 'accrual',
 'accrue',
 'accu',
 'accumulate',
 'accuracies',
 'accurancy',
 'accusal',
 'accustom',
 'acer',
 'acetal',
 'acetaminophen',
 'aceton',
 'acetyl

In [136]:
from numba import jit

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

In [167]:
query_text = 'green card meaning'
tokenized_query = nlp(query_text)

In [179]:
def most_similar_terms(query_text):
    # start = time.time()
    tokenized_query = nlp(query_text)
    similar_words = sorted(stemmed_vocabulary, key=lambda x: sum(cosine_similarity_numba(token.vector, nlp.vocab[x].vector) for token in tokenized_query), reverse=True)
    number_of_terms = len(tokenized_query)
    res = []
    idx = 0
    while len(res) < number_of_terms:
        if similar_words[idx] not in query_text:
            res.append(similar_words[idx])
        idx += 1
    # print(time.time() - start)
    return res

In [184]:
most_similar_terms(query_text)

['blue', 'red', 'yellow']

In [135]:
# Words don't have the same importance. For example, it's not worth adding the word "blue" in the previous query
# Maybe assign idf weights to words?

total_documents = index_reader.stats()['documents']
def compute_idf(term: str):
    df, cf = index_reader.get_term_counts(term)
    return math.log(total_documents / df)

print(compute_idf('green'))
print(compute_idf('card'))
print(compute_idf('meaning'))

2.420193897174139
2.686870198832
1.2943294509143568


Meaning has the lowest IDF score in this example, but it is the most important word to expand

In [172]:
queries_doc = pd.read_csv('dev/queries.docdev.tsv', sep='\t', names=['q_id', 'text'])
queries_doc

Unnamed: 0,q_id,text
0,174249,does xpress bet charge to deposit money in you...
1,320792,how much is a cost to run disneyland
2,1090270,botulinum definition
3,1101279,do physicians pay for insurance from their sal...
4,201376,here there be dragons comic
...,...,...
5188,147073,difference between discrete and process manufa...
5189,243761,how long did abraham lincoln serve
5190,162662,does adult acne rosacea give you blepharitis
5191,247194,how long do you bake muffins


In [180]:
final_column = []
for idx, text in enumerate(queries_doc['text']):
    text_to_add = most_similar_terms(text)
    new_text = text
    for s in text_to_add:
        new_text += f' {s}'
    final_column.append(new_text)
    if idx % 100 == 0:
        print(f'{idx} / {len(queries_doc)} already expanded')
final_column

0 / 5193 already expanded
100 / 5193 already expanded
200 / 5193 already expanded
300 / 5193 already expanded
400 / 5193 already expanded
500 / 5193 already expanded
600 / 5193 already expanded
700 / 5193 already expanded
800 / 5193 already expanded
900 / 5193 already expanded
1000 / 5193 already expanded
1100 / 5193 already expanded
1200 / 5193 already expanded
1300 / 5193 already expanded
1400 / 5193 already expanded
1500 / 5193 already expanded
1600 / 5193 already expanded
1700 / 5193 already expanded
1800 / 5193 already expanded
1900 / 5193 already expanded
2000 / 5193 already expanded
2100 / 5193 already expanded
2200 / 5193 already expanded
2300 / 5193 already expanded
2400 / 5193 already expanded
2500 / 5193 already expanded
2600 / 5193 already expanded
2700 / 5193 already expanded
2800 / 5193 already expanded
2900 / 5193 already expanded
3000 / 5193 already expanded
3100 / 5193 already expanded
3200 / 5193 already expanded
3300 / 5193 already expanded
3400 / 5193 already expand

['does xpress bet charge to deposit money in your account cash need want sure paid matter let time reason credit',
 'how much is a cost to run disneyland sure want need better thing kind far exactly',
 'botulinum definition toxin synonym',
 'do physicians pay for insurance from their salaries? need paid want money afford expect care come cost',
 'here there be dragons comic know sure thing come thought',
 'blood diseases that are sexually transmitted certain particularly common fact necessarily particular',
 'define bona fides distinguish assert establish',
 'effects of detox juice cleanse diet detoxification herbal treatment alcohol',
 'do prince harry and william have last names know like thought think want sure tell come',
 'can hives be a sign of pregnancy need kind sure given want reason certain',
 'causes of petechial hemorrhage disease occur abnormal failure',
 'how long does it take to get your bsrn if you already have a bachelors degree want sure know need let reason think thi

Not all words seem to be represented equally in the expansion. Maybe we can do better with a different approach. We'll also try selecting one new word for each term in the query.

In [190]:
# Needs to be done once. Uncomment and run to expand queries

# queries_doc['text'] = final_column
# queries_doc.set_index('q_id', inplace=True)
# queries_doc.to_csv('expanded-dev.tsv', sep=' ')

In [204]:
def get_most_similar_word(term: str):
    token = nlp(term)[0]
    best_so_far = - 100
    to_ret = ''
    for word in stemmed_vocabulary:
        sim = cosine_similarity_numba(token.vector, nlp.vocab[word].vector)
        if word != term and sim > best_so_far:
            best_so_far = sim
            to_ret = word
    return to_ret

def most_similar_terms_v2(query_text):
    tokenized_query = nlp(query_text)
    res = [get_most_similar_word(term.text) for term in tokenized_query]
    return res

In [201]:
most_similar_terms_v2('are eggs or grapes better to fight colds')

0.6576089859008789


['tend', 'egg', 'instead', 'grape', 'good', 'bring', 'battle', 'bronchitis']

In [202]:
queries_doc = pd.read_csv('dev/queries.docdev.tsv', sep='\t', names=['q_id', 'text'])
queries_doc

Unnamed: 0,q_id,text
0,174249,does xpress bet charge to deposit money in you...
1,320792,how much is a cost to run disneyland
2,1090270,botulinum definition
3,1101279,do physicians pay for insurance from their sal...
4,201376,here there be dragons comic
...,...,...
5188,147073,difference between discrete and process manufa...
5189,243761,how long did abraham lincoln serve
5190,162662,does adult acne rosacea give you blepharitis
5191,247194,how long do you bake muffins


In [205]:
final_column_v2 = []
for idx, text in enumerate(queries_doc['text']):
    text_to_add = most_similar_terms_v2(text)
    new_text = text
    for s in text_to_add:
        new_text += f' {s}'
    final_column_v2.append(new_text)
    if idx % 100 == 0:
        print(f'{idx} / {len(queries_doc)} already expanded')
final_column_v2

0 / 5193 already expanded
100 / 5193 already expanded
200 / 5193 already expanded
300 / 5193 already expanded
400 / 5193 already expanded
500 / 5193 already expanded
600 / 5193 already expanded
700 / 5193 already expanded
800 / 5193 already expanded
900 / 5193 already expanded
1000 / 5193 already expanded
1100 / 5193 already expanded
1200 / 5193 already expanded
1300 / 5193 already expanded
1400 / 5193 already expanded
1500 / 5193 already expanded
1600 / 5193 already expanded
1700 / 5193 already expanded
1800 / 5193 already expanded
1900 / 5193 already expanded
2000 / 5193 already expanded
2100 / 5193 already expanded
2200 / 5193 already expanded
2300 / 5193 already expanded
2400 / 5193 already expanded
2500 / 5193 already expanded
2600 / 5193 already expanded
2700 / 5193 already expanded
2800 / 5193 already expanded
2900 / 5193 already expanded
3000 / 5193 already expanded
3100 / 5193 already expanded
3200 / 5193 already expanded
3300 / 5193 already expanded
3400 / 5193 already expand

['does xpress bet charge to deposit money in your account thing adobe gamble fee bring cash cash place sure payment',
 'how much is a cost to run disneyland understand far truly kind price bring ran disney',
 'botulinum definition toxin term',
 'do physicians pay for insurance from their salaries? want physician paid good mortgage directly bring wage huh',
 'here there be dragons comic know reason sure dragon superhero',
 'blood diseases that are sexually transmitted liver disease fact tend sexual transmit',
 'define bona fides distinguish fide bona',
 'effects of detox juice cleanse effect certain detoxification lemon detox',
 'do prince harry and william have last names want king potter work robert know week list',
 'can hives be a sign of pregnancy need hive sure kind right certain pregnant',
 'causes of petechial hemorrhage occur certain aa hematoma',
 'how long does it take to get your bsrn if you already have a bachelors degree understand short thing thing want bring want sure aa

In [206]:
# Needs to be done once. Uncomment and run to expand queries

queries_doc['text'] = final_column_v2
queries_doc.set_index('q_id', inplace=True)
queries_doc.to_csv('expanded-dev-v2.tsv', sep=' ')