In [1]:
from elasticsearch.helpers import scan
import tqdm
import numpy as np
import pickle

In [2]:
from elasticsearch import Elasticsearch
client = Elasticsearch("http://localhost:9200", request_timeout=1000)

index_names = ['technical_ind', 'objective_ind']
corpuses = {'technical_ind':{}, 'objective_ind':{}}
for index_name in index_names:
    ndocs = int(client.cat.count(index=index_name, format = "json")[0]['count'])
    print(f"There are {ndocs} documents in the index '{index_name}'")


    corpus = corpuses[index_name]    # will store _normalized_ tfidf for each document, key is internal elasticsearch id, value is dictionary of term -> tf-idf weight
    for s in tqdm.tqdm(scan(client, index=index_name, query={"query" : {"match_all": {}}}), total=ndocs):
        terms = []
        freqs = []
        dfs = []

        tv = client.termvectors(index=index_name, id=s['_id'], fields=['text'], term_statistics=True, positions=False)
        if 'text' in tv['term_vectors']:   # just in case some document has no field named 'text'
            for t in tv['term_vectors']['text']['terms']:
                f = tv['term_vectors']['text']['terms'][t]['term_freq']

                terms.append(t)
                freqs.append(tv['term_vectors']['text']['terms'][t]['term_freq'])
                dfs.append(tv['term_vectors']['text']['terms'][t]['doc_freq'])

        # vector computations for tf-idf; l2-normalized for further calculations..
        tfidf = np.array(freqs) * np.log2(ndocs / np.array(dfs))
        tfidf /= np.linalg.norm(tfidf)

        # save in corpus dictionary
        corpus[s['_id']] = {t: tfidf[j] for j, t in enumerate(terms)}



There are 924 documents in the index 'technical_ind'


100%|██████████| 924/924 [00:08<00:00, 111.39it/s]


There are 924 documents in the index 'objective_ind'


100%|██████████| 924/924 [00:08<00:00, 107.07it/s]


In [None]:
from rich import print

print(corpuses['technical_ind'].keys())

In [7]:
### imports ###

import numpy as np
import pickle
import heapq
import tqdm
import uuid
from pprint import pprint
from collections import Counter, defaultdict
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl import Search, Index, analyzer, tokenizer
from elasticsearch_dsl.query import Q

In [6]:
def stemmer(query: str) -> str:
    res = ind.analyze(body={'analyzer':'default', 'text': query})
    query_stemmed = ''
    first = True
    for r in res['tokens']:
        if not first:
            query_stemmed += ' ' + r['token']
        else:
            query_stemmed += r['token']
            first = False
    return query_stemmed

In [8]:
def norm(d: list[tuple[str, float]]) -> float:
    return np.sqrt(sum([freq*freq for term, freq in d]))


def normalize(d1: list[tuple[str, float]]):
    normm = norm(d1)
    return [(k, v/normm) for k, v in d1]

In [14]:
from elasticsearch.helpers import scan
from pprint import pprint
from elasticsearch import Elasticsearch
import tqdm
import numpy as np

client = Elasticsearch("http://localhost:9200", request_timeout=1000)

r = 10  # only return r top docs
query = 'win'
sims = dict()

l2query  = np.sqrt(len(query.split()))  # l2 of query assuming 0-1 vector representation

# get nr. of docs; just for the progress bar
ndocs = int(client.cat.count(index='objective_ind', format = "json")[0]['count'])

# scan through docs, compute cosine sim between query and each doc
for s in tqdm.tqdm(scan(client, index='objective_ind', query={"query" : {"match_all": {}}}), total=ndocs):
    docid = s['_source']['path']   # use path as id
    weights = corpuses['objective_ind'][docid]   # gets weights as a python dict of term -> weight (see remark above)
    sims[docid] = 0.0
    for w in query.split():  # gets terms as a list
        if w in weights:    # probably need to do something fancier to make sure that word is in vocabulary etc.
            sims[docid] += weights[w]   # accumulates if w in current doc
    # normalize sim
    sims[docid] /= l2query

# now sort by cosine similarity
sorted_answer = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)
pprint(sorted_answer[:r])


  0%|          | 0/924 [00:00<?, ?it/s]


KeyError: 'Objectives_files/00a24006-cb98-4dea-8750-52dcd93aac92.txt'