In [1]:
from elasticsearch.helpers import scan
import tqdm
import numpy as np
import pickle

In [14]:
from elasticsearch import Elasticsearch
client = Elasticsearch("http://localhost:9200", request_timeout=1000)

index_names = ['technical_ind', 'objective_ind']
# corpuses[index_name][path] -> {term: l2-normalized tf-idf weight}
corpuses = {name: {} for name in index_names}
for index_name in index_names:
    ndocs = int(client.cat.count(index=index_name, format="json")[0]['count'])
    print(f"There are {ndocs} documents in the index '{index_name}'")

    # Stores the _normalized_ tf-idf of each document. The key is the document's
    # `_source.path` (not the internal elasticsearch id); the value is a
    # dictionary of term -> tf-idf weight.
    corpus = corpuses[index_name]
    for s in tqdm.tqdm(scan(client, index=index_name, query={"query": {"match_all": {}}}), total=ndocs):
        terms = []
        freqs = []
        dfs = []

        # One term-vector request per document; term_statistics=True is what
        # adds 'doc_freq' to every term entry.
        tv = client.termvectors(index=index_name, id=s['_id'], fields=['text'], term_statistics=True, positions=False)
        if 'text' in tv['term_vectors']:   # just in case some document has no field named 'text'
            term_stats = tv['term_vectors']['text']['terms']
            for t, stats in term_stats.items():
                terms.append(t)
                freqs.append(stats['term_freq'])
                dfs.append(stats['doc_freq'])

        # Vector computation of tf-idf: tf * log2(N / df).
        tfidf = np.array(freqs, dtype=float) * np.log2(ndocs / np.array(dfs, dtype=float))

        # l2-normalize for the later cosine computations. Skip the division when
        # the norm is zero (every term occurs in all documents, so each idf is 0);
        # dividing would otherwise fill the vector with NaN.
        l2_norm = np.linalg.norm(tfidf)
        if l2_norm > 0:
            tfidf /= l2_norm

        # save in corpus dictionary
        corpus[s['_source']['path']] = dict(zip(terms, tfidf))



100%|██████████| 924/924 [00:10<00:00, 85.24it/s] 


100%|██████████| 924/924 [00:07<00:00, 119.74it/s]


In [None]:
from rich import print

# Show the keys of the 'objective_ind' corpus; they are the `_source.path` values
# used when the corpus dictionary was filled.
print(corpuses['objective_ind'].keys())

In [3]:
### imports ###

import numpy as np
import pickle
import heapq
import tqdm
import uuid
from pprint import pprint
from collections import Counter, defaultdict
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl import Search, Index, analyzer, tokenizer
from elasticsearch_dsl.query import Q

In [6]:
def stemmer(query: str) -> str:
    """Run ``query`` through the index's ``default`` analyzer and re-join the tokens.

    Relies on a module-level ``ind`` object exposing ``analyze(body=...)``. It is
    not defined in the visible part of this notebook (presumably an
    elasticsearch_dsl ``Index`` -- TODO confirm), so it must exist before calling.
    Whether the tokens are actually stemmed depends on how that index's
    ``default`` analyzer is configured.

    Args:
        query: Raw query text.

    Returns:
        The ``token`` value of every entry in the analyzer's ``tokens`` list,
        joined with single spaces (empty string when there are no tokens).
    """
    res = ind.analyze(body={'analyzer': 'default', 'text': query})
    # str.join replaces the hand-rolled "first element" flag and the repeated
    # string concatenation; the resulting string is the same.
    return ' '.join(r['token'] for r in res['tokens'])

In [8]:
def norm(d: list[tuple[str, float]]) -> float:
    """Return the Euclidean (L2) length of a sparse vector.

    Args:
        d: ``(term, weight)`` pairs; only the weights are used.

    Returns:
        Square root of the sum of squared weights (0.0 for an empty list).
    """
    squared_total = 0
    for _, weight in d:
        squared_total += weight * weight
    return np.sqrt(squared_total)


def normalize(d1: list[tuple[str, float]]):
    """Scale a sparse ``(term, weight)`` vector to unit L2 length.

    Args:
        d1: ``(term, weight)`` pairs.

    Returns:
        A new list of ``(term, weight / norm(d1))`` pairs in the same order.
        When the norm is zero (empty list or all-zero weights) the weights are
        returned unscaled instead of being divided by zero.
    """
    normm = norm(d1)
    if normm == 0:
        # A zero vector has no direction; dividing would only produce NaN/inf
        # (or raise ZeroDivisionError with plain Python floats).
        return [(k, v) for k, v in d1]
    return [(k, v / normm) for k, v in d1]

In [57]:
from elasticsearch.helpers import scan
from pprint import pprint
from elasticsearch import Elasticsearch
import tqdm
import numpy as np

client = Elasticsearch("http://localhost:9200", request_timeout=1000)

r = 10  # only return r top docs
queries = ['win prize many top dive trophy limit victory','learn skills dive improve gain experience', 'first try begin people knowledge start','level experiment journey collaborate experience']
sims : dict[str, dict[int,float]] = {}   # doc id -> {query index -> cosine similarity}

# NOTE(review): query words are matched verbatim against the corpus keys. The corpus
# keys look analyzer-stemmed (e.g. 'abil', 'collabor'), so unstemmed words such as
# 'experience' or 'victory' may never match -- consider running each query through
# the same analyzer first.

# Split every query once (instead of once per document) and drop repeated words,
# keeping first-seen order, so the 0-1 vector assumption behind l2query holds.
query_terms = [list(dict.fromkeys(query.split())) for query in queries]
l2query  = [np.sqrt(len(terms)) for terms in query_terms]  # l2 of query assuming 0-1 vector representation

# get nr. of docs; just for the progress bar
ndocs = int(client.cat.count(index='objective_ind', format = "json")[0]['count'])

# scan through docs, compute cosine sim between query and each doc
for s in tqdm.tqdm(scan(client, index='objective_ind', query={"query" : {"match_all": {}}}), total=ndocs):

    path = s['_source']['path']   # the corpus dictionary is keyed by path
    weights = corpuses['objective_ind'][path]   # python dict of term -> l2-normalized tf-idf weight
    docid = path.split('/')[-1].replace('.txt', '')   # file name without directory and '.txt'
    sims[docid] = {}
    for i, q_terms in enumerate(query_terms):
        if l2query[i] == 0:
            # empty query: define the similarity as 0 instead of computing 0/0
            sims[docid][i] = 0.0
            continue
        # Document vectors are already unit length, so the dot product with the
        # 0-1 query vector divided by the query norm is the cosine similarity.
        dot = sum((weights[w] for w in q_terms if w in weights), 0.0)
        sims[docid][i] = dot / l2query[i]

# Rank the documents per query by cosine similarity (best first) and keep the r best.
# The values of sims are per-query dicts, so sims cannot be sorted on its values directly.
top_docs = {
    i: sorted(sims, key=lambda d: sims[d][i], reverse=True)[:r]
    for i in range(len(queries))
}


100%|██████████| 924/924 [00:00<00:00, 2311.75it/s]


In [None]:
# Print every document whose similarity to the query at index 2 exceeds 0.1,
# together with its full per-query similarity dictionary.
for doc_id, scores in sims.items():
    if scores[2] > 0.1:
        print(doc_id, scores)

In [18]:
# Display the stored term -> weight dictionary of one document, looked up by its path.
corpuses['objective_ind']['Objectives_files/fcee953a-30c6-475a-b65c-ec49223281e9.txt']

{'abil': 0.08397771618210773,
 'be': 0.11216450959639765,
 'been': 0.12377455366192551,
 'best': 0.07079818365769823,
 'blockchain': 0.12369664517208896,
 'bring': 0.06339890890171522,
 'clear': 0.11477064158931573,
 'close': 0.1867315612323537,
 'code': 0.01913849682507567,
 'collabor': 0.11102753009373281,
 'come': 0.0394593597615803,
 'competit': 0.04310459432894165,
 'confid': 0.06937303615927695,
 'datathon': 0.0003729985232227365,
 'determin': 0.11344782914572797,
 'develop': 0.06732016807573349,
 'estrada': 0.2637557394698379,
 'few': 0.28169887678795363,
 'focus': 0.09071387117190104,
 'friend': 0.05304723702105247,
 'goal': 0.049260997663310074,
 'great': 0.13497516341996343,
 'hackathon': 0.09557820293114692,
 'hard': 0.11754502433468812,
 'have': 0.032385708674737446,
 'hei': 0.0050283798806660465,
 'here': 0.07372016235676383,
 'home': 0.16807355915130623,
 'hour': 0.10206922234885396,
 'hunger': 0.16477218299606786,
 'i': 0.0,
 'industri': 0.12453139008722679,
 'iot': 0.13

In [10]:
from participant import load_participants
from rich import print
import uuid
import os



data_path = "data/datathon_participants.json"
participants = load_participants(data_path)

# Two lookup tables keyed by participant id, each holding two free-text fields
# joined by a single space.
# NOTE(review): the keys are annotated as uuid.UUID, yet another cell indexes
# `objectives` with a plain str -- confirm the actual type of `p.id`.
objectives: dict[uuid.UUID, str] = {}
technical: dict[uuid.UUID, str] = {}

for p in participants:
    objectives[p.id] = " ".join((p.objective, p.introduction))
    technical[p.id] = " ".join((p.technical_project, p.future_excitement))


In [45]:
# Count how often each whitespace-separated token occurs across all objective texts.
# Tokens are used exactly as they appear (no lowercasing or punctuation stripping).
word_counts: dict[str, int] = {}
for objective_text in objectives.values():
    for token in objective_text.split():
        word_counts[token] = word_counts.get(token, 0) + 1

In [56]:
# (word, count) pairs ordered from most to least frequent; ties keep insertion order.
sorted_answer = list(word_counts.items())
sorted_answer.sort(key=lambda pair: pair[1], reverse=True)

# Number of occurrences of the exact token 'prize' (raises KeyError if it never occurs).
print(word_counts['prize'])


In [20]:
# Print the combined objective + introduction text stored for one participant.
# NOTE(review): the key here is a plain str while `objectives` is annotated with
# uuid.UUID keys -- confirm which type the participant ids really have.
print(objectives['fcee953a-30c6-475a-b65c-ec49223281e9'])

In [21]:
# Turn a corpus path into a bare document id: keep what follows the last '/'
# and remove the '.txt' text from it.
text = 'Objectives_files/fcee953a-30c6-475a-b65c-ec49223281e9.txt'
resultat = text.rsplit('/', 1)[-1].replace('.txt', '')
print(resultat)