### Imports

In [103]:
import tqdm
import numpy as np
import pickle
from rich import print
import heapq
import uuid
from pprint import pprint
from collections import Counter, defaultdict
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl import Search, Index, analyzer, tokenizer
from elasticsearch_dsl.query import Q

### TF-IDF

In [104]:
from elasticsearch import Elasticsearch
client = Elasticsearch("http://localhost:9200", request_timeout=1000)

index_names = ['technical_ind', 'objective_ind']
# corpuses[index_name][path] -> {term: l2-normalized tf-idf weight}
corpuses = {'technical_ind':{}, 'objective_ind':{}}
for index_name in index_names:
    ndocs = int(client.cat.count(index=index_name, format = "json")[0]['count'])
    print(f"There are {ndocs} documents in the index '{index_name}'")

    # Filled in place; the key is the document's '_source.path', the value maps term -> weight.
    corpus = corpuses[index_name]
    for s in tqdm.tqdm(scan(client, index=index_name, query={"query" : {"match_all": {}}}), total=ndocs):
        tv = client.termvectors(index=index_name, id=s['_id'], fields=['text'], term_statistics=True, positions=False)

        # Just in case some document has no field named 'text': it then gets an empty vector.
        term_stats = tv['term_vectors']['text']['terms'] if 'text' in tv['term_vectors'] else {}

        terms = list(term_stats)
        freqs = np.array([term_stats[t]['term_freq'] for t in terms], dtype=float)
        dfs = np.array([term_stats[t]['doc_freq'] for t in terms], dtype=float)

        # tf-idf = term frequency * log2(ndocs / document frequency)
        tfidf = freqs * np.log2(ndocs / dfs)

        # l2-normalize for the cosine computations done later. A zero-length vector
        # (every idf is 0) is left as zeros: dividing by 0 would turn every weight into NaN.
        length = np.linalg.norm(tfidf)
        if length > 0:
            tfidf = tfidf / length

        # save in corpus dictionary
        corpus[s['_source']['path']] = dict(zip(terms, tfidf))



100%|██████████| 924/924 [00:01<00:00, 473.90it/s]


100%|██████████| 924/924 [00:01<00:00, 480.71it/s]


#### Basic functions

In [108]:
def norm(d: list[tuple[str, float]]) -> float:
    """Return the Euclidean (L2) length of a sparse vector.

    Args:
        d: (term, weight) pairs; only the weights are used.

    Returns:
        The square root of the sum of the squared weights (0.0 for an empty list).
    """
    squared_total = 0
    for _term, weight in d:
        squared_total += weight * weight
    return np.sqrt(squared_total)


def normalize(d1: list[tuple[str, float]]):
    """Scale a sparse vector of (term, weight) pairs to unit L2 length.

    Args:
        d1: (term, weight) pairs.

    Returns:
        A new list of (term, weight / norm(d1)) pairs in the same order.
    """
    length = norm(d1)
    return [(term, weight / length) for term, weight in d1]

def stemmer(query: str) -> str:
    """Run `query` through the index's 'default' analyzer and re-join the tokens.

    Relies on a module-level `ind` object exposing `analyze(body=...)`
    (presumably an elasticsearch_dsl `Index`; it is not defined in this part of the file).

    Args:
        query: Raw query text.

    Returns:
        The analyzed tokens separated by single spaces ('' when there are no tokens).
    """
    res = ind.analyze(body={'analyzer':'default', 'text': query})
    # str.join replaces the manual "first token" flag and repeated string concatenation.
    return ' '.join(token_info['token'] for token_info in res['tokens'])

### Objective_ind similarities

In [132]:
from elasticsearch.helpers import scan
from pprint import pprint
from elasticsearch import Elasticsearch
import tqdm
import numpy as np

client = Elasticsearch("http://localhost:9200", request_timeout=1000)

r = 10  # only return r top docs
queries = ['win prize many top dive trophy limit victory','learn skills dive improve gain experience', 'first try begin people knowledge start','level experiment journey collaborate experience']
# sims_ob[participant id][query index] -> cosine similarity between that document and that query
sims_ob : dict[str, dict[int,float]] = {}

# l2 norm of each query, assuming a 0-1 vector representation of its words
l2query  = [np.sqrt(len(query.split())) for query in queries]
# whitespace-separated words of every query, split once instead of once per document
query_terms = [query.split() for query in queries]

# number of docs; only needed for the progress bar
ndocs = int(client.cat.count(index='objective_ind', format = "json")[0]['count'])

# scan through docs, compute cosine sim between every query and each doc
for hit in tqdm.tqdm(scan(client, index='objective_ind', query={"query" : {"match_all": {}}}), total=ndocs):

    path = hit['_source']['path']   # the tf-idf corpus is keyed by path
    weights = corpuses['objective_ind'][path]   # term -> l2-normalized tf-idf weight
    docid = path.split('/')[-1].replace('.txt', '')   # file name without '.txt'
    doc_sims = sims_ob[docid] = {}
    for i, terms in enumerate(query_terms):
        # dot product with the 0-1 query vector: add up the weights of the query words found in the doc
        matched = sum((weights[w] for w in terms if w in weights), 0.0)
        # divide by the query norm (the document weights are already normalized)
        doc_sims[i] = matched / l2query[i]

# now sort by cosine similarity
#sorted_answer = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)

#pprint(sorted_answer[:r])


  0%|          | 0/924 [00:00<?, ?it/s]

100%|██████████| 924/924 [00:00<00:00, 25174.32it/s]


#### Print the similarities

In [138]:
"""print('OBJECTIVE INDEX')
for i in sims_ob.keys():
    if sims_ob[i][2] > 0.1:
        print(i, sims_ob[i])"""

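### Technical_ind similarities (note: indexed terms are analyzed, e.g. lower-cased, so raw query strings such as 'Python' match nothing unless they are analyzed the same way)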

In [112]:
from elasticsearch.helpers import scan
from pprint import pprint
from elasticsearch import Elasticsearch
import tqdm
import numpy as np

client = Elasticsearch("http://localhost:9200", request_timeout=1000)

r = 10  # only return r top docs
# NOTE(review): 'React' appears twice, so it is counted twice in the final sum - confirm this is intended.
queries = ['Python', 'React', 'PostgreSQL', 'Figma', 'C++', 'Java', 'React', 'Pytorch', 'SQL', 'HTML/CSS', 'MongoDB', 'Google', 'Flutter', 'Amazon', 'Raspberry', 'TensorFlow', 'AR/VR']

# The tf-idf weights are keyed by the terms Elasticsearch stored for the 'text' field,
# i.e. analyzed tokens. Raw strings such as 'Python' or 'HTML/CSS' are most likely never
# among those keys, which would explain why no document ever matched. Each query is
# therefore passed through the analyzer of the 'text' field before the lookup.
query_terms = []
for query in queries:
    analysis = client.indices.analyze(index='technical_ind', body={'field': 'text', 'text': query})
    # 0-1 query vector: keep each token once, in first-seen order
    query_terms.append(list(dict.fromkeys(tok['token'] for tok in analysis['tokens'])))

l2query  = [np.sqrt(len(terms)) for terms in query_terms]  # l2 of query assuming 0-1 vector representation

# get nr. of docs; just for the progress bar
ndocs = int(client.cat.count(index='technical_ind', format = "json")[0]['count'])

# per_query_sims[participant id][query index] -> cosine similarity
per_query_sims : dict[str, dict[int,float]] = {}

# scan through docs, compute cosine sim between query and each doc
for s in tqdm.tqdm(scan(client, index='technical_ind', query={"query" : {"match_all": {}}}), total=ndocs):
    path = s['_source']['path']   # the tf-idf corpus is keyed by path
    weights = corpuses['technical_ind'][path]   # term -> l2-normalized tf-idf weight
    docid = path.split('/')[-1].replace('.txt', '')
    per_query_sims[docid] = {}
    for i, terms in enumerate(query_terms):
        if not terms:
            # the analyzer removed every token: nothing can match, and the norm would be 0
            per_query_sims[docid][i] = 0.0
            continue
        # accumulate the weights of the query terms present in the doc, then normalize by the query norm
        matched = sum((weights[w] for w in terms if w in weights), 0.0)
        per_query_sims[docid][i] = matched / l2query[i]

# collapse to one score per participant: the sum of the similarities to every query
sims = {key: sum(subdict.values()) for key, subdict in per_query_sims.items()}

100%|██████████| 924/924 [00:00<00:00, 20354.28it/s]


In [114]:
# List every document whose summed similarity score in `sims` exceeds 0.01.
# `sims` is computed from the 'technical_ind' index, so the header names that index
# (it previously read 'OBJECTIVE INDEX', copied from the objective cell).
print('TECHNICAL INDEX')
for docid, score in sims.items():
    if score > 0.01:
        print(docid, score)

### MISCELLANEOUS TESTS

In [None]:
from participant import load_participants
from rich import print
import uuid
import os

data_path = "data/datathon_participants.json"
participants = load_participants(data_path)

# Free text per participant id: objective + introduction, and technical project + future excitement.
objectives : dict[uuid.UUID,str] = {}
technical : dict[uuid.UUID,str] = {}
for person in participants:
    technical[person.id] = person.technical_project + " " + person.future_excitement
    objectives[person.id] = person.objective + " " + person.introduction

# Count how often each whitespace-separated word occurs across all technical texts.
word_counts : dict[str,int] = {}
for description in technical.values():
    for token in description.split():
        word_counts[token] = word_counts.get(token, 0) + 1

# Most frequent words first.
sorted_answer = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
print(sorted_answer)
print(objectives['fcee953a-30c6-475a-b65c-ec49223281e9'])

# Check the path -> participant id conversion: keep the file name and drop '.txt'.
text = 'Objectives_files/fcee953a-30c6-475a-b65c-ec49223281e9.txt'
file_name = text.split('/')[-1]
resultat = file_name.replace('.txt', '')
print(resultat)

### NEW DATA

In [152]:
import json
import pathlib
import uuid
from dataclasses import dataclass
from typing import Dict, List, Literal


# Closed sets of values accepted by the corresponding Participant fields.
_YearOfStudy = Literal["1st year", "2nd year", "3rd year", "4th year", "Masters", "PhD"]
_ShirtSize = Literal["S", "M", "L", "XL"]
_Diet = Literal["None", "Vegetarian", "Vegan", "Gluten-free", "Other"]
_ExperienceLevel = Literal["Beginner", "Intermediate", "Advanced"]
_Role = Literal["Analysis", "Visualization", "Development", "Design", "Don't know", "Don't care"]


@dataclass
class Participant:
    """One datathon participant: registration data plus four profile scores.

    The four trailing score fields default to 0, so a participant can be built
    from the registration fields alone.
    """

    # --- Identity ---
    id: uuid.UUID  # unique identifier

    # --- Personal data ---
    name: str
    email: str
    age: int
    year_of_study: _YearOfStudy
    shirt_size: _ShirtSize
    university: str
    dietary_restrictions: _Diet

    # --- Experience and programming skills ---
    programming_skills: dict[str, int]
    experience_level: _ExperienceLevel
    hackathons_done: int

    # --- Interests, preferences and constraints ---
    interests: list[str]
    preferred_role: _Role
    interest_in_challenges: list[str]
    preferred_languages: list[str]
    friend_registration: list[uuid.UUID]
    preferred_team_size: int
    availability: dict[str, bool]

    # --- Profile scores describing the participant ---
    Tryhard: float = 0
    Learner: float = 0
    Rookie: float = 0
    Portfolio: float = 0

In [157]:
def load_participants(path: str, sims: Dict[uuid.UUID, List[float]]) -> List["Participant"]:
    """Load participants from a JSON file and attach their similarity scores.

    Args:
        path: Path to a '.json' file containing a list of participant records.
        sims: Scores per participant. Keys are compared against ``str(record['id'])``
            and each value is indexed with 0..3, which are stored as
            Tryhard, Learner, Rookie and Portfolio respectively.

    Returns:
        One Participant per record, in file order. Records without an entry in
        ``sims`` keep the score defaults of the Participant class.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If ``path`` does not have a '.json' suffix.
    """
    json_path = pathlib.Path(path)
    if not json_path.exists():
        raise FileNotFoundError(
            f"The file {path} does not exist, are you sure you're using the correct path?"
        )
    if json_path.suffix != ".json":
        raise ValueError(
            f"The file {path} is not a JSON file, are you sure you're using the correct file?"
        )

    # Context manager so the file handle is closed; JSON text is read as UTF-8.
    with json_path.open(encoding="utf-8") as json_file:
        participants_data = json.load(json_file)

    # Free-text fields present in the raw data but not in the Participant class.
    dropped_fields = ('objective', 'introduction', 'fun_fact', 'future_excitement', 'technical_project')
    # Position of each score inside a sims entry.
    score_fields = ('Tryhard', 'Learner', 'Rookie', 'Portfolio')

    participants = []
    for participant_data in participants_data:
        for field_name in dropped_fields:
            participant_data.pop(field_name, None)

        # sims is looked up with the id in its string form (no UUID conversion happens here).
        participant_id = str(participant_data['id'])
        if participant_id in sims:
            scores = sims[participant_id]
            for position, score_field in enumerate(score_fields):
                participant_data[score_field] = scores[position]

        participants.append(Participant(**participant_data))

    return participants


# Load every participant and attach the objective-index similarity scores held in sims_ob.
# NOTE(review): absolute, machine-specific path - presumably the same file as the relative
# "data/datathon_participants.json" used earlier in this file; confirm before changing it.
participants1 = load_participants('/Users/cristinateixidocruilles/Desktop/Datathon24/data/datathon_participants.json', sims_ob)

In [158]:
import csv
import pandas as pd

def participants_to_csv(participants: List["Participant"], output_file: str):
    """Write a list of Participant objects to a CSV file.

    The header is taken from the attributes of the first participant; the same
    attributes are then read from every participant, one row each. Dict and
    list values are written as their ``str()`` representation.

    Args:
        participants: Objects to export. An empty list produces an empty file
            (there is no participant to take the header from).
        output_file: Path of the CSV file to create or overwrite.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        if not participants:
            # participants[0] would raise IndexError; leave the file empty instead.
            return

        # Column names: the instance attributes of the first participant, in definition order.
        fieldnames = list(vars(participants[0]))

        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)

        for participant in participants:
            values = (getattr(participant, name) for name in fieldnames)
            # Convert complex types to strings
            writer.writerow(
                [str(value) if isinstance(value, (dict, list)) else value for value in values]
            )

# Example usage: export the participants loaded above (participants1) to 'participants.csv'.
participants_to_csv(participants1, 'participants.csv')