In [57]:
from participant import load_participants
from rich import print
import pandas as pd
import numpy as np
import math
from dataclasses import dataclass
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
@dataclass
class Participant:
    id: str
    university: str
    interests: list[str]
    preferred_role: str
    objective: str
    introduction: str
    technical_project: str
    future_excitement: str
    fun_fact: str
    friend_registration: list[str]
    preferred_team_size: int
    availability: list[int]
    programming_skills: dict[str, int]
    interest_in_challenges: list[str]
    experience: float
    languages_ordered: list[str]
    maturity: float

In [7]:
INF = 999999

In [None]:
data_path = "data/datathon_participants.json"
participants = load_participants(data_path)


In [64]:
def dist_university(university_1, university_2) :
    if university_1 == university_2:
        return 0 
    else:
        return INF

In [None]:
#No usar
def dist_age_year_of_study(age_1, age_2, year_1, year_2, age_weight, year_weigth) -> float:
    age_distance = abs(age_1 - age_2)
    year_distance = abs(year_1 - year_2)
    return (age_weight * age_distance + year_weigth * year_distance) / (age_distance + year_distance)


In [5]:
def dist_one_hot_encoding(x, y):
    return sum([1 if x_i != y_i else 0 for x_i, y_i in zip(x, y)])

In [None]:
def dist_programming_skills(skills_1, skills_2) -> float:
    avg_level_1 = sum(skills_1.values()) / len(skills_1) if skills_1 else 0
    
    avg_level_2 = sum(skills_2.values()) / len(skills_2) if skills_2 else 0
    
    distance_avg_levels = abs(avg_level_1 - avg_level_2)
    
    all_skills = set(skills_1.keys()).union(set(skills_2.keys()))
    
    distance_union = 1 / len(all_skills) if all_skills else INF 
    
    return distance_avg_levels + distance_union


In [None]:
#No usar
def dist_experience(exp_1, exp_2, hackathons_1, hackathons_2, experience_weights) -> float:
    exp_distance = abs(exp_1 - exp_2)
    hackathon_distance = abs(np.log1p(hackathons_1) - np.log1p(hackathons_2))
    total_distance = experience_weights[0] * exp_distance + experience_weights[1] * hackathon_distance
    return total_distance

In [13]:
def dist_preferred_role(role_1, role_2):
    if role_1 == role_2:
        return 1.0  

    if role_1 == "Don't know" or role_2 == "Don't know":
        return 1 / 0.2  

    if role_1 == "Don't care" or role_2 == "Don't care":
        return 0.5  

    return 0.0

In [None]:
#No usar
def euclidean_distance(vector_1, vector_2):
    squared_diff_sum = sum((x - y) ** 2 for x, y in zip(vector_1, vector_2))
    
    return math.sqrt(squared_diff_sum)

In [55]:
def dist_language(languages_1, languages_2):
    if not languages_1 or not languages_2:  
        return 0
    
    common_languages = set(languages_1).intersection(set(languages_2))
    
    if not common_languages:  
        return INF

    distance = 0
    for i, lang_1 in enumerate(languages_1):
        for j, lang_2 in enumerate(languages_2):
            if lang_1 == lang_2:
                d = abs(i - j)
                return d + math.exp(i+j) - 1

In [68]:
df = pd.read_csv('clean_data.csv')

participants = {row['id']: Participant(**row) for _, row in df.iterrows()}


In [71]:
weights = {
    'university': 0.2,
    'interests': 0.2,
    'preferred_role': 0.3,
    'availability': 0.1,
    'programming_skils': 0.1,
    'interests_in_challenges': 0.05,
    'languages': 0.05,
    'experience': 0.05,
    'maturity': 0.15
}


In [None]:
def combined_distance(participant1: Participant, participant2: Participant, weigths: dict[str,float]):
    d_university = dist_university(participant1.university, participant2.university)*weigths['university']
    d_interests = dist_one_hot_encoding(participant1.interests, participant2. interests)*weigths['interests']
    d_preferred_role = dist_preferred_role(participant1.preferred_role, participant2. preferred_role)*weigths['preferred_role']
    d_availability = dist_one_hot_encoding(participant1.availability, participant2. availability)*weigths['availability']
    d_programming_skills = dist_programming_skills(participant1.programming_skills, participant2.programming_skills)*weigths['programming_skils']
    d_interests_in_challenges = dist_one_hot_encoding(participant1.interest_in_challenges, participant2.interest_in_challenges)*weigths['interests_in_challenges']
    d_languages = dist_language(participant1.languages_ordered, participant2.languages_ordered)*weigths['languages']
    d_experience = abs(participant1.experience - participant2.experience)*weigths['experience']
    d_maturity = abs(participant1.maturity - participant2.maturity)*weigths['maturity']
    total_distance = (
        d_university +
        d_interests +
        d_preferred_role +
        d_availability +
        d_programming_skills +
        d_interests_in_challenges +
        d_languages + 
        d_experience +
        d_maturity
    )
    return total_distance

In [73]:
participants['2ebad15c-c0ef-4c04-ba98-c5d98403a90c']

Participant(id='2ebad15c-c0ef-4c04-ba98-c5d98403a90c', university='Universitat Internacional de Catalunya (UIC)', interests='[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]', preferred_role='Design', objective="I'm super stoked to be participating in this datathon! My goal is to soak up the vibes, learn from others, and have an absolute blast. I want to join in on as many events and workshops as I can, learn new skills and insights, and make friends with like-minded peeps. I'm more about having fun and making connections than about trying to win (although, I do love a good challenge!). My objective is to leave this datathon feeling refreshed, inspired, and with new friendships to look back on. Bring it on!", introduction="Hi there! I'm Sara, and I'm super excited to be here at the datathon. As a university student, I'm always looking for new challenges and ways to learn. I've got a background in programming and love fiddling with coding - whether it's building 

In [72]:
print(combined_distance(participants['2ebad15c-c0ef-4c04-ba98-c5d98403a90c'],participants['2ebad15c-c0ef-4c04-ba98-c5d98403a90c'], weights))

AttributeError: 'str' object has no attribute 'values'