In [None]:
from openai import AsyncOpenAI
import asyncio
import numpy as np
import json
from itertools import combinations, chain
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date, timedelta
import time
import asyncio
api_key = open("api_key_unlimited.txt").read().strip()
client = AsyncOpenAI(api_key = api_key)

response = await client.embeddings.create(input=["Hello, world!"] ,model = "text-embedding-3-large")

In [None]:
async def get_embedding(text, model="text-embedding-3-large"):
    """Embed one string or a list of strings via the module-level `client`.

    Args:
        text: A single string, or a list of strings sent in one request.
        model: Name of the embedding model passed to the API.

    Returns:
        `[]` when `text` is falsy (empty string, empty list, None);
        a single embedding when `text` is not a list;
        otherwise a list with one embedding per input, in response order.

    Side effects:
        Prints the wall-clock start, end and elapsed time of each request.
    """
    if not text:
        return []

    # isinstance (not an exact type comparison) so that list subclasses are
    # sent as a batch instead of being wrapped as one malformed input.
    is_batch = isinstance(text, list)
    payload = text if is_batch else [text]

    start = time.time()
    response = await client.embeddings.create(input=payload, model=model)
    end = time.time()
    print(f"Start time {start}, end time {end}, Time taken: {end-start}")

    if is_batch:
        return [r.embedding for r in response.data]
    return response.data[0].embedding

def cosine_similarity(vec1, vec2):
    """Return the dot product of two vectors divided by the product of their norms.

    No guard is applied for zero-length vectors; the division is performed as-is.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

def flatten_values(values):
    """Yield every element of `values`, expanding list elements one level deep.

    Only `list` instances are expanded; nested lists inside them and all other
    element types are yielded unchanged.
    """
    for entry in values:
        if not isinstance(entry, list):
            yield entry
            continue
        yield from entry

In [None]:
# temp = ['This is so weird'] * 100
# await asyncio.gather(*[get_embedding(temp) for _ in range(1000)])

In [None]:
DEGREE_ALIAS = {
    "highschool": ["high_school", "high school", "highschool", "high-school", "secondary school", "secondaryschool", "secondary-school"],
    "associate": ["associate", "associates", "associate's", "associates'", "associate bachelor", "associate bs"],
    "bs": ["bachelor", "bachelors", "bachelor's", "bachelors'", "undergraduate", "undergrad", "undergrads", "undergrad's", "undergrads'", "bs", "ba", "b.sc", "b.sc.", "bsc", "bachelor of science", "bachelor of arts", "b.a.","b.a"],
    "ma": ["master", "masters", "master's", "masters'", "graduate", "grad", "grads", "grad's", "grads'", "ms", "ma", "m.sc", "m.sc.", "msc", "master of science", "master of arts", "m.a.","m.a"],
    "phd": ["phd", "ph.d", "doctorate", "doctoral", "doctor", "doctor's", "doctorate of philosophy", "doctorate of philosophy", "doctor of philosophy"],
    "md": ["md", "m.d", "doctor of medicine", "doctorate of medicine"],
    "do" : ["do", "d.o", "doctor of osteopathic medicine", "doctorate of osteopathic medicine"],
    "jd": ["jd", "j.d", "juris doctor", "juris doctorate"],
    "pharmd": ["pharmd", "pharm.d", "doctor of pharmacy", "doctorate of pharmacy"],
    "mba": ["mba", "m.b.a", "master of business administration"],
    "dba": ["dba", "d.b.a", "doctor of business administration"],
}
potential_degrees = list(DEGREE_ALIAS.keys())
potential_degrees_embeddings = await asyncio.gather(*[get_embedding(potential) for potential in potential_degrees])

for key in DEGREE_ALIAS:
    DEGREE_ALIAS[key] = sorted(DEGREE_ALIAS[key], key = len, reverse = True)
    

In [None]:
async def get_profile_embeddings(profile):
    """Embed every education and position entry of a profile in one request.

    Args:
        profile: Mapping with an 'education' list and a 'position' list. Each
            education entry supplies school, degree, major, description and
            activities strings; each position entry supplies company name,
            location, summary and title.

    Returns:
        A list of dicts: one per education entry (school, degree, combined and
        major embeddings) followed by one per position entry (company name,
        location, summary and title embeddings).
    """
    edu_width = 10  # texts embedded per education entry
    pos_width = 4   # texts embedded per position entry

    education_entries = profile['education']
    education_count = len(education_entries)
    texts = []
    for edu in education_entries:
        texts.extend([
            edu['education_school'],
            edu['education_degree'],
            edu['education_school'] + " " + edu['education_degree'] + " " + edu['education_major'],
            edu['education_major'],
            edu['education_major'] + " " + edu['education_description'],
            edu['education_major'] + " " + edu['education_description'] + " " + edu['education_activities'],
            edu['education_major'] + " " + edu['education_activities'],
            edu['education_description'],
            edu['education_description'] + " " + edu['education_activities'],
            edu['education_activities'],
        ])

    position_entries = profile['position']
    position_count = len(position_entries)
    for pos in position_entries:
        texts.extend([
            pos['position_company_name'],
            pos['position_location'],
            pos['position_summary'],
            pos['position_title'],
        ])

    # Falsy texts are replaced by a single space before being embedded.
    texts = [text or " " for text in texts]
    vectors = await get_embedding(texts)

    results = []
    for entry_no in range(education_count):
        base = entry_no * edu_width
        chunk = vectors[base:base + edu_width]
        results.append({
            'education_school_embedding' : chunk[0],
            'education_degree_embedding' : chunk[1],
            'education_embedding' : chunk[2],
            # Slot order: constant 1, activities, description,
            # description+activities, major, major+activities,
            # major+description, major+description+activities.
            'education_major_embedding' : [1, chunk[9], chunk[7], chunk[8], chunk[3], chunk[6], chunk[4], chunk[5]],
        })

    positions_base = education_count * edu_width
    for entry_no in range(position_count):
        base = positions_base + entry_no * pos_width
        chunk = vectors[base:base + pos_width]
        results.append({
            'position_company_name_embedding' : chunk[0],
            'position_location_embedding' : chunk[1],
            'position_summary_embedding' : chunk[2],
            'position_title_embedding' : chunk[3],
        })
    return results

In [None]:
def string_from_education(education):
    """Join all education entries into one string, one "."-terminated chunk each.

    For every entry the id, school LinkedIn URL and school logo fields are
    blanked, then all values (list values expanded one level through
    `flatten_values`) are joined with single spaces and a "." is appended.

    Args:
        education: Iterable of education dicts, or a falsy value.

    Returns:
        The concatenated string, or the literal "null" when `education` is
        falsy.

    Note:
        Blanking is done on a shallow copy, so the caller's dicts are left
        untouched; the produced string is the same as if they were blanked in
        place.
    """
    if not education:
        return "null"

    chunks = []
    for entry in education:
        scrubbed = dict(entry)
        scrubbed["education_id"] = scrubbed["education_school_linkedin_url"] = scrubbed['education_school_logo'] = ''
        chunks.append(" ".join(flatten_values(scrubbed.values())) + ".")
    return "".join(chunks)

def strings_from_education(education):
    """Return one space-joined string per education entry.

    For every entry the id, school LinkedIn URL and school logo fields are
    blanked, then all values (list values expanded one level through
    `flatten_values`) are joined with single spaces.

    Args:
        education: Iterable of education dicts, or a falsy value.

    Returns:
        A list of strings, one per entry; an empty list when `education` is
        falsy.

    Note:
        Blanking is done on a shallow copy, so the caller's dicts are left
        untouched.
    """
    if not education:
        return []

    strings = []
    for entry in education:
        scrubbed = dict(entry)
        scrubbed["education_id"] = scrubbed["education_school_linkedin_url"] = scrubbed['education_school_logo'] = ''
        strings.append(" ".join(flatten_values(scrubbed.values())))
    return strings

def degree_normalization(degree):
    """Map a free-text degree onto a canonical key of `DEGREE_ALIAS`.

    The input is lower-cased and every alias is searched as a whole token: it
    must not be directly preceded or followed by a letter or digit. Among all
    matching aliases the longest one wins; ties go to the key listed first in
    `DEGREE_ALIAS`.

    Plain substring matching was wrong for short aliases: "mba" matched "ba"
    (-> "bs"), "diploma" and "pharmacy" matched "ma", "pharmd" matched "md".

    Args:
        degree: Raw degree text; may be empty or None.

    Returns:
        " " when `degree` is falsy, the canonical key when an alias matches,
        otherwise the lower-cased input unchanged.
    """
    import re

    if not degree:
        return " "

    degree = degree.lower()
    best_key = None
    best_length = 0
    for key, values in DEGREE_ALIAS.items():
        for value in values:
            # Only a strictly longer alias can replace the current best.
            if len(value) <= best_length:
                continue
            pattern = r"(?<![a-z0-9])" + re.escape(value) + r"(?![a-z0-9])"
            if re.search(pattern, degree):
                best_key, best_length = key, len(value)

    return best_key if best_key is not None else degree

# helper function to calculate the similarity between two education entries
def calc_school_similarity(target, similar):
    """Score how alike the schools of two education entries are.

    Returns the neutral value 1 when either school name is missing; otherwise
    the dot product of the two school embeddings.

    "Missing" means None, empty or whitespace-only. The whitespace case
    matters because get_weighted_embedding() replaces an empty school with a
    single space, which a plain `== ''` comparison never catches.
    """
    target_school = (target['education_school'] or '').strip()
    similar_school = (similar['education_school'] or '').strip()
    if not target_school or not similar_school:
        return 1
    return np.dot(target['education_school_embedding'], similar['education_school_embedding'])

def calc_degree_similarity(target, similar):
    """Return the dot product of the degree embeddings of two education entries."""
    target_vector = target['education_degree_embedding']
    similar_vector = similar['education_degree_embedding']
    return np.dot(target_vector, similar_vector)

def calc_major_similarity(target, similar):
    """Score how alike the field of study of two education entries is.

    Returns 1 when both majors are present and one contains the other.
    Otherwise an embedding slot is chosen from the fields that BOTH entries
    actually have (major = 4, description = 2, activities = 1, summed) and the
    dot product of the two entries' embeddings in that slot is returned. Slot
    0 holds the constant 1, so entries sharing no populated field score 1.

    A field counts as present only if it is not None, empty or
    whitespace-only. get_weighted_embedding() pads blanks to a single space,
    so comparing against '' treated every field as present (always slot 7),
    and a padded " " major matched any multi-word major as a substring.
    """
    def _clean(entry, field):
        return (entry[field] or '').strip()

    def _both_have(field):
        return bool(_clean(target, field)) and bool(_clean(similar, field))

    target_major = _clean(target, 'education_major')
    similar_major = _clean(similar, 'education_major')
    both_majors = bool(target_major) and bool(similar_major)

    # The containment shortcut only makes sense for two real majors; an empty
    # string is trivially contained in everything.
    if both_majors and (target_major in similar_major or similar_major in target_major):
        return 1

    index = 0
    if both_majors:
        index += 4
    if _both_have('education_description'):
        index += 2
    if _both_have('education_activities'):
        index += 1

    target_embedding = target['education_major_embedding'][index]
    similar_embedding = similar['education_major_embedding'][index]

    return np.dot(target_embedding, similar_embedding)

def calc_time_similarity(target, similar):
    """Score how much the study periods of two education entries overlap.

    - If either start year is '', return 1 when the end years are equal, else 0.
    - Else if either end year is '', return 1 when the start years are equal,
      else 0.
    - Otherwise convert all four years to int and return
      2 * overlap / (target duration + similar duration), where overlap is the
      non-negative length of the shared interval. When both durations are 0
      the result is 1 if the start years match, else 0.
    """
    if any(entry['education_startyear'] == '' for entry in (target, similar)):
        return int(target['education_endyear'] == similar['education_endyear'])
    if any(entry['education_endyear'] == '' for entry in (target, similar)):
        return int(target['education_startyear'] == similar['education_startyear'])

    # All four years are available from here on.
    t_start = int(target['education_startyear'])
    t_end = int(target['education_endyear'])
    s_start = int(similar['education_startyear'])
    s_end = int(similar['education_endyear'])

    overlap = max(0, min(t_end, s_end) - max(t_start, s_start))
    combined_duration = abs(t_end - t_start) + abs(s_end - s_start)
    if combined_duration == 0:
        return int(t_start == s_start)
    return 2 * overlap / combined_duration

# return a list of dictionaries of information (check variable result) for each education entry
async def get_weighted_embedding(education):
    """Build the comparison record for every education entry, concurrently.

    Each record carries the cleaned text fields, the raw start/end years, a
    flat string from string_from_education(), and embeddings obtained from a
    single get_embedding() call per entry.

    'education_major_embedding' is an 8-slot list laid out so that slot
    4*major + 2*description + 1*activities holds the embedding of exactly that
    combination of fields; slot 0 is the constant 1.

    Args:
        education: Iterable of raw education dicts.

    Returns:
        A list with one record per entry, in input order.
    """
    def _or_blank(value):
        # An empty string is replaced by a single space.
        return value if value != "" else " "

    async def process_single_education(e):
        school = _or_blank(e['education_school'])
        degree_level = e['education_degree_level']
        # Fall back to the normalised free-text degree when no level is given.
        degree = degree_level if degree_level != "" else degree_normalization(e['education_degree'])
        major = _or_blank(e['education_major'])
        description = _or_blank(e['education_description'])
        activities = _or_blank(e['education_activities'])
        start_year = e['education_startyear']
        end_year = e['education_endyear']
        flat_string = string_from_education([e])

        vectors = await get_embedding([
            school,
            degree,
            school + " " + degree + " " + major,
            major,
            major + " " + description,
            major + " " + description + " " + activities,
            major + " " + activities,
            description,
            description + " " + activities,
            activities,
        ])

        return {
            'education_embedding': vectors[2],
            'education_school_embedding': vectors[0],
            'education_degree_embedding': vectors[1],
            'education_major_embedding': [1, vectors[9], vectors[7], vectors[8], vectors[3], vectors[6], vectors[4], vectors[5]],
            'education_year_score': 0,
            'education_string': flat_string,
            'education_school': school,
            'education_degree': degree,
            'education_major': major,
            'education_description': description,
            'education_activities': activities,
            'education_startyear': start_year,
            'education_endyear': end_year,
        }

    # One coroutine per entry, awaited together.
    return await asyncio.gather(*(process_single_education(e) for e in education))

async def calculate_similarity_weighted(target_education, similar_education):
    """Greedily pair education entries of two profiles by similarity.

    Every (target, similar) pair is scored as the product of school, degree,
    major and time similarity. The best-scoring pair is recorded, then every
    remaining pair whose target equals the matched target or whose similar
    entry equals the matched similar entry is discarded; this repeats until
    no pairs are left.

    Returns:
        0 when either side yields no entries; otherwise a dict mapping each
        matched target's 'education_string' to a tuple of
        (similar 'education_string', score).
    """
    target_entries, similar_entries = await asyncio.gather(
        get_weighted_embedding(target_education),
        get_weighted_embedding(similar_education),
    )

    if not (target_entries and similar_entries):
        return 0

    # Score the full cross product, target-major order.
    scored_pairs = []
    for t_entry in target_entries:
        for s_entry in similar_entries:
            school = calc_school_similarity(t_entry, s_entry)
            degree = calc_degree_similarity(t_entry, s_entry)
            period = calc_time_similarity(t_entry, s_entry)
            major = calc_major_similarity(t_entry, s_entry)
            scored_pairs.append((t_entry, s_entry, school * degree * major * period))

    matches = {}
    while scored_pairs:
        best_target, best_similar, best_score = max(scored_pairs, key=lambda pair: pair[2])
        matches[best_target['education_string']] = (best_similar['education_string'], best_score)
        # Entries are compared by value, so duplicates of a matched entry are
        # discarded together with it.
        scored_pairs = [
            pair for pair in scored_pairs
            if pair[0] != best_target and pair[1] != best_similar
        ]

    return matches

In [None]:

async def get_position_info(position):
    """Build the comparison record for every position entry, concurrently.

    Text fields that are falsy become a single space. Dates are parsed with
    the "%b %Y" format into `date` objects:
      - end date 'present' or None -> today's date; '' -> the minimum date;
      - start date '' or None -> the minimum date.
    Each record also receives the embeddings of company name, location,
    summary and title, fetched with one get_embedding() call per entry.

    Args:
        position: Iterable of raw position dicts.

    Returns:
        A list with one record per entry, in input order.
    """
    def _or_blank(value):
        return value if value else ' '

    def _parse_month(value):
        return datetime.strptime(value, "%b %Y").date()

    async def process_single_position(pos):
        result = {
            'position_company_name': _or_blank(pos['position_company_name']),
            'position_end_date': 0,
            'position_location': _or_blank(pos['position_location']),
            'position_start_date': 0,
            'position_summary': _or_blank(pos['position_summary']),
            'position_title': _or_blank(pos['position_title']),
        }

        raw_end = pos['position_end_date']
        if raw_end is None or raw_end == 'present':
            end_date = datetime.now().date()
        elif raw_end == '':
            end_date = datetime.min.date()
        else:
            end_date = _parse_month(raw_end)
        result['position_end_date'] = end_date

        raw_start = pos['position_start_date']
        if raw_start == '' or raw_start is None:
            start_date = datetime.min.date()
        else:
            start_date = _parse_month(raw_start)
        result['position_start_date'] = start_date

        vectors = await get_embedding([
            result['position_company_name'],
            result['position_location'],
            result['position_summary'],
            result['position_title'],
        ])

        embedding_keys = (
            'position_company_name_embedding',
            'position_location_embedding',
            'position_summary_embedding',
            'position_title_embedding',
        )
        for slot, key in enumerate(embedding_keys):
            result[key] = vectors[slot]

        return result

    return await asyncio.gather(*(process_single_position(pos) for pos in position))

def calc_company_name_similarity(target, similar):
    """Return 1 if either company name is blank, else the dot product of the name embeddings."""
    field = 'position_company_name'
    if any(not entry[field].strip() for entry in (target, similar)):
        return 1
    target_vector = target['position_company_name_embedding']
    similar_vector = similar['position_company_name_embedding']
    return np.dot(target_vector, similar_vector)

def calc_location_similarity(target, similar):
    """Return 1 if either location is blank, else the dot product of the location embeddings."""
    field = 'position_location'
    if any(not entry[field].strip() for entry in (target, similar)):
        return 1
    target_vector = target['position_location_embedding']
    similar_vector = similar['position_location_embedding']
    return np.dot(target_vector, similar_vector)

def calc_summary_similarity(target, similar):
    """Score how alike two position summaries are.

    Returns 1 when either summary is blank or when one summary contains the
    other; otherwise the dot product of the summary embeddings.
    """
    field = 'position_summary'
    if any(not entry[field].strip() for entry in (target, similar)):
        return 1

    target_summary = target[field]
    similar_summary = similar[field]
    if target_summary in similar_summary or similar_summary in target_summary:
        return 1

    return np.dot(target['position_summary_embedding'], similar['position_summary_embedding'])

def calc_title_similarity(target, similar):
    """Return 1 if either title is blank, else the dot product of the title embeddings."""
    field = 'position_title'
    if any(not entry[field].strip() for entry in (target, similar)):
        return 1
    target_vector = target['position_title_embedding']
    similar_vector = similar['position_title_embedding']
    return np.dot(target_vector, similar_vector)

def calc_pos_time_similarity(target, similar):
    """Score how well the date ranges of two positions line up.

    Returns 0 when a start/end key is absent from either entry or when any of
    the four dates equals ' '. Otherwise the entry with the earlier (or equal,
    favouring `target`) start date is treated as the outer range:
      - 1   if the outer range ends on or after the other one ends,
      - 0.7 if it ends at most 90 days before the other one ends,
      - 0   otherwise.
    """
    for field in ('position_start_date', 'position_end_date'):
        if not (field in target and field in similar):
            return 0

    t_start, t_end = target['position_start_date'], target['position_end_date']
    s_start, s_end = similar['position_start_date'], similar['position_end_date']
    if any(value == ' ' for value in (t_start, t_end, s_start, s_end)):
        return 0

    if t_start <= s_start:
        outer_end, other_end = t_end, s_end
    else:
        outer_end, other_end = s_end, t_end

    if outer_end >= other_end:
        return 1
    if (other_end - outer_end) <= timedelta(days = 90):
        return 0.7
    return 0

def get_custom_judge(target, similar):
    """Return True when at least three of the four field similarities reach their cutoff.

    Cutoffs: company name 0.67, location 0.75, summary 0.7, title 0.68.
    """
    checks = (
        (calc_company_name_similarity, 0.67),
        (calc_location_similarity, 0.75),
        (calc_summary_similarity, 0.7),
        (calc_title_similarity, 0.68),
    )
    score = 0
    for similarity_fn, cutoff in checks:
        score += int(similarity_fn(target, similar) >= cutoff)
    return score >= 3

# deprecated
async def calculate_position_weighted(target_position, similar_position):
    """Deprecated: sweep two position lists in end-date order and pair them up.

    Both lists are sorted by end date and walked front to front. When one
    position's date range contains the other's, the pair is scored (product
    of company, location, summary and title similarity) and recorded under
    the target's title; get_custom_judge() decides whether the pair counts
    towards the match threshold. After each comparison the entry that ends
    earlier is dropped. Progress is reported through print().

    Args:
        target_position: Raw position dicts of the target profile.
        similar_position: Raw position dicts of the candidate profile.

    Returns:
        0 when either list is empty; otherwise a dict mapping every target
        title to (similar title, score), or to -1 if it was never paired.

    Side effects:
        Increments the module-level counters `total_match_num` (every
        non-empty call) and `perfect_match_num` (threshold reached). They are
        created on first use; without the `global` declaration the in-place
        increments raised UnboundLocalError on every non-empty call.
    """
    global perfect_match_num, total_match_num

    def _contains(outer, inner):
        # True when `outer` starts no later and ends no earlier than `inner`.
        return (outer['position_start_date'] <= inner['position_start_date']
                and outer['position_end_date'] >= inner['position_end_date'])

    target_position_list, similar_position_list = await asyncio.gather(
        get_position_info(target_position),
        get_position_info(similar_position)
    )

    if not target_position_list or not similar_position_list:
        return 0

    target_position_list.sort(key = lambda x: x['position_end_date'])
    similar_position_list.sort(key = lambda x: x['position_end_date'])

    # Using -1 to denote unvisited target position
    target_similar_mapping = {entry['position_title']: -1 for entry in target_position_list}

    # Number of judged-good pairs required: half of the shorter list, rounded up.
    threshold = min(math.ceil(0.5 * len(similar_position_list)), math.ceil(len(target_position_list) * 0.5))

    print(f"target position number: {len(target_position_list)} similar position number: {len(similar_position_list)}")
    print(f"initial threshold: {threshold}")

    while target_position_list and similar_position_list:
        target = target_position_list[0]
        similar = similar_position_list[0]

        if target['position_start_date'] == ' ' or target['position_end_date'] == ' ':
            target_position_list.pop(0)
            print(f"target position has no time stamp, remaining {len(target_position_list)} target positions and {len(similar_position_list)} similar positions")
            continue
        elif similar['position_start_date'] == ' ' or similar['position_end_date'] == ' ':
            similar_position_list.pop(0)
            print(f"similar position has no time stamp, remaining {len(target_position_list)} target positions and {len(similar_position_list)} similar positions")
            continue
        elif _contains(target, similar) or _contains(similar, target):
            # Both containment directions are handled identically.
            company_similarity = calc_company_name_similarity(target, similar)
            location_similarity = calc_location_similarity(target, similar)
            summary_similarity = calc_summary_similarity(target, similar)
            title_similarity = calc_title_similarity(target, similar)
            similarity_score = np.prod([company_similarity, location_similarity, summary_similarity, title_similarity])

            print(
                f"position time: {target['position_start_date']} - {target['position_end_date']} vs.\n"
                f" {similar['position_start_date']} - {similar['position_end_date']}\n"
                f" company name: {target['position_company_name']} vs. \n"
                f" {similar['position_company_name']} similarity: {company_similarity}\n"
                f" location: {target['position_location']} vs. \n"
                f" {similar['position_location']} similarity: {location_similarity}\n"
                f" summary: {target['position_summary']} vs. \n"
                f" {similar['position_summary']} similarity: {summary_similarity}\n"
                f" title: {target['position_title']} vs. \n"
                f" {similar['position_title']} similarity: {title_similarity}\n"
                f" overall similarity: {similarity_score}\n"
            )

            judge = get_custom_judge(target, similar)
            if judge:
                print("This is a good match")
                threshold -= 1
                print(f"threshold value: {threshold}")
            else:
                print("This does not match")

            target_similar_mapping[target['position_title']] = (similar['position_title'], similarity_score)
        else:
            print(f"position time: {target['position_start_date']} - {target['position_end_date']} vs.\n {similar['position_start_date']} - {similar['position_end_date']}")

        # Advance whichever side ends first.
        if target['position_end_date'] < similar['position_end_date']:
            target_position_list.pop(0)
            print(f"target position ends earlier, remaining {len(target_position_list)} target positions and {len(similar_position_list)} similar positions")
        else:
            similar_position_list.pop(0)
            print(f"similar position ends earlier, remaining {len(target_position_list)} target positions and {len(similar_position_list)} similar positions")

    if threshold <= 0:
        perfect_match_num = globals().get('perfect_match_num', 0) + 1
        print("threshold met, perfect match")
    else:
        print(f"threshold not met, remaining {threshold} positions")

    total_match_num = globals().get('total_match_num', 0) + 1

    # Report target positions that were never paired.
    for target in target_similar_mapping:
        if target_similar_mapping[target] == -1:
            print(f"target position: {target} has no similar position")

    print("****************************************** next target similar pair ***************************************************************")
    return target_similar_mapping

async def calculate_position_all_comb(target, similar):
    """Greedily pair the positions of two profiles over all combinations.

    Every (target, similar) pair is scored as the sum of company, location,
    summary, title and time similarity. The best-scoring pair is recorded,
    then every remaining pair whose target equals the matched target or whose
    similar entry equals the matched similar entry is discarded; this repeats
    until no pairs are left. A separator line is printed before returning.

    Returns:
        0 when either side yields no positions; otherwise a dict mapping every
        target title to (similar title, score), or to -1 if it was not paired.
    """
    target_positions, similar_positions = await asyncio.gather(
        get_position_info(target),
        get_position_info(similar),
    )

    if not (target_positions and similar_positions):
        return 0

    # -1 marks a target position that has not been paired.
    matches = {entry['position_title']: -1 for entry in target_positions}

    scored_pairs = []
    for t_pos in target_positions:
        for s_pos in similar_positions:
            components = [
                calc_company_name_similarity(t_pos, s_pos),
                calc_location_similarity(t_pos, s_pos),
                calc_summary_similarity(t_pos, s_pos),
                calc_title_similarity(t_pos, s_pos),
                calc_pos_time_similarity(t_pos, s_pos),
            ]
            scored_pairs.append((t_pos, s_pos, np.sum(components)))

    while scored_pairs:
        best_target, best_similar, best_score = max(scored_pairs, key=lambda pair: pair[2])
        matches[best_target['position_title']] = (best_similar['position_title'], best_score)
        # Entries are compared by value, so duplicates of a matched entry are
        # discarded together with it.
        scored_pairs = [
            pair for pair in scored_pairs
            if pair[0] != best_target and pair[1] != best_similar
        ]

    print("****************************************** next target similar pair ***************************************************************")

    return matches

In [None]:
# Load the exported profiles: one JSON document per line (JSON Lines).
data = []
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
with open('export_profiles.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if line.strip():
            data.append(json.loads(line))

print(data[0])
len(data)

In [None]:
# await asyncio.gather(*[get_profile_embeddings(profile['target']) for profile in data])

In [None]:
async def process(data, max_concurrent = 50):
    """Score every (target, similar) profile pair with bounded concurrency.

    For each record in `data`, the target profile is paired with every value
    of the record's 'similar' mapping. Position pairs are scored with
    calculate_position_all_comb() and education pairs with
    calculate_similarity_weighted(); at most `max_concurrent` of these
    pair-level coroutines run at the same time.

    Returns:
        A list with all position results first, then all education results,
        each group in pair order.
    """
    limiter = asyncio.Semaphore(max_concurrent)

    async def sem_task(task):
        # The wrapped coroutine only starts once a semaphore slot is free.
        async with limiter:
            return await task

    def _pairs(section):
        return [
            (record['target'][section], candidate[section])
            for record in data
            for candidate in record['similar'].values()
        ]

    education_pairs = _pairs('education')
    position_pairs = _pairs('position')

    position_jobs = [sem_task(calculate_position_all_comb(target, similar)) for target, similar in position_pairs]
    education_jobs = [sem_task(calculate_similarity_weighted(target, similar)) for target, similar in education_pairs]

    return await asyncio.gather(*position_jobs, *education_jobs)

In [None]:
# flattened_data_education = [(i['target']['education'], j['education']) for i in data for j in i['similar'].values()]
# flattened_data_position = [(i['target']['position'], j['position']) for i in data for j in i['similar'].values()]
# await asyncio.gather(*[get_weighted_embedding(target) for target, _ in flattened_data_education])

In [None]:
async def process_with_rate_limit(data, batch_size = 3000//8, delay = 60):
    """Score every (target, similar) profile pair in fixed-size batches.

    For each record in `data`, the target profile is paired with every value
    of the record's 'similar' mapping. Pairs are processed `batch_size` at a
    time: all position pairs of a batch concurrently, then all education
    pairs of the same batch, followed by a pause of `delay` seconds before
    the next batch. No pause follows the final batch.

    Args:
        data: Records with a 'target' profile and a 'similar' mapping.
        batch_size: Number of pairs per batch.
        delay: Seconds to wait between consecutive batches.

    Returns:
        A flat list: for each batch, its position results followed by its
        education results.
    """
    flattened_data_education = [(i['target']['education'], j['education']) for i in data for j in i['similar'].values()]
    flattened_data_position = [(i['target']['position'], j['position']) for i in data for j in i['similar'].values()]
    total_batches = math.ceil(len(flattened_data_education) / batch_size)
    flattened_result = []
    for batch_num in range(total_batches):
        start = batch_num * batch_size
        end = min((batch_num + 1) * batch_size, len(flattened_data_education))
        batch_education = flattened_data_education[start:end]
        batch_position = flattened_data_position[start:end]
        flattened_result.extend(await asyncio.gather(*[calculate_position_all_comb(target, similar) for target, similar in batch_position]))
        flattened_result.extend(await asyncio.gather(*[calculate_similarity_weighted(target, similar) for target, similar in batch_education]))
        # Pause only between batches; waiting after the last one just delays the result.
        if batch_num < total_batches - 1:
            await asyncio.sleep(delay)
    return flattened_result

In [None]:
await process([d for d in data * 11])