In [13]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
from glob import glob
from datetime import date


In [14]:
model = SentenceTransformer("/Users/k33988/Downloads/all-MiniLM-L6-v2")

In [15]:

def load_excel_file(file):
    return pd.read_excel(file)

def load_previous_matches(csv_files_path):
    all_files = glob(os.path.join(csv_files_path, "*.csv"))
    filtered_files = [f for f in all_files if not os.path.basename(f).startswith("deduplicated")]

    data_frames = pd.concat((pd.read_csv(f) for f in filtered_files))
    
    return data_frames
    
def load_matches(csv_files_path):

    """
    Loads and processes match data from output CSV file currently from 2023
    
    Args: 
        csv_file_path(str): The path to the directory containing the CSV files. 
    
    Returns: 
        dict: A dictionary where the key are email addresses 
        and the values are sets of email addresses that the key has been matched with
    """
    matches_df = load_previous_matches(csv_files_path)
    
    
    #Initiates and Iterates through the rows of the combined DataFrame
    previous_matches = {}
    for index, row in matches_df.iterrows():
        person = row['Email Address']
        matched_with = row['Matched Email Address']

        if person not in previous_matches: 
            previous_matches[person] = set()
        if matched_with not in previous_matches:
            previous_matches[matched_with] = set()

        previous_matches[person].add(matched_with)
        previous_matches[matched_with].add(person)
        
    #converts emails to all lowercase to be detected as the same email (there were some that were capitalized)
    previous_matches = {key.lower():{email.lower() for email in value} for key, value in previous_matches.items()}
    return previous_matches



def generate_pool(data,freq_column,start_date):
    today = date.today()
    #number of months passed
    curr_round_num = (today.year - start_date.year)* 12 + (today.month - start_date.month)
    mapping = {'Monthly':1,'Bi-monthly':2,'Quarterly':3}

    data_pool = data.copy()
    data_pool[freq_column + '_std'] = data_pool[freq_column].map(mapping)
    #includes or excludes people
    data_pool['pool_fl'] = data_pool[freq_column + '_std'].apply(lambda x: 1 if curr_round_num % x ==0 else 0)
    data_pool = data_pool[data_pool['pool_fl'] == 1]

    return data_pool

def get_embedding(text):
    return model.encode(text)

In [16]:

input_file = './input/2024-07-01-profiles.xlsx'
previous_pairs_file = './output'
#managers_file = '/Users/k33988/Documents/summer-coffee-hour/exclude-managers/manager-email.xlsx'

data = load_excel_file(input_file)
#managers = load_excel_file(managers_file)
previous_matches = load_matches(previous_pairs_file)



attribute = ['Professional Interests', 'Hobbies', 'Topics you would like to learn more about']
start_date = date(2024, 7, 1)


data_pool = generate_pool(data, 'Match Frequency', start_date)

In [17]:
PI = data_pool['Professional Interests'].to_list()
PI_Embeddings = get_embedding(PI)
PI_Similarity = cosine_similarity(PI_Embeddings)



In [18]:
H = data_pool['Hobbies'].to_list()
Hobbies_Embed = get_embedding(H)
Hobbies_Sim = cosine_similarity(Hobbies_Embed)

In [19]:
Topics = data_pool['Topics you would like to learn more about'].to_list()
Topics_Embed = get_embedding(Topics)
Topics_Sim = cosine_similarity(Topics_Embed)

In [20]:
np.asanyarray([PI_Similarity, Hobbies_Sim, Topics_Sim]).max(axis=0)

array([[1.0000004 , 0.50442284, 0.4820521 , ..., 0.4904822 , 0.6038989 ,
        0.7264508 ],
       [0.50442284, 1.0000004 , 0.33953324, ..., 0.44313973, 0.58423495,
        0.45557576],
       [0.4820521 , 0.33953324, 1.0000002 , ..., 0.37820992, 0.4147362 ,
        0.6447514 ],
       ...,
       [0.4904822 , 0.44313973, 0.37820992, ..., 1.0000001 , 0.3958416 ,
        0.38216138],
       [0.6038989 , 0.58423495, 0.4147362 , ..., 0.3958416 , 1.0000002 ,
        0.51787233],
       [0.7264508 , 0.45557576, 0.6447514 , ..., 0.38216138, 0.51787233,
        1.0000001 ]], dtype=float32)

In [62]:
#calculate sim matrix function
similarities = {}
sim_matrix = []
for x in attribute:
    texts = data_pool[x].to_list()
    embeddings = get_embedding(texts)
    similarity_matrix = cosine_similarity(embeddings)
    sim_matrix.append(similarity_matrix)
    
sim_matrix_3d= np.stack(sim_matrix, axis = 0)

for i in range(sim_matrix_3d.shape[0]):
    np.fill_diagonal(sim_matrix_3d[i], 0)
    

    #return similarities
    #calculate_similarity(data_pool, attributes)
print(sim_matrix)
print(similarities)

[array([[1.0000004 , 0.44062972, 0.14472461, ..., 0.4904822 , 0.5437184 ,
        0.22060227],
       [0.44062972, 1.0000004 , 0.23613757, ..., 0.44313973, 0.48146343,
        0.23805241],
       [0.14472461, 0.23613757, 1.0000001 , ..., 0.24837118, 0.22481975,
        0.15653038],
       ...,
       [0.4904822 , 0.44313973, 0.24837118, ..., 0.99999976, 0.2673173 ,
        0.08114752],
       [0.5437184 , 0.48146343, 0.22481975, ..., 0.2673173 , 1.0000002 ,
        0.28564093],
       [0.22060227, 0.23805241, 0.15653038, ..., 0.08114752, 0.28564093,
        1.0000001 ]], dtype=float32), array([[1.        , 0.50442284, 0.4820521 , ..., 0.4257974 , 0.6038989 ,
        0.7264508 ],
       [0.50442284, 1.0000001 , 0.33953324, ..., 0.29081073, 0.58423495,
        0.45557576],
       [0.4820521 , 0.33953324, 1.        , ..., 0.37820992, 0.320565  ,
        0.6447514 ],
       ...,
       [0.4257974 , 0.29081073, 0.37820992, ..., 1.        , 0.3958416 ,
        0.38216138],
       [0.6038989 

In [64]:
#calculate sim function with sim_matrix as input along with employees, attribute data_pool

all_people = data_pool.copy()
all_people = all_people.fillna('')
all_people = all_people[all_people['Active Flag'].str.lower() == 'active']

pairs = []
already_matched = set(all_people['Email Address'])
employee = all_people['Email Address'].to_list()
employee_email = employee['Email Address'].lower()
previous_matched_people = previous_matches[employee['Email Address'].lower() ]
unmatched_people = already_matched - previous_matched_people
unmatched_people = all_people[all_people['Email Address'].isin(unmatched_people)]


    #matched_text[x]

# aggregate_similarity = np.asanyarray(list(similarities.values())).max(axis=0)
# similarities['Aggregate'] = aggregate_similarity

#best_match stays the same?

TypeError: list indices must be integers or slices, not str

In [63]:
#Index of first employee to find match for
employee_idx = all_people.index[all_people['Email Address'].str.lower() == employee[0]]

#Extract the row from sim_matrix corresponding to this employee
employee_similarities = sim_matrix_3d[:,employee_idx, :]
aggregate_similarity = np.sum(employee_similarities, axis=0)

#index with highest score
best_candidate_idx = np.argmax(aggregate_similarity)
best_candidate_email = all_people.iloc[best_candidate_idx]['Email Address']



print(employee_email)
print(best_candidate_email)


brent.kostkowski@finra.org
shawn.murray@finra.org


In [27]:
print(idx2)

70
