In [2]:
import pandas as pd

In [56]:
# mentors
mentors_df = pd.read_csv('../data/raw/List of Mentors.csv')

In [57]:
# mentees
mentees_df = pd.read_csv('../data/raw/List of Mentees.csv')

In [58]:
# Created dictionaries for every row
def create_sentences(df):

    # Initialize an empty list to store dictionaries
    dict_list = []

    # Iterate over each row of the DataFrame
    for index, row in df.iterrows():
        # Convert the row to a dictionary and append to the list
        row_dict = dict(row)
        dict_list.append(row_dict)
    
    return dict_list

In [59]:
# generate sentence for mentors
mentor_dataset = create_sentences(mentors_df)

In [60]:
# generate sentences for mentees
mentee_dataset = create_sentences(mentees_df)

## Model

In [61]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [63]:
# Encode input text and get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
    return embeddings

In [64]:
# Calculate cosine similarity between two vectors
def cosine_similarity_vec(vec1, vec2):
    return cosine_similarity(vec1.detach().numpy(), vec2.detach().numpy())

In [71]:
# Define weights for criteria
weights = {
    'Location': 0.6,
    
}

In [72]:
# get 1 mentor to test with
mentor = mentor_dataset[0]
mentor

{'Name': 'Maria Perez',
 'Summary': 'Structural engineer with over a decade of experience, specializing in space systems. My expertise revolves around ensuring the safety and integrity of these intricate structures, making me a crucial contributor to the world of space exploration.',
 'Location': 'Portland, OR'}

In [73]:
input_embedding = get_bert_embedding(mentor['Summary'])

In [74]:
adjusted_similarities = {}
for item in mentee_dataset:
    text = item['Summary']  # Extract text from the dictionary
    other_embedding = get_bert_embedding(text)
    similarity_score = cosine_similarity_vec(input_embedding, other_embedding)

    # Apply adjustments based on criteria
    adjusted_score = similarity_score
    for criterion, weight in weights.items():
        if criterion in mentor and mentor[criterion] == item.get(criterion):
            adjusted_score += weight * .1  # Adjust by a factor based on criterion importance

    adjusted_similarities[text] = adjusted_score  # Store similarity scores by text


In [75]:
# Get top N recommendations based on adjusted similarity scores
top_recommendations = sorted(adjusted_similarities.items(), key=lambda x: x[1], reverse=True)[:3]
print("Top recommendations:", top_recommendations)

Top recommendations: [("I'm Juniper Berry, an entry-level mechanical engineer hailing from West Virginia. My passion lies in space systems, and I'm eager to embark on a career dedicated to exploring the cosmos. With a strong educational background and a deep fascination for all things related to space, I'm determined to contribute to the exciting field of aerospace engineering and push the boundaries of human exploration.", array([[0.93900716]], dtype=float32)), ("I'm Francisco Clearwater, originally from Denver, Colorado. My academic background is in civil engineering, but my interests extend beyond the realm of engineering. I'm deeply intrigued by business management and its intricate dynamics. My journey is a fusion of analytical thinking and strategic vision, as I aim to bridge the gap between engineering expertise and effective business leadership.", array([[0.86439013]], dtype=float32)), ("I'm Marta Diaz, currently pursuing a cybersecurity program at Harvard University. My academ