In [115]:
import pandas as pd
import numpy as np

# Import Datas

In [63]:
mentee_availability = pd.read_csv("data_generate/mentee_availability.csv")
mentor_availability = pd.read_csv("data_generate/mentor_availability.csv")
mentee2mentor = pd.read_csv("data_generate/mentee2mentor.csv")

In [112]:
def generate_interest(num):
    # create a list of user ids
    user_ids = list(range(1, num+1))

    # create a dictionary to hold binary data for each variable
    data_dict = {}
    for i in range(7):
        data_dict[f"interest_{i+1}"] = np.random.randint(0, 2, num)

    # create a dataframe from the user ids and binary data dictionary
    df = pd.DataFrame({"id": user_ids, **data_dict})
    return df

# set the number of users
num_mentee = 100
num_mentor = 10

df_mentee = generate_interest(num_mentee)
df_mentor = generate_interest(num_mentor)

In [113]:
df_mentee.head()

Unnamed: 0,id,interest_1,interest_2,interest_3,interest_4,interest_5,interest_6,interest_7
0,1,1,1,0,1,0,1,0
1,2,1,0,0,0,1,1,1
2,3,0,1,0,1,1,1,0
3,4,1,1,0,0,0,0,0
4,5,1,1,1,0,1,1,1


In [114]:
df_mentor.head()

Unnamed: 0,id,interest_1,interest_2,interest_3,interest_4,interest_5,interest_6,interest_7
0,1,1,1,0,1,0,0,1
1,2,0,0,1,1,1,0,0
2,3,0,1,0,0,0,0,0
3,4,1,1,0,1,0,1,1
4,5,0,0,1,1,0,0,0


## Checking Data

In [64]:
mentee_availability.drop(columns=['Unnamed: 0'], inplace=True)
mentee_availability.head()

Unnamed: 0,User ID,Date,Day of Week,Start Hour,End Hour
0,1,2023-05-09,Tuesday,00:00:00,0 days 01:00:00
1,1,2023-05-09,Tuesday,01:00:00,0 days 02:00:00
2,1,2023-05-09,Tuesday,02:00:00,0 days 03:00:00
3,1,2023-05-09,Tuesday,03:00:00,0 days 04:00:00
4,1,2023-05-09,Tuesday,04:00:00,0 days 05:00:00


In [65]:
mentor_availability.drop(columns=['Unnamed: 0'], inplace=True)
mentor_availability.head()

Unnamed: 0,User ID,Date,Day of Week,Start Hour,End Hour
0,1,2023-05-09,Tuesday,00:00:00,0 days 01:00:00
1,1,2023-05-09,Tuesday,04:00:00,0 days 05:00:00
2,1,2023-05-09,Tuesday,06:00:00,0 days 07:00:00
3,1,2023-05-09,Tuesday,08:00:00,0 days 09:00:00
4,1,2023-05-09,Tuesday,12:00:00,0 days 13:00:00


In [66]:
mentee2mentor.drop(columns=['Unnamed: 0'], inplace=True)
mentee2mentor.head()

Unnamed: 0,user_id,mentor_id,similarity,rating
0,1,3,0.816497,3
1,1,1,0.408248,5
2,1,2,0.353553,5
3,1,5,0.316228,3
4,1,6,0.316228,2


# Filter mentor matching based on time availability

In [101]:
def mentor2mentee_time(list_menteeId):
    mentor2mentee_time = {}
    for menteeId in list_menteeId:
        mentee_id_curr = mentee_availability.loc[mentee_availability['User ID'] == menteeId,]
        
        matched = mentee_id_curr.merge(mentor_availability, on=['Date', 'Start Hour'], how='inner')
        list_matched_mentors = sorted(list(matched['User ID_y'].unique()))
        mentor2mentee_time[menteeId] = list_matched_mentors

    return mentor2mentee_time

In [106]:
list_menteeId = np.arange(1, 101)
mentor2mentee_dict = mentor2mentee_time(list_menteeId)
mentor2mentee_df = pd.DataFrame([(k, v) for k, values in mentor2mentee_dict.items() for v in values], columns=['user_id', 'mentor_id'])

In [107]:
mentor2mentee_df

Unnamed: 0,user_id,mentor_id
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
995,100,6
996,100,7
997,100,8
998,100,9


# Measure similiarity between mentee and mentor based on interest

In [105]:
def cosine_similarity(mentee, mentor):
    vec1 = mentee
    vec2 = mentor
    cosine = np.dot(vec1,vec2)/((np.dot(vec1,vec1)*np.dot(vec2,vec2))**0.5) 
    return cosine

In [121]:
def calculate_similarity(mentor2mentee_dict):
    sim_mentee = {}
    con = []
    var = [f"interest_{i+1}" for i in range(7)]
    menteeId = list(mentor2mentee_dict.keys())
    for mentee in menteeId:
        mentorId = mentor2mentee_dict[mentee]
        mentee_interest = df_mentee.loc[df_mentee.id==mentee,var].values.reshape(1,-1)
        mentee_interest = np.squeeze(np.asarray(mentee_interest))
        for mentor in mentorId:
            mentor_interest = df_mentor.loc[df_mentor.id==mentor,var].values.reshape(1,-1)
            mentor_interest = np.squeeze(np.asarray(mentor_interest))
            sim = cosine_similarity(mentee_interest, mentor_interest)
            con.append(sim)
        
        sim_mentee[mentee] = con
        con = []

    return sim_mentee

In [122]:
sim_mentee = calculate_similarity(mentor2mentee_dict)

In [123]:
sim_mentee

{1: [0.75,
  0.2886751345948129,
  0.5,
  0.8944271909999159,
  0.35355339059327373,
  0.35355339059327373,
  0.0,
  0.2886751345948129,
  0.5,
  0.8164965809277261],
 2: [0.5,
  0.2886751345948129,
  0.0,
  0.6708203932499369,
  0.0,
  0.7071067811865475,
  0.5773502691896258,
  0.2886751345948129,
  0.75,
  0.8164965809277261],
 3: [0.5,
  0.5773502691896258,
  0.5,
  0.6708203932499369,
  0.35355339059327373,
  0.35355339059327373,
  0.2886751345948129,
  0.2886751345948129,
  0.25,
  0.8164965809277261],
 4: [0.7071067811865475,
  0.0,
  0.7071067811865475,
  0.6324555320336759,
  0.0,
  0.0,
  0.0,
  0.0,
  0.35355339059327373,
  0.5773502691896258],
 5: [0.6123724356957946,
  0.47140452079103173,
  0.4082482904638631,
  0.7302967433402214,
  0.2886751345948129,
  0.5773502691896258,
  0.7071067811865476,
  0.47140452079103173,
  0.8164965809277261,
  0.8333333333333334],
 6: [0.5,
  0.0,
  0.0,
  0.4472135954999579,
  0.0,
  0.7071067811865475,
  0.5773502691896258,
  0.577350269

# Build DataFrame with matched mentor-mentees interest and its cosine similarity

In [128]:
def build_df(df):
    df['similarity'] = df['user_id'].copy()
    for row in range(df.shape[0]):
        #print(row)
        mentee = df.iloc[row,0]
        mentor = df.iloc[row,1]
        #print(user,mentor)
        df['similarity'][row] = sim_mentee[mentee][mentor-1]

    df_ = df.copy()
    df_['rating'] = df_['similarity'].copy()
    df_['rating'] = df_.rating.apply(lambda row: np.random.randint(1, 6))
    df_sorted_rating = df_.sort_values(by=['user_id','similarity','rating'],ascending=[True,False,False]).copy()
    df_sorted_rating.fillna(0,inplace=True)
    
    return df_sorted_rating

In [129]:
mentor2mentee_df = build_df(mentor2mentee_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'][row] = sim_mentee[mentee][mentor-1]


In [130]:
mentor2mentee_df

Unnamed: 0,user_id,mentor_id,similarity,rating
3,1,4,0.894427,4
9,1,10,0.816497,1
0,1,1,0.750000,4
8,1,9,0.500000,5
2,1,3,0.500000,1
...,...,...,...,...
994,100,5,0.632456,3
993,100,4,0.600000,4
998,100,9,0.447214,4
992,100,3,0.447214,1
