In [97]:
# Importing the necesary libraries

import pandas as pd
import googlemaps
from itertools import product
import os
import json
from dotenv import load_dotenv
from sklearn.preprocessing import MinMaxScaler
import logging

# Setting up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

logging.info("Libraries imported successfully")

INFO: Libraries imported successfully


In [98]:
# Load environment variables
load_dotenv('../.env')
gmaps_api_key = os.getenv('GMAPS_API_KEY')

if not gmaps_api_key:
    logging.error("Google Maps API key not found. Please set it in the .env file.")
    raise ValueError("Google Maps API key not found")

# Initialize Google Maps client
gmaps = googlemaps.Client(key=gmaps_api_key)
logging.info("Initialized Google Maps")

INFO: API queries_quota: 60
INFO: Initialized Google Maps


In [99]:
# File paths
MENTOR_DATA = './data/mentor.csv'
MENTEE_DATA = './data/mentee.csv'

def load_data(file_path):
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        logging.error(f"Error loading {file_path}: {e}")
        return None

mentee_df = load_data(MENTEE_DATA)
mentor_df = load_data(MENTOR_DATA)

if mentee_df is None or mentor_df is None:
    raise ValueError("Error loading data files.")

logging.info("Files loaded successfully")
logging.info(f"Columns in mentee.csv:\n{mentee_df.columns}")
logging.info(f"Columns in mentor.csv:\n{mentor_df.columns}")

INFO: Files loaded successfully
INFO: Columns in mentee.csv:
Index(['Timestamp', 'First Name', 'Last Name', 'Phone Number', 'Email',
       'Year of Birth', 'Gender', 'Are you a person with Disability?',
       'County of Residence',
       'Sub County of Residence (Sub Counties in Mombasa)',
       'Ward of Residence', 'Highest level of education completed',
       'Are you employed', 'Are you running a business',
       'What's your area of interest?',
       'If you selected other, please elaborate (If not, respond with N/A)',
       'Who referred you? (Your Case Manager/Mentor)',
       'What are your Strengths', 'Mode of Mentorship',
       'Preferred Gender of your mentor', 'What are your weaknesses',
       'What are your career Goals',
       'What skills would you like to be mentored?',
       'What are your financial goals', 'Upload your resume if available'],
      dtype='object')
INFO: Columns in mentor.csv:
Index(['Timestamp', 'First Name', 'Last Name', 'Gender', 'Year of 

In [100]:
# Reading data from the loaded files

mentee_df = pd.read_csv(MENTEE_DATA)
mentor_df = pd.read_csv(MENTOR_DATA)

mentees = mentee_df.columns
mentors = mentor_df.columns

print("Reading data successfull")

Reading data successfull


In [101]:
print("Columns in mentee.csv:\n", mentees)
print("\n\nColumns in mentor.csv:\n", mentors)

Columns in mentee.csv:
 Index(['Timestamp', 'First Name', 'Last Name', 'Phone Number', 'Email',
       'Year of Birth', 'Gender', 'Are you a person with Disability?',
       'County of Residence',
       'Sub County of Residence (Sub Counties in Mombasa)',
       'Ward of Residence', 'Highest level of education completed',
       'Are you employed', 'Are you running a business',
       'What's your area of interest?',
       'If you selected other, please elaborate (If not, respond with N/A)',
       'Who referred you? (Your Case Manager/Mentor)',
       'What are your Strengths', 'Mode of Mentorship',
       'Preferred Gender of your mentor', 'What are your weaknesses',
       'What are your career Goals',
       'What skills would you like to be mentored?',
       'What are your financial goals', 'Upload your resume if available'],
      dtype='object')


Columns in mentor.csv:
 Index(['Timestamp', 'First Name', 'Last Name', 'Gender', 'Year of Birth',
       'Are you a person with di

In [102]:
# Extract unique sub-counties and ensure they are strings
mentee_sub_counties = mentee_df['Sub County of Residence (Sub Counties in Mombasa)'].dropna().astype(str).unique()
mentor_sub_counties = mentor_df['Residence Sub-County'].dropna().astype(str).unique()

In [103]:
# Geocode sub-counties
def geocode_sub_county(sub_county):
    geocode_result = gmaps.geocode(sub_county + ', Mombasa, Kenya')
    if geocode_result:
        location = geocode_result[0]['geometry']['location']
        return (location['lat'], location['lng'])
    return None

In [104]:
# Extract unique sub-counties and ensure they are strings
mentee_sub_counties = mentee_df['Sub County of Residence (Sub Counties in Mombasa)'].dropna().astype(str).unique()
mentor_sub_counties = mentor_df['Residence Sub-County'].dropna().astype(str).unique()

def geocode_sub_county(sub_county):
    try:
        geocode_result = gmaps.geocode(f"{sub_county}, Mombasa, Kenya")
        if geocode_result:
            location = geocode_result[0]['geometry']['location']
            return (location['lat'], location['lng'])
    except Exception as e:
        logging.error(f"Geocoding error for {sub_county}: {e}")
    return None

mentee_coords = {sub_county: geocode_sub_county(sub_county) for sub_county in mentee_sub_counties}
mentor_coords = {sub_county: geocode_sub_county(sub_county) for sub_county in mentor_sub_counties}

In [105]:
# Check if the files already exist
csv_file_path = './data/distances.csv'
json_file_path = './data/distances.json'

if not os.path.exists(csv_file_path) or not os.path.exists(json_file_path):
    distances = []
    for (mentee_sub, mentee_coord), (mentor_sub, mentor_coord) in product(mentee_coords.items(), mentor_coords.items()):
        if mentee_coord and mentor_coord:
            try:
                distance_result = gmaps.distance_matrix(mentee_coord, mentor_coord, mode='driving')
                if distance_result['rows'][0]['elements'][0]['status'] == 'OK':
                    distance = distance_result['rows'][0]['elements'][0]['distance']['value']
                    distances.append({
                        'Mentee Sub-County': mentee_sub,
                        'Mentor Sub-County': mentor_sub,
                        'Distance (meters)': distance
                    })
            except Exception as e:
                logging.error(f"Distance matrix error for {mentee_coord} to {mentor_coord}: {e}")

    distances_df = pd.DataFrame(distances)
    distances_df.to_csv(csv_file_path, index=False)
    distances_df.to_json(json_file_path, orient='records', indent=4)
    logging.info("Completed getting distances between sub-counties")
else:
    logging.info("Distance files already exist.")

INFO: Distance files already exist.


In [106]:
# Load precomputed distances
with open(json_file_path, 'r') as f:
    distances = json.load(f)
    
    
# Convert distances to DataFrame
distances_df = pd.DataFrame(distances)

In [107]:
# Filter out distances greater than 15 km
max_distance_meters = 15000  # 15 km in meters
distances_df = distances_df[distances_df['Distance (meters)'] <= max_distance_meters]

# Normalize distance for match score calculation
distances_df['Distance Score'] = 1 - (distances_df['Distance (meters)'] / max_distance_meters)  # Closer distance gets higher score

In [108]:
# Define a function to calculate the match score
def calculate_match_score(mentee, mentor, distance_score):
    score = 0
    if mentee['Preferred Gender of your mentor'] == mentor['Gender']:
        score += 20
    if isinstance(mentee['What\'s your area of interest?'], str) and isinstance(mentor['Your area of expertise'], str):
        if mentee['What\'s your area of interest?'] in mentor['Your area of expertise']:
            score += 30
    
    mentee_strengths = mentee['What are your Strengths']
    mentor_strengths = mentor['What are your strengths']
    if isinstance(mentee_strengths, str) and isinstance(mentor_strengths, str):
        if any(strength in mentor_strengths for strength in mentee_strengths.split(',')):
            score += 20
    
    mentee_weaknesses = mentee['What are your weaknesses']
    mentor_weaknesses = mentor['What are your weaknesses']
    if isinstance(mentee_weaknesses, str) and isinstance(mentor_weaknesses, str):
        if any(weakness in mentor_weaknesses for weakness in mentee_weaknesses.split(',')):
            score += 10
    
    score += distance_score * 20  # Closer distance gets higher score
    return score

In [109]:
# Calculate match scores for all possible mentor-mentee pairs
all_matches = []

for mentee_index, mentee_row in mentee_df.iterrows():
    for mentor_index, mentor_row in mentor_df.iterrows():
        distance_row = distances_df[(distances_df['Mentee Sub-County'] == mentee_row['Sub County of Residence (Sub Counties in Mombasa)']) &
                                    (distances_df['Mentor Sub-County'] == mentor_row['Residence Sub-County'])]
        if not distance_row.empty:
            distance_score = distance_row['Distance Score'].values[0]
            match_score = calculate_match_score(mentee_row, mentor_row, distance_score)
            all_matches.append({
                'Mentee Index': mentee_index,
                'Mentor Index': mentor_index,
                'Mentee Name': f"{mentee_row['First Name']} {mentee_row['Last Name']}",
                'Mentor Name': f"{mentor_row['First Name']} {mentor_row['Last Name']}",
                'Match Score': match_score
            })

In [110]:
# Convert all matches to DataFrame and sort by Match Score
all_matches_df = pd.DataFrame(all_matches).sort_values(by='Match Score', ascending=False)

# Save the DataFrame to CSV and JSON files
all_matches_df.to_csv('./data/all_matches.csv', index=False)
all_matches_df.to_json('./data/all_matches.json', orient='records', indent=4)

In [111]:
# Display DataFrame
print(all_matches_df)

        Mentee Index  Mentor Index      Mentee Name     Mentor Name  \
261168          3354            75  Said Mwakatsumi   Lucas Mwicigi   
252509          3243            47   David Mwambewa  Susan  Otieno    
257221          3304            36    Lewis  Irungu    Rukia  Omar    
259466          3332            75       Asha  Jama   Lucas Mwicigi   
265982          3416            75  Charity  Mbiti    Lucas Mwicigi   
...              ...           ...              ...             ...   
34014            443            58         Ali Fauz  Walter Ochieng   
34031            443            75         Ali Fauz   Lucas Mwicigi   
34032            443            76         Ali Fauz     Rohan Mutua   
34075            444            40     Mohammed Ali      Hamisi Ali   
28031            367            49      Birya Fondo   Lucas Mwicigi   

        Match Score  
261168    76.386667  
252509    70.000000  
257221    70.000000  
259466    70.000000  
265982    70.000000  
...            