## Recommender System for Matching HealthCare Professionals with Jobs Using Cosine Similarity

### Importing the relevant packages

In [1]:
import pandas as pd
import numpy as np
import random #for generating random numbers

### Generating a synthetic data.
In this data, we will use syntheric data with 2000 professionals and 500 different jobs, using the random package

In [2]:
# Defining the parameters to randomize the synthetic data
num_professionals = 2000
num_jobs = 500

In [3]:
# Defining the variables 
skills = ['nursing', 'physician', 'radiology', 'pharmacy', 'lab']
locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia']
certifications = ['BLS', 'ACLS', 'PALS', 'CPR', 'NRP']
education = ['Associate', 'Bachelor', 'Master', 'Doctorate']
work_preferences = ['part-time', 'full-time', 'day shift', 'night shift']
skill_levels = ['entry-level', 'intermediate', 'advanced']
max_experience_years = 30

In [4]:
# Generating synnthetic data for the job healthcare professionals 
professionals_df = pd.DataFrame(columns=['id', 'skills', 'location', 'certification', 'education', 'skill_level', 'work_preference', 'experience' ])
for i in range(num_professionals):
    id = i + 1,
    skills = random.choice(skills),
    location = random.choice(locations),
    certification = random.sample(certifications, random.randint(1, len(certifications))),
    education = random.choice(education),
    work_preference = random.choice(work_preferences),
    experience = random.randint(1, max_experience_years),
    skill_level = random.choice(skill_levels),
    professionals_df.loc[i] = [id, skills, location, certification, education, skill_level, work_preference, experience]
professionals_df.sample(5)
    
    

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
368,"(369,)","(physician,)","(Chicago,)","([ACLS, CPR, PALS],)","(Master,)","(entry-level,)","(night shift,)","(19,)"
441,"(442,)","(physician,)","(Philadelphia,)","([ACLS],)","(Master,)","(intermediate,)","(part-time,)","(14,)"
1585,"(1586,)","(physician,)","(Philadelphia,)","([CPR, ACLS, PALS, NRP, BLS],)","(Master,)","(advanced,)","(night shift,)","(27,)"
28,"(29,)","(physician,)","(New York,)","([ACLS, CPR],)","(Master,)","(intermediate,)","(day shift,)","(10,)"
1563,"(1564,)","(physician,)","(New York,)","([CPR, BLS],)","(Master,)","(entry-level,)","(full-time,)","(5,)"


The dataset looks ugly with the commas and the parentheses. Let us create a function to remove them

In [5]:
def clean_tuple(t):
    return str(t).replace("(", "").replace(",", "").replace(")", "").replace("'", "").replace("[", "").replace("]", "")
professionals_df = professionals_df.applymap(clean_tuple)
professionals_df.sample(5)

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
847,848,physician,Houston,BLS ACLS CPR NRP,Master,entry-level,full-time,15
1872,1873,physician,Houston,NRP CPR BLS ACLS,Master,entry-level,part-time,16
1102,1103,physician,Philadelphia,PALS NRP CPR BLS ACLS,Master,intermediate,full-time,10
324,325,physician,Philadelphia,CPR ACLS BLS PALS NRP,Master,advanced,part-time,22
1741,1742,physician,Los Angeles,NRP CPR BLS PALS ACLS,Master,advanced,full-time,12


The dataset looks cleaner, so let us to the same for the jobs dataframe

In [6]:
jobs_df = pd.DataFrame(columns=['id', 'skills', 'location', 'certification_required', 'education_required', 'skill_level_required', 'work_preference', 'experience_required' ])
for i in range(num_jobs):
    id = i + 1,
    skills = random.choice(skills),
    location = random.choice(locations),
    certification = random.sample(certifications, random.randint(1, len(certifications))),
    education = random.choice(education),
    work_preference = random.choice(work_preferences),
    experience = random.randint(1, max_experience_years),
    skill_level = random.choice(skill_levels),
    jobs_df.loc[i] = [id, skills, location, certification, education, skill_level, work_preference, experience]
    
jobs_df = jobs_df.applymap(clean_tuple)
jobs_df.sample(5)
    

Unnamed: 0,id,skills,location,certification_required,education_required,skill_level_required,work_preference,experience_required
215,216,physician,Houston,PALS NRP ACLS CPR BLS,Master,advanced,full-time,16
167,168,physician,New York,NRP,Master,intermediate,night shift,19
221,222,physician,Houston,BLS PALS,Master,advanced,night shift,12
183,184,physician,Los Angeles,ACLS CPR NRP PALS,Master,advanced,full-time,3
319,320,physician,Houston,PALS ACLS BLS CPR NRP,Master,entry-level,night shift,9


In [7]:
jobs_df.shape, professionals_df.shape

((500, 8), (2000, 8))

In [8]:
jobs_df.to_csv('jobs.csv', index=False)
professionals_df.to_csv('professionals.csv', index=False)

### Exploratory Data Analysis
Now that we have two datasets, let us now proceed to explore it

In [9]:
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2000 non-null   object
 1   skills           2000 non-null   object
 2   location         2000 non-null   object
 3   certification    2000 non-null   object
 4   education        2000 non-null   object
 5   skill_level      2000 non-null   object
 6   work_preference  2000 non-null   object
 7   experience       2000 non-null   object
dtypes: object(8)
memory usage: 140.6+ KB


We need the id and the experience to be integers as seen in our dataframe


In [10]:
professionals_df.id = professionals_df.id.astype('int64')
professionals_df.experience = professionals_df.experience.astype('int64')
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2000 non-null   int64 
 1   skills           2000 non-null   object
 2   location         2000 non-null   object
 3   certification    2000 non-null   object
 4   education        2000 non-null   object
 5   skill_level      2000 non-null   object
 6   work_preference  2000 non-null   object
 7   experience       2000 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 140.6+ KB


Let us now investigate the data types of the jobs variables

In [11]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    object
 1   skills                  500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   work_preference         500 non-null    object
 7   experience_required     500 non-null    object
dtypes: object(8)
memory usage: 35.2+ KB


Similarly, we need the experience and the id to be integers 

In [12]:
jobs_df.id = jobs_df.id.astype('int64')
jobs_df.experience_required = jobs_df.experience_required.astype('int64')
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    int64 
 1   skills                  500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   work_preference         500 non-null    object
 7   experience_required     500 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 35.2+ KB


### Feature Engineering
We will perform similarity matching to match the healthcare professionals to job opportunities based on the similarity of their skills and experience

In [13]:
jobs_df.sample(4)

Unnamed: 0,id,skills,location,certification_required,education_required,skill_level_required,work_preference,experience_required
303,304,physician,New York,ACLS NRP BLS CPR PALS,Master,advanced,night shift,16
78,79,physician,New York,BLS PALS,Master,entry-level,day shift,14
66,67,physician,Philadelphia,ACLS NRP CPR PALS BLS,Master,advanced,full-time,29
378,379,physician,New York,ACLS,Master,entry-level,day shift,12


In [14]:
professionals_df.sample(4)

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
405,406,physician,Los Angeles,NRP,Master,advanced,night shift,22
1195,1196,physician,Chicago,ACLS BLS PALS CPR NRP,Master,entry-level,full-time,16
674,675,physician,New York,PALS ACLS BLS NRP,Master,advanced,part-time,28
422,423,physician,Los Angeles,PALS,Master,entry-level,day shift,18


We will match the two dataframes based on the skills variable

In [15]:
merged_df = pd.merge(professionals_df, jobs_df, on='skills', how='inner', suffixes=('_pro', '_job'))
merged_df.sample(4)

Unnamed: 0,id_pro,skills,location_pro,certification,education,skill_level,work_preference_pro,experience,id_job,location_job,certification_required,education_required,skill_level_required,work_preference_job,experience_required
556541,1114,physician,Houston,ACLS PALS BLS,Master,intermediate,part-time,20,42,Chicago,CPR NRP BLS ACLS PALS,Master,entry-level,day shift,27
784445,1569,physician,Philadelphia,ACLS,Master,advanced,day shift,20,446,Houston,CPR ACLS PALS NRP BLS,Master,entry-level,part-time,2
278878,558,physician,Chicago,NRP BLS ACLS PALS CPR,Master,intermediate,part-time,11,379,New York,ACLS,Master,entry-level,day shift,12
814737,1630,physician,Philadelphia,BLS CPR ACLS,Master,advanced,night shift,20,238,Los Angeles,PALS BLS NRP,Master,advanced,part-time,5


We have 15 different variables and a million rows

In [16]:
merged_df.shape, merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   id_pro                  1000000 non-null  int64 
 1   skills                  1000000 non-null  object
 2   location_pro            1000000 non-null  object
 3   certification           1000000 non-null  object
 4   education               1000000 non-null  object
 5   skill_level             1000000 non-null  object
 6   work_preference_pro     1000000 non-null  object
 7   experience              1000000 non-null  int64 
 8   id_job                  1000000 non-null  int64 
 9   location_job            1000000 non-null  object
 10  certification_required  1000000 non-null  object
 11  education_required      1000000 non-null  object
 12  skill_level_required    1000000 non-null  object
 13  work_preference_job     1000000 non-null  object
 14  experience_required

((1000000, 15), None)

### User-item Matrix
We will create a user-item matrix. But before then, we will have to perform some encoding operations on some of out variables since the user-item matrix is essentially a matrix of numerical values. 
In our case, we will perform one-hot encoding on columns like skill, location_pro, certification, education, skill_level, and work_preference_pro columns.

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# selecting the clumns to encode 
encode_cols = ['skills', 'location_pro', 'certification', 'education', 'skill_level', 'work_preference_pro']

# Creating a LabelEncoder object 
label = LabelEncoder()

# Encoding the categorical columns 
for col in encode_cols:
    merged_df[col] = label.fit_transform(merged_df[col])

# Previewing the merged DataFrame 
merged_df.sample(5)


Unnamed: 0,id_pro,skills,location_pro,certification,education,skill_level,work_preference_pro,experience,id_job,location_job,certification_required,education_required,skill_level_required,work_preference_job,experience_required
755011,1511,0,4,145,0,2,3,10,12,Houston,BLS CPR PALS ACLS,Master,intermediate,full-time,15
847383,1695,0,1,262,0,0,3,3,384,Chicago,ACLS BLS,Master,advanced,full-time,15
316515,634,0,0,64,0,1,0,14,16,Chicago,NRP ACLS BLS PALS CPR,Master,advanced,day shift,12
513849,1028,0,1,104,0,2,1,19,350,New York,BLS PALS,Master,intermediate,night shift,12
1199,3,0,1,178,0,0,0,27,200,Chicago,ACLS CPR PALS NRP BLS,Master,intermediate,part-time,22


In [19]:
# Summarizing the dataFrame 
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   id_pro                  1000000 non-null  int64 
 1   skills                  1000000 non-null  int64 
 2   location_pro            1000000 non-null  int64 
 3   certification           1000000 non-null  int64 
 4   education               1000000 non-null  int64 
 5   skill_level             1000000 non-null  int64 
 6   work_preference_pro     1000000 non-null  int64 
 7   experience              1000000 non-null  int64 
 8   id_job                  1000000 non-null  int64 
 9   location_job            1000000 non-null  object
 10  certification_required  1000000 non-null  object
 11  education_required      1000000 non-null  object
 12  skill_level_required    1000000 non-null  object
 13  work_preference_job     1000000 non-null  object
 14  experience_required

We can now build a user-item matrix, where each row represents a profession and each colum a job that we will use to input recommender systems like collaborative filtering and content-based filtering 

In [20]:
# Creating a user-item matrix
user_item_matrix = merged_df.pivot_table(index='id_pro', columns='id_job', values='experience_required', fill_value=0)
user_item_matrix.head(5)

id_job,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
id_pro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,17,9,30,23,22,7,17,2,3,1,...,17,24,1,22,25,2,18,24,20,1
2,17,9,30,23,22,7,17,2,3,1,...,17,24,1,22,25,2,18,24,20,1
3,17,9,30,23,22,7,17,2,3,1,...,17,24,1,22,25,2,18,24,20,1
4,17,9,30,23,22,7,17,2,3,1,...,17,24,1,22,25,2,18,24,20,1
5,17,9,30,23,22,7,17,2,3,1,...,17,24,1,22,25,2,18,24,20,1


Here, we created a pivot table with the id_pro as the rows, the id_job as the columns and the experience_required as the values. This basically represents the interactions between users (id_pro) and items (id_job) based on the experience_required value.

Next, we will use Cosine Similarity to compute the similarity between the rows of the user-item matrix. It will determine how similar users or items are to each other based on their preferences or attributes. We will then use pairwise distance to perform the cosine distance between the rows before comverting them to cosine similarities. 

In [21]:
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

# Computing the cosine similarity matrix 
cos_sim_matrix = cosine_similarity(user_item_matrix)

# Compiting the cosine similarity of the users based on their encoded features 
user_similarity = 1-pairwise_distances(user_item_matrix, metric='cosine')


We will then transporse our matrix, so that we can look for similar items instead of similar users. Instead of being a "user-item matrix", it is essentially a "item-user matrix". In our case, instead of finding similar professionals for a given job, we will find similar jobs for a given professional

In [22]:
# Transpose a user_item_matrix to get a item_user_matrix
item_user_matrix = user_item_matrix.T
item_user_matrix

id_pro,1,2,3,4,5,6,7,8,9,10,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
id_job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,17,17,17,17,17,17,17,17,17,17,...,17,17,17,17,17,17,17,17,17,17
2,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
3,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
4,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
5,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
497,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
498,24,24,24,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
499,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20


### Recommending jobs
Next, we will create a function that recommends top N jobs for a give user based on the collaborative filtering model. it will take 4 arguments: 
user_id
user_item_matrix
item_user_matrix
top_n 

In [23]:
def recommend_jobs(user_id, user_item_matrix, item_user_matrix, top_n=20):
    """Recommend top N jobs for a healthcare professional given the user ID based on the collaborative model """
    # Get row index for the user ID 
    user_index = user_item_matrix.index.get_loc(user_id)
    # Get user-item similarity scores for the user
    user_scores = user_item_matrix.iloc[user_index, :]
    # Sort the scores in ascending order 
    sorted_scores = user_scores.sort_values(ascending=False)
    # Get the top N job IDs 
    top_job_ids = sorted_scores.index[:top_n]
    # Convert the job IDs to integers 
    top_job_ids = [int(job_id) for job_id in top_job_ids]
    # Return the top job IDs 
    return top_job_ids

# Get a list of the recommended job IDs for users with ID 
recommended_job_ids = recommend_jobs(25, user_item_matrix, item_user_matrix, top_n=5)
print(recommended_job_ids)
    

[236, 3, 205, 370, 26]


We then have to create another function fo get the job titles from the IDs 

In [24]:
# Defining a function to get job titles from IDs 
def get_job_titles(job_ids):
    # Load the dataset 
    jobs_df = pd.read_csv('jobs.csv')    
    # Filter the dataset to only include the desired job IDs 
    recommended_jobs = jobs_df[jobs_df.id.isin(job_ids)]
    # Extract job titles from the filtered dataset 
    job_titles = recommended_jobs[['id', 'skills', 'location', 'certification_required',
       'education_required', 'skill_level_required', 'experience_required',
       'work_preference']].values.tolist()
    return job_titles
# Get job titles for the recommended job IDs 
recommended_job_titles = get_job_titles(job_ids=recommended_job_ids)
recommended_job_titles = pd.DataFrame(recommended_job_titles)
recommended_job_titles

Unnamed: 0,0,1,2,3,4,5,6,7
0,3,physician,Los Angeles,PALS NRP BLS,Master,intermediate,30,night shift
1,26,physician,Philadelphia,BLS CPR ACLS PALS,Master,advanced,30,night shift
2,205,physician,New York,CPR BLS ACLS PALS,Master,intermediate,30,night shift
3,236,physician,Houston,NRP CPR,Master,advanced,30,full-time
4,370,physician,New York,NRP PALS ACLS CPR,Master,intermediate,30,part-time


Next, we will create a simple program in flask to accept input from html form and push it into a SQL server databse 