## Recommender System for Matching HealthCare Professionals with Jobs Using Cosine Similarity

### Importing the relevant packages

In [2]:
import pandas as pd
import numpy as np
import random #for generating random numbers

### Generating a synthetic data.
In this data, we will use syntheric data with 2000 professionals and 500 different jobs, using the random package

In [3]:
# Defining the parameters to randomize the synthetic data
num_professionals = 2000
num_jobs = 500

In [4]:
# Defining the variables 
skills = ['nursing', 'physician', 'radiology', 'pharmacy', 'lab']
locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Philadelphia']
certifications = ['BLS', 'ACLS', 'PALS', 'CPR', 'NRP']
education = ['Associate', 'Bachelor', 'Master', 'Doctorate']
work_preferences = ['part-time', 'full-time', 'day shift', 'night shift']
skill_levels = ['entry-level', 'intermediate', 'advanced']
max_experience_years = 30

In [5]:
# Generating synnthetic data for the job healthcare professionals 
professionals_df = pd.DataFrame(columns=['id', 'skills', 'location', 'certification', 'education', 'skill_level', 'work_preference', 'experience' ])
for i in range(num_professionals):
    id = i + 1,
    skills = random.choice(skills),
    location = random.choice(locations),
    certification = random.sample(certifications, random.randint(1, len(certifications))),
    education = random.choice(education),
    work_preference = random.choice(work_preferences),
    experience = random.randint(1, max_experience_years),
    skill_level = random.choice(skill_levels),
    professionals_df.loc[i] = [id, skills, location, certification, education, skill_level, work_preference, experience]
professionals_df.sample(5)
    
    

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
554,"(555,)","(physician,)","(New York,)","([BLS, ACLS, PALS],)","(Master,)","(entry-level,)","(part-time,)","(3,)"
1026,"(1027,)","(physician,)","(Chicago,)","([CPR, BLS, ACLS, PALS],)","(Master,)","(intermediate,)","(full-time,)","(8,)"
947,"(948,)","(physician,)","(Philadelphia,)","([NRP, CPR, BLS, PALS],)","(Master,)","(entry-level,)","(part-time,)","(18,)"
1041,"(1042,)","(physician,)","(New York,)","([PALS, ACLS, BLS],)","(Master,)","(entry-level,)","(part-time,)","(25,)"
123,"(124,)","(physician,)","(Los Angeles,)","([CPR, PALS],)","(Master,)","(intermediate,)","(full-time,)","(4,)"


The dataset looks ugly with the commas and the parentheses. Let us create a function to remove them

In [6]:
def clean_tuple(t):
    return str(t).replace("(", "").replace(",", "").replace(")", "").replace("'", "").replace("[", "").replace("]", "")
professionals_df = professionals_df.applymap(clean_tuple)
professionals_df.sample(5)

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
1630,1631,physician,Chicago,PALS NRP BLS CPR ACLS,Master,intermediate,part-time,1
1685,1686,physician,Chicago,BLS CPR,Master,entry-level,night shift,3
851,852,physician,New York,CPR BLS,Master,advanced,part-time,26
973,974,physician,Los Angeles,CPR NRP,Master,advanced,part-time,4
396,397,physician,Chicago,PALS NRP,Master,intermediate,night shift,4


The dataset looks cleaner, so let us to the same for the jobs dataframe

In [7]:
jobs_df = pd.DataFrame(columns=['id', 'skills', 'location', 'certification_required', 'education_required', 'skill_level_required', 'work_preference', 'experience_required' ])
for i in range(num_jobs):
    id = i + 1,
    skills = random.choice(skills),
    location = random.choice(locations),
    certification = random.sample(certifications, random.randint(1, len(certifications))),
    education = random.choice(education),
    work_preference = random.choice(work_preferences),
    experience = random.randint(1, max_experience_years),
    skill_level = random.choice(skill_levels),
    jobs_df.loc[i] = [id, skills, location, certification, education, skill_level, work_preference, experience]
    
jobs_df = jobs_df.applymap(clean_tuple)
jobs_df.sample(5)
    

Unnamed: 0,id,skills,location,certification_required,education_required,skill_level_required,work_preference,experience_required
155,156,physician,Chicago,CPR PALS,Master,advanced,night shift,3
136,137,physician,New York,ACLS CPR,Master,intermediate,full-time,28
87,88,physician,New York,NRP,Master,intermediate,night shift,7
210,211,physician,New York,CPR NRP ACLS BLS PALS,Master,advanced,part-time,4
116,117,physician,Los Angeles,ACLS NRP PALS CPR BLS,Master,advanced,part-time,28


In [8]:
jobs_df.shape, professionals_df.shape

((500, 8), (2000, 8))

In [9]:
jobs_df.to_csv('jobs.csv', index=False)
professionals_df.to_csv('professionals.csv', index=False)

### Exploratory Data Analysis
Now that we have two datasets, let us now proceed to explore it

In [10]:
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2000 non-null   object
 1   skills           2000 non-null   object
 2   location         2000 non-null   object
 3   certification    2000 non-null   object
 4   education        2000 non-null   object
 5   skill_level      2000 non-null   object
 6   work_preference  2000 non-null   object
 7   experience       2000 non-null   object
dtypes: object(8)
memory usage: 140.6+ KB


We need the id and the experience to be integers as seen in our dataframe


In [11]:
professionals_df.id = professionals_df.id.astype('int64')
professionals_df.experience = professionals_df.experience.astype('int64')
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2000 non-null   int64 
 1   skills           2000 non-null   object
 2   location         2000 non-null   object
 3   certification    2000 non-null   object
 4   education        2000 non-null   object
 5   skill_level      2000 non-null   object
 6   work_preference  2000 non-null   object
 7   experience       2000 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 140.6+ KB


Let us now investigate the data types of the jobs variables

In [12]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    object
 1   skills                  500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   work_preference         500 non-null    object
 7   experience_required     500 non-null    object
dtypes: object(8)
memory usage: 35.2+ KB


Similarly, we need the experience and the id to be integers 

In [13]:
jobs_df.id = jobs_df.id.astype('int64')
jobs_df.experience_required = jobs_df.experience_required.astype('int64')
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    int64 
 1   skills                  500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   work_preference         500 non-null    object
 7   experience_required     500 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 35.2+ KB


### Feature Engineering
We will perform similarity matching to match the healthcare professionals to job opportunities based on the similarity of their skills and experience

In [14]:
jobs_df.sample(4)

Unnamed: 0,id,skills,location,certification_required,education_required,skill_level_required,work_preference,experience_required
254,255,physician,New York,BLS NRP CPR ACLS PALS,Master,entry-level,full-time,24
59,60,physician,Los Angeles,ACLS PALS BLS CPR NRP,Master,advanced,day shift,18
303,304,physician,Chicago,NRP PALS ACLS CPR BLS,Master,entry-level,part-time,4
116,117,physician,Los Angeles,ACLS NRP PALS CPR BLS,Master,advanced,part-time,28


In [15]:
professionals_df.sample(4)

Unnamed: 0,id,skills,location,certification,education,skill_level,work_preference,experience
924,925,physician,Chicago,CPR BLS PALS ACLS NRP,Master,entry-level,full-time,30
1804,1805,physician,Los Angeles,ACLS CPR NRP PALS BLS,Master,advanced,day shift,22
533,534,physician,Los Angeles,BLS NRP PALS CPR,Master,entry-level,night shift,14
322,323,physician,Philadelphia,ACLS BLS CPR,Master,advanced,part-time,2


We will match the two dataframes based on the skills variable

In [16]:
merged_df = pd.merge(professionals_df, jobs_df, on='skills', how='inner', suffixes=('_pro', '_job'))
merged_df.sample(4)

Unnamed: 0,id_pro,skills,location_pro,certification,education,skill_level,work_preference_pro,experience,id_job,location_job,certification_required,education_required,skill_level_required,work_preference_job,experience_required
106811,214,physician,New York,BLS ACLS CPR,Master,intermediate,full-time,27,312,Chicago,NRP PALS,Master,entry-level,part-time,23
393274,787,physician,Los Angeles,PALS,Master,advanced,full-time,14,275,Philadelphia,ACLS CPR,Master,entry-level,day shift,19
757551,1516,physician,Chicago,ACLS BLS,Master,intermediate,full-time,25,52,Philadelphia,PALS CPR ACLS NRP,Master,entry-level,full-time,18
470264,941,physician,Chicago,NRP BLS CPR ACLS,Master,entry-level,night shift,17,265,Houston,ACLS NRP,Master,intermediate,part-time,18


We have 15 different variables and a million rows

In [17]:
merged_df.shape, merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   id_pro                  1000000 non-null  int64 
 1   skills                  1000000 non-null  object
 2   location_pro            1000000 non-null  object
 3   certification           1000000 non-null  object
 4   education               1000000 non-null  object
 5   skill_level             1000000 non-null  object
 6   work_preference_pro     1000000 non-null  object
 7   experience              1000000 non-null  int64 
 8   id_job                  1000000 non-null  int64 
 9   location_job            1000000 non-null  object
 10  certification_required  1000000 non-null  object
 11  education_required      1000000 non-null  object
 12  skill_level_required    1000000 non-null  object
 13  work_preference_job     1000000 non-null  object
 14  experience_required

((1000000, 15), None)

### User-item Matrix
We will create a user-item matrix. But before then, we will have to perform some encoding operations on some of out variables since the user-item matrix is essentially a matrix of numerical values. 
In our case, we will perform one-hot encoding on columns like skill, location_pro, certification, education, skill_level, and work_preference_pro columns.

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
# selecting the clumns to encode 
encode_cols = ['skills', 'location_pro', 'certification', 'education', 'skill_level', 'work_preference_pro']

# Creating a LabelEncoder object 
label = LabelEncoder()

# Encoding the categorical columns 
for col in encode_cols:
    merged_df[col] = label.fit_transform(merged_df[col])

# Previewing the merged DataFrame 
merged_df.sample(5)


Unnamed: 0,id_pro,skills,location_pro,certification,education,skill_level,work_preference_pro,experience,id_job,location_job,certification_required,education_required,skill_level_required,work_preference_job,experience_required
243358,487,0,1,187,0,2,2,29,359,Los Angeles,NRP BLS,Master,advanced,night shift,8
774103,1549,0,4,287,0,0,1,7,104,Chicago,PALS NRP,Master,intermediate,full-time,13
859146,1719,0,3,102,0,0,0,16,147,Los Angeles,CPR BLS ACLS PALS,Master,entry-level,part-time,2
16078,33,0,3,206,0,0,1,17,79,Philadelphia,NRP,Master,entry-level,day shift,28
776782,1554,0,2,116,0,1,3,8,283,Houston,ACLS PALS BLS,Master,intermediate,full-time,25


In [20]:
# Summarizing the dataFrame 
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   id_pro                  1000000 non-null  int64 
 1   skills                  1000000 non-null  int64 
 2   location_pro            1000000 non-null  int64 
 3   certification           1000000 non-null  int64 
 4   education               1000000 non-null  int64 
 5   skill_level             1000000 non-null  int64 
 6   work_preference_pro     1000000 non-null  int64 
 7   experience              1000000 non-null  int64 
 8   id_job                  1000000 non-null  int64 
 9   location_job            1000000 non-null  object
 10  certification_required  1000000 non-null  object
 11  education_required      1000000 non-null  object
 12  skill_level_required    1000000 non-null  object
 13  work_preference_job     1000000 non-null  object
 14  experience_required

We can now build a user-item matrix, where each row represents a profession and each colum a job that we will use to input recommender systems like collaborative filtering and content-based filtering 

In [21]:
# Creating a user-item matrix
user_item_matrix = merged_df.pivot_table(index='id_pro', columns='id_job', values='experience_required', fill_value=0)
user_item_matrix.head(5)

id_job,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
id_pro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,19,28,11,15,24,15,21,29,29,...,8,10,23,18,29,15,9,29,10,5
2,2,19,28,11,15,24,15,21,29,29,...,8,10,23,18,29,15,9,29,10,5
3,2,19,28,11,15,24,15,21,29,29,...,8,10,23,18,29,15,9,29,10,5
4,2,19,28,11,15,24,15,21,29,29,...,8,10,23,18,29,15,9,29,10,5
5,2,19,28,11,15,24,15,21,29,29,...,8,10,23,18,29,15,9,29,10,5


Here, we created a pivot table with the id_pro as the rows, the id_job as the columns and the experience_required as the values. This basically represents the interactions between users (id_pro) and items (id_job) based on the experience_required value.

Next, we will use Cosine Similarity to compute the similarity between the rows of the user-item matrix. It will determine how similar users or items are to each other based on their preferences or attributes. We will then use pairwise distance to perform the cosine distance between the rows before comverting them to cosine similarities. 

In [23]:
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

# Computing the cosine similarity matrix 
cos_sim_matrix = cosine_similarity(user_item_matrix)

# Compiting the cosine similarity of the users based on their encoded features 
user_similarity = 1-pairwise_distances(user_item_matrix, metric='cosine')


We will then transporse our matrix, so that we can look for similar items instead of similar users. Instead of being a "user-item matrix", it is essentially a "item-user matrix". In our case, instead of fiding similar professionals for a given job, we will find similar jobs for a given professional

In [25]:
# Transpose a user_item_matrix to get a item_user_matrix
item_user_matrix = user_item_matrix.T
item_user_matrix

id_pro,1,2,3,4,5,6,7,8,9,10,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
id_job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,19,19,19,19,19,19,19,19,19,19,...,19,19,19,19,19,19,19,19,19,19
3,28,28,28,28,28,28,28,28,28,28,...,28,28,28,28,28,28,28,28,28,28
4,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
5,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
497,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
498,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
499,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
