In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load synthetic data
professionals_df = pd.read_csv('professionals.csv')
jobs_df = pd.read_csv('jobs.csv')

In [2]:
# Remove duplicates
professionals_df.drop_duplicates(inplace=True)
jobs_df.drop_duplicates(inplace=True)

In [3]:
professionals_df.head()

Unnamed: 0,id,skill,location,certification,education,skill_level,experience,work_preference
0,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time
1,2,physician,New York,"['ACLS', 'CPR', 'NRP']",r,entry-level,23,day shift
2,3,nursing,Houston,"['NRP', 'ACLS']",r,entry-level,11,full-time
3,4,pharmacy,Chicago,['ACLS'],r,advanced,17,day shift
4,5,pharmacy,Los Angeles,"['CPR', 'BLS']",r,intermediate,30,day shift


In [4]:
jobs_df.head()

Unnamed: 0,id,skill,location,certification_required,education_required,skill_level_required,experience_required,work_preference
0,1,pharmacy,New York,['PALS'],r,entry-level,6,day shift
1,2,pharmacy,Los Angeles,"['BLS', 'NRP', 'CPR']",r,entry-level,26,part-time
2,3,radiology,Houston,"['CPR', 'NRP', 'BLS']",r,intermediate,27,day shift
3,4,radiology,Houston,"['CPR', 'NRP']",r,intermediate,27,part-time
4,5,lab,Houston,['CPR'],r,advanced,3,part-time


In [5]:
jobs_df.columns

Index(['id', 'skill', 'location', 'certification_required',
       'education_required', 'skill_level_required', 'experience_required',
       'work_preference'],
      dtype='object')

In [6]:
# Handle missing values
professionals_df.fillna(value=np.nan, inplace=True)
jobs_df.fillna(value=np.nan, inplace=True)

In [7]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    int64 
 1   skill                   500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   experience_required     500 non-null    int64 
 7   work_preference         500 non-null    object
dtypes: int64(2), object(6)
memory usage: 35.2+ KB


In [8]:
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               1000 non-null   int64 
 1   skill            1000 non-null   object
 2   location         1000 non-null   object
 3   certification    1000 non-null   object
 4   education        1000 non-null   object
 5   skill_level      1000 non-null   object
 6   experience       1000 non-null   int64 
 7   work_preference  1000 non-null   object
dtypes: int64(2), object(6)
memory usage: 70.3+ KB


Here are some potential feature engineering ideas for matching healthcare professionals to job opportunities:

Composite Score: Create a composite score for each healthcare professional based on their skills and experience. This could be done by assigning weights to each skill and experience level, and then calculating a weighted sum.

Skill Clusters: Use unsupervised learning techniques, such as clustering, to group healthcare professionals based on their skills. This could help identify patterns and similarities between professionals, and aid in matching them to job opportunities.

Location Clusters: Use clustering algorithms to group job opportunities based on their location. This could help identify job opportunities that are in close proximity to each other and match healthcare professionals to multiple job opportunities.

Job-Professional Compatibility: Use supervised learning techniques, such as logistic regression, to create a model that predicts the compatibility of a healthcare professional with a job opportunity. This could be based on factors such as skills, experience, education, and work preferences.

Similarity Matching: Use similarity matching techniques, such as cosine similarity, to match healthcare professionals to job opportunities based on the similarity of their skills and experience.

In [9]:
professionals_df.columns

Index(['id', 'skill', 'location', 'certification', 'education', 'skill_level',
       'experience', 'work_preference'],
      dtype='object')

In [10]:
jobs_df.columns

Index(['id', 'skill', 'location', 'certification_required',
       'education_required', 'skill_level_required', 'experience_required',
       'work_preference'],
      dtype='object')

In [11]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      500 non-null    int64 
 1   skill                   500 non-null    object
 2   location                500 non-null    object
 3   certification_required  500 non-null    object
 4   education_required      500 non-null    object
 5   skill_level_required    500 non-null    object
 6   experience_required     500 non-null    int64 
 7   work_preference         500 non-null    object
dtypes: int64(2), object(6)
memory usage: 35.2+ KB


In [12]:
professionals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               1000 non-null   int64 
 1   skill            1000 non-null   object
 2   location         1000 non-null   object
 3   certification    1000 non-null   object
 4   education        1000 non-null   object
 5   skill_level      1000 non-null   object
 6   experience       1000 non-null   int64 
 7   work_preference  1000 non-null   object
dtypes: int64(2), object(6)
memory usage: 70.3+ KB


In [13]:
# Merge the two data frames on the skill column
merged_df = pd.merge(professionals_df, jobs_df, on='skill', how='inner', suffixes=('_pro', '_job'))

# Preview the merged data frame
merged_df.head()

Unnamed: 0,id_pro,skill,location_pro,certification,education,skill_level,experience,work_preference_pro,id_job,location_job,certification_required,education_required,skill_level_required,experience_required,work_preference_job
0,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time,1,New York,['PALS'],r,entry-level,6,day shift
1,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time,2,Los Angeles,"['BLS', 'NRP', 'CPR']",r,entry-level,26,part-time
2,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time,11,Los Angeles,"['BLS', 'NRP']",r,entry-level,0,night shift
3,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time,14,Los Angeles,['CPR'],r,advanced,28,day shift
4,1,pharmacy,Chicago,"['NRP', 'ACLS']",Master,intermediate,11,full-time,15,Los Angeles,['CPR'],r,advanced,19,day shift


In [14]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99690 entries, 0 to 99689
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id_pro                  99690 non-null  int64 
 1   skill                   99690 non-null  object
 2   location_pro            99690 non-null  object
 3   certification           99690 non-null  object
 4   education               99690 non-null  object
 5   skill_level             99690 non-null  object
 6   experience              99690 non-null  int64 
 7   work_preference_pro     99690 non-null  object
 8   id_job                  99690 non-null  int64 
 9   location_job            99690 non-null  object
 10  certification_required  99690 non-null  object
 11  education_required      99690 non-null  object
 12  skill_level_required    99690 non-null  object
 13  experience_required     99690 non-null  int64 
 14  work_preference_job     99690 non-null  object
dtypes:

In [15]:
from sklearn.preprocessing import LabelEncoder

# Columns to encode
cols_to_encode = ['skill', 'location_pro', 'certification', 'education', 'skill_level', 'work_preference_pro']

# Create a LabelEncoder object
le = LabelEncoder()

# Encode the categorical columns
for col in cols_to_encode:
    merged_df[col] = le.fit_transform(merged_df[col])


In [16]:
# Preview the merged data frame
merged_df.head()

Unnamed: 0,id_pro,skill,location_pro,certification,education,skill_level,experience,work_preference_pro,id_job,location_job,certification_required,education_required,skill_level_required,experience_required,work_preference_job
0,1,2,0,54,0,2,11,1,1,New York,['PALS'],r,entry-level,6,day shift
1,1,2,0,54,0,2,11,1,2,Los Angeles,"['BLS', 'NRP', 'CPR']",r,entry-level,26,part-time
2,1,2,0,54,0,2,11,1,11,Los Angeles,"['BLS', 'NRP']",r,entry-level,0,night shift
3,1,2,0,54,0,2,11,1,14,Los Angeles,['CPR'],r,advanced,28,day shift
4,1,2,0,54,0,2,11,1,15,Los Angeles,['CPR'],r,advanced,19,day shift


In [17]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99690 entries, 0 to 99689
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id_pro                  99690 non-null  int64 
 1   skill                   99690 non-null  int64 
 2   location_pro            99690 non-null  int64 
 3   certification           99690 non-null  int64 
 4   education               99690 non-null  int64 
 5   skill_level             99690 non-null  int64 
 6   experience              99690 non-null  int64 
 7   work_preference_pro     99690 non-null  int64 
 8   id_job                  99690 non-null  int64 
 9   location_job            99690 non-null  object
 10  certification_required  99690 non-null  object
 11  education_required      99690 non-null  object
 12  skill_level_required    99690 non-null  object
 13  experience_required     99690 non-null  int64 
 14  work_preference_job     99690 non-null  object
dtypes:

In [18]:
# Create a user-item matrix
user_item_matrix = merged_df.pivot_table(index="id_pro", columns="id_job", values="experience_required", fill_value=0)

# Preview the user-item matrix
print(user_item_matrix.head())


id_job  1    2    3    4    5    6    7    8    9    10   ...  491  492  493  \
id_pro                                                    ...                  
1         6   26    0    0    0    0    0    0    0    0  ...    0    0    0   
2         0    0    0    0    0    0   20   13    0    0  ...    0    0   17   
3         0    0    0    0    0   29    0    0    0    0  ...   15    1    0   
4         6   26    0    0    0    0    0    0    0    0  ...    0    0    0   
5         6   26    0    0    0    0    0    0    0    0  ...    0    0    0   

id_job  494  495  496  497  498  499  500  
id_pro                                     
1         0    0    0    0   27    0    0  
2         0    0    0    0    0    0    0  
3         0    0    0    0    0   12    0  
4         0    0    0    0   27    0    0  
5         0    0    0    0   27    0    0  

[5 rows x 500 columns]


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(user_item_matrix)

In [20]:
from sklearn.metrics.pairwise import pairwise_distances

# Compute the cosine similarity between the users based on their encoded features
user_similarity = 1 - pairwise_distances(user_item_matrix, metric='cosine')


In [21]:
# Transpose the user_item_matrix to create the item_user_matrix
item_user_matrix = user_item_matrix.T

In [22]:
item_user_matrix.head()

id_pro,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
id_job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6,0,0,6,6,6,6,0,6,0,...,0,0,0,0,0,0,0,0,6,6
2,26,0,0,26,26,26,26,0,26,0,...,0,0,0,0,0,0,0,0,26,26
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,27,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,27,0,0
5,0,0,0,0,0,0,0,3,0,0,...,0,3,0,0,0,0,0,0,0,0


This function recommends top N jobs for a given user ID based on the collaborative filtering model. It takes four arguments:

user_id: the ID of the user for whom the recommendations are to be made.
user_item_matrix: the user-item matrix which contains the interaction scores between users and items.
item_user_matrix: the transposed matrix of the user-item matrix.
top_n: the number of top jobs to be recommended.
The function first gets the row index for the given user ID in the user-item matrix. It then retrieves the row corresponding to the user ID and calculates the similarity scores between the user and each job. The scores are sorted in descending order and the top N job IDs are selected. Finally, the job IDs are converted to integers and returned as the output of the function.

In [23]:
def recommend_jobs_for_user(user_id, user_item_matrix, item_user_matrix, top_n=10):
    """Recommend top N jobs for a given user ID based on the collaborative filtering model"""
    # Get the row index for the user ID
    user_index = user_item_matrix.index.get_loc(user_id)
    # Get the user-item similarity scores for the user
    user_scores = user_item_matrix.iloc[user_index,:]
    # Sort the scores in descending order
    sorted_scores = user_scores.sort_values(ascending=False)
    # Get the top N job IDs
    top_job_ids = sorted_scores.index[:top_n]
    # Convert the job IDs to integers
    top_job_ids = [int(job_id) for job_id in top_job_ids]
    # Return the top job IDs
    return top_job_ids

In [24]:
# Get a list of recommended job IDs for user with ID
recommended_job_ids = recommend_jobs_for_user(25, user_item_matrix, item_user_matrix, top_n=5)
print(recommended_job_ids)


[428, 164, 165, 359, 6]


In [25]:
# Define a function to get job titles from IDs
def get_job_titles(job_ids):
    # Load the dataset
    jobs_df = pd.read_csv('jobs.csv')

    # Filter the dataset to only include the desired job IDs
    recommended_jobs = jobs_df[jobs_df['id'].isin(job_ids)]

    # Extract the job titles from the filtered dataset
    job_titles = recommended_jobs[['id', 'skill', 'location', 'certification_required',
       'education_required', 'skill_level_required', 'experience_required',
       'work_preference']].values.tolist()

    return job_titles


# Get the job titles for the recommended job IDs
recommended_job_titles = get_job_titles(job_ids=recommended_job_ids)
print(pd.DataFrame(recommended_job_titles))

     0        1             2                 3  4             5   6  \
0    6  nursing  Philadelphia          ['ACLS']  r      advanced  29   
1  164  nursing   Los Angeles  ['ACLS', 'PALS']  r      advanced  30   
2  165  nursing       Houston           ['BLS']  r   entry-level  30   
3  359  nursing      New York    ['NRP', 'CPR']  r  intermediate  30   
4  428  nursing       Chicago    ['CPR', 'BLS']  r      advanced  30   

           7  
0  full-time  
1  part-time  
2  day shift  
3  full-time  
4  day shift  
