# Job/Candidate Recommendation Model


## Preprocessing


In [1]:
import pandas as pd
import numpy as np
import json
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the prepared job postings and resumes with pandas' default dtypes
# and a default integer row index.
jobs, users = (
    pd.read_csv(csv_path)
    for csv_path in ('data_jobs_prepared.csv', 'data_resumes_prepared.csv')
)


In [3]:
# Second copy of each table, keyed by the string form of ``ID`` and without
# the cleaned-text column; later cells look rows up with ``.loc[str(id)]``.
jobs2 = (
    pd.read_csv('data_jobs_prepared.csv', dtype={'ID': str})
    .set_index('ID')
    .drop(columns='Description_clean')
)

users2 = (
    pd.read_csv('data_resumes_prepared.csv', dtype={'ID': str})
    .set_index('ID')
    .drop(columns='Skills_clean')
)

In [4]:
# Preview the first five job postings.
jobs.head(n=5)

Unnamed: 0,ID,Company,Position,URL,Location,Headquaters,Employees,Founded,Industry,Job_Description,Description_clean
0,1000,Visual BI Solutions Inc,Graduate Intern (Summer 2017) - SAP BI / Big D...,https://www.glassdoor.com/partner/jobListing.h...,"Plano, TX","Plano, TX",51 to 200 employees,2010,Information Technology,"Location: Plano, TX or Oklahoma City, OK Durat...",Location Plano TX or Oklahoma City OK Duration...
1,1001,Jobvertise,Digital Marketing Manager,https://www.glassdoor.com/partner/jobListing.h...,"Dallas, TX","Berlin, Germany",1 to 50 employees,2011,Unknown,The Digital Marketing Manager is the front lin...,The Digital Marketing Manager is the front lin...
2,1002,Santander Consumer USA,"Manager, Pricing Management Information Systems",https://www.glassdoor.com/partner/jobListing.h...,"Dallas, TX","Dallas, TX",5001 to 10000 employees,1995,Finance,Summary of Responsibilities:The Manager Pricin...,Summary of ResponsibilitiesThe Manager Pricing...
3,1003,Federal Reserve Bank of Dallas,Treasury Services Analyst Internship,https://www.glassdoor.com/partner/jobListing.h...,"Dallas, TX","Dallas, TX",1001 to 5000 employees,1914,Finance,ORGANIZATIONAL SUMMARY: As part of the natio...,ORGANIZATIONAL SUMMARY As part of the nation...
4,1004,Aviall,"Intern, Sales Analyst",https://www.glassdoor.com/partner/jobListing.h...,"Dallas, TX","Dallas, TX",1001 to 5000 employees,Boeing,Subsidiary or Business Segment,Aviall is the world's largest provider of new ...,Aviall is the worlds largest provider of new a...


In [5]:
# Raw skills text of the resume at row label 0.
users.loc[0, 'Skills']

'Programming language C, C++, Java  Oracle PeopleSoft  Internet Of Things  Machine Learning  Database Management System  Computer Networks  Operating System worked on Linux, Windows, Mac  Non  Technical Skills   Honest and HardWorking  Tolerant and Flexible to Different Situations  Polite and Calm  TeamPlayer'

In [6]:
# Lookup tables: ``ID`` value -> row label of the corresponding frame.
indices_job = pd.Series(data=jobs.index, index=jobs['ID'])
indices_candidate = pd.Series(data=users.index, index=users['ID'])

In [7]:
# Row label of the candidate whose ID is 60 (IDs and row labels differ).
indices_candidate.loc[60]

58

## Vectorizing


In [8]:
# Fit the vocabulary on the job descriptions and project the candidate skills
# into the same space so the two can be compared.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary an integer 0 would select; recent scikit-learn releases
# reject an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english', lowercase=True)
job_tfidf = tf.fit_transform(jobs['Description_clean'])
user_tfidf = tf.transform(users['Skills_clean'])

# tf.get_feature_names_out()
# One call yields the full (n_jobs, n_users) similarity matrix; it is then
# split into one (n_jobs, 1) column per candidate, the layout later cells use.
similarity_matrix = cosine_similarity(job_tfidf, user_tfidf)
cos_similarity_list = [similarity_matrix[:, [col]] for col in range(similarity_matrix.shape[1])]

len(cos_similarity_list)

153

In [9]:
# Positions into ``cos_similarity_list``. The sort key is a boolean ("does this
# entry contain any non-zero similarity?"), so entries with a match keep their
# original relative order ahead of entries without one; only the first 100
# positions are kept.
row_has_match = [bool(similarity.any()) for similarity in cos_similarity_list]
top = sorted(range(len(row_has_match)), key=row_has_match.__getitem__, reverse=True)[:100]

In [10]:
# Flatten every similarity array to a plain list holding the first column of
# each of its rows: all_scores_user[u][j] is the score stored in row j of
# cos_similarity_list[u].
# The loop variable is deliberately not called ``list``: binding that name at
# notebook level would hide the builtin for every later cell.
all_scores_user = [[row[0] for row in similarity] for similarity in cos_similarity_list]

## Job Recommendations for a candidate

In [11]:
def get_job_recommendation(top, df, scores):
    """Build a table of job recommendations for the row labels in ``top``.

    Parameters
    ----------
    top : iterable
        Row labels of ``df`` to include, in the order they should appear.
        The same values are used to index ``scores``.
    df : pandas.DataFrame
        Frame providing the ``ID`` and ``Position`` columns.
    scores : sequence
        Scores looked up as ``scores[i]`` for every ``i`` in ``top``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Position`` and ``Score``, one row per entry of
        ``top``, numbered from 0. Rows are not sorted; callers sort by
        ``Score`` themselves.
    """
    # Materialise once so that ``top`` may be any iterable (range, Index, ...).
    labels = [i for i in top]
    # Build the frame in a single step instead of enlarging it cell by cell.
    return pd.DataFrame(
        {
            'ID': df.loc[labels, 'ID'].tolist(),
            'Position': df.loc[labels, 'Position'].tolist(),
            'Score': [scores[i] for i in labels],
        },
        columns=['ID', 'Position', 'Score'],
    )

In [12]:
candidate = '85'
# Row label of the candidate inside ``users`` / ``all_scores_user``.
candidate_pos = indices_candidate[int(candidate)]
print("Top recommended jobs for {}".format(users.iloc[candidate_pos]['Name']))

# Rank against every job row. ``top`` holds positions derived from the
# per-candidate similarity list and is capped at 100, so using it as the job
# pool silently hides some postings.
df = get_job_recommendation(jobs.index, jobs, all_scores_user[int(candidate_pos)]).sort_values(by=['Score'], ascending=False)[:10]
df

Top recommended jobs for Soumya Balan


Unnamed: 0,ID,Position,Score
90,1101,Senior Corporate Recruiter - Information Techn...,0.0593932
59,1069,Business Intelligence/SQL Developer,0.0587611
32,1038,Business Intelligence/SQL Developer,0.0587611
85,1096,Summer Intern,0.0503397
94,1105,Internship Opportunity,0.0385101
5,1006,Intern - Business Analytics,0.0341331
87,1098,Business Analyst Intern,0.0339665
33,1039,Financial Planning and Analysis Intern,0.0331658
60,1070,Financial Planning and Analysis Intern,0.0331658
6,1007,Intern - Business Analytics,0.0315852


## Candidate Recommendations for a job

In [13]:
# Swap the axes of the score table: all_scores_job[j][u] == all_scores_user[u][j].
all_scores_job = np.transpose(np.array(all_scores_user)).tolist()

In [14]:
def get_candidate_recommendation(top, df, scores):
    """Build a table of candidate recommendations for the row labels in ``top``.

    Parameters
    ----------
    top : iterable
        Row labels of ``df`` to include, in the order they should appear.
        The same values are used to index ``scores``.
    df : pandas.DataFrame
        Frame providing the ``ID`` and ``Name`` columns.
    scores : sequence
        Scores looked up as ``scores[i]`` for every ``i`` in ``top``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Name`` and ``Score``, one row per entry of ``top``,
        numbered from 0. Rows are not sorted; callers sort by ``Score``
        themselves.
    """
    # Materialise once so that ``top`` may be any iterable (range, Index, ...).
    labels = [i for i in top]
    # Build the frame in a single step instead of enlarging it cell by cell.
    return pd.DataFrame(
        {
            'ID': df.loc[labels, 'ID'].tolist(),
            'Name': df.loc[labels, 'Name'].tolist(),
            'Score': [scores[i] for i in labels],
        },
        columns=['ID', 'Name', 'Score'],
    )

In [15]:
job_ID = '1001'
# Row label of the job inside ``jobs`` / ``all_scores_job``.
job_pos = indices_job[int(job_ID)]
print("Top recommended candidates for {} postion".format((jobs.iloc[job_pos]['Position'])))

# Rank against every candidate row; ``top`` is capped at 100 positions, which
# would make the remaining candidates impossible to recommend.
df = get_candidate_recommendation(users.index, users, all_scores_job[job_pos]).sort_values(by=['Score'], ascending=False)[:10]
df

Top recommended candidates for Digital Marketing Manager postion


Unnamed: 0,ID,Name,Score
61,73,Samyuktha Shivakumar,0.314854
52,60,Rajeev Kumar,0.0895637
80,93,Syam Devendla,0.0457377
3,4,Ananya Chavan,0.0422873
96,121,Aarti Pimplay,0.0266489
59,70,Sai Patha,0.0258224
20,25,Jay Madhavi,0.0252809
36,42,Manisha Bharti,0.0244065
14,19,Dushyant Bhatt,0.02316
49,57,Pulkit Saxena,0.0231025


## Jobs vs. Jobs


In [16]:
# TF-IDF over the job descriptions only, for job-to-job similarity.
# min_df=1 selects the same vocabulary as an integer 0 would; recent
# scikit-learn releases reject an integer min_df below 1.
tf2 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english', lowercase=True)
job_tfidf2 = tf2.fit_transform(jobs['Description_clean'])

# tf2.get_feature_names_out()
# Square matrix: entry [a, b] is the similarity between job rows a and b.
cos_similarity_list2 = cosine_similarity(job_tfidf2, job_tfidf2)

In [17]:
# Positions of the similarity rows, rows with any non-zero value first
# (boolean sort key, original order otherwise preserved), capped at 100.
row_has_match2 = [bool(row.any()) for row in cos_similarity_list2]
top2 = sorted(range(len(row_has_match2)), key=row_has_match2.__getitem__, reverse=True)[:100]

# Copy the similarity matrix into a list of per-row score lists.
all_scores2 = [[score for score in row] for row in cos_similarity_list2]

In [18]:
job_ID = '1011'
# Row label of the reference job inside ``jobs`` / ``all_scores2``.
job_pos = indices_job[int(job_ID)]
# The rows listed below are jobs, so the heading says so.
print("Jobs similar to {}".format(jobs.iloc[job_pos]['Position']))

# Compare against every other job row. The reference job is removed
# explicitly rather than by dropping the first sorted row, which would discard
# the wrong row whenever a duplicate posting ties with (or edges out) the
# job's own similarity score.
other_jobs = jobs.index.drop(job_pos)
df = get_job_recommendation(other_jobs, jobs, all_scores2[job_pos]).sort_values(by=['Score'], ascending=False)[:10]
df

Top recommended candidates for Data Scientist Intern postion


Unnamed: 0,ID,Position,Score
66,1076,Data Scientist - Intern,0.264516
39,1045,Data Scientist - Intern,0.264516
75,1085,Quantitative Analyst Intern,0.0709703
45,1051,Intern - Data Scientist,0.0677496
72,1082,Intern - Data Scientist,0.0677496
9,1010,"Summer Scholar Analyst - Business Technology, ...",0.0667405
25,1031,Intern - Data Scientist,0.0613397
54,1062,Intern - Data Scientist,0.0613397
91,1102,Data Science / Software Engineering Intern,0.0524861
35,1041,"Manager, Pricing Management Information Systems",0.0434433


## Candidate vs. Candidate

In [19]:
# TF-IDF over the candidate skills only, for candidate-to-candidate similarity.
# min_df=1 selects the same vocabulary as an integer 0 would; recent
# scikit-learn releases reject an integer min_df below 1.
tf3 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english', lowercase=True)
user_tfidf2 = tf3.fit_transform(users['Skills_clean'])

# Square matrix: entry [a, b] is the similarity between candidate rows a and b.
cos_similarity_list3 = cosine_similarity(user_tfidf2, user_tfidf2)

In [20]:
# Positions of the similarity rows, rows with any non-zero value first
# (boolean sort key, original order otherwise preserved), capped at 100.
row_has_match3 = [bool(row.any()) for row in cos_similarity_list3]
top3 = sorted(range(len(row_has_match3)), key=row_has_match3.__getitem__, reverse=True)[:100]

# Copy the similarity matrix into a list of per-row score lists.
all_scores3 = [[score for score in row] for row in cos_similarity_list3]

In [21]:
candidate = '85'
# Row label of the reference candidate inside ``users`` / ``all_scores3``.
candidate_pos = indices_candidate[int(candidate)]
# The rows listed below are candidates, so the heading says so.
print("Candidates similar to {}".format(users.iloc[candidate_pos]['Name']))

# Compare against every other candidate row (``top3`` stops after 100
# positions). The reference candidate is removed explicitly instead of
# relying on it being the first row after sorting.
other_candidates = users.index.drop(candidate_pos)
df = get_candidate_recommendation(other_candidates, users, all_scores3[candidate_pos]).sort_values(by=['Score'], ascending=False)[:10]
df

Top recommended jobs for Soumya Balan


Unnamed: 0,ID,Name,Score
31,33,Kavya U.,0.461311
18,19,Dushyant Bhatt,0.220853
98,106,Fenil Francis,0.179241
97,105,Nida Khan,0.171491
42,44,Mohamed Ameen,0.154518
1,1,Afreen Jamadar,0.142183
43,45,Mohini Gupta,0.113519
39,41,Mahesh Vijay,0.102267
95,103,Anurag Asthana,0.0997943
56,58,Puneet Singh,0.0973819


## Return Result Dataset

### Response for recommended jobs for a candidate

In [22]:
def get_all_users():
    """Collect every candidate's profile together with their ten best-scoring jobs.

    Reads the notebook-level ``users``, ``users2``, ``jobs``,
    ``indices_candidate`` and ``all_scores_user`` objects.

    Returns
    -------
    dict
        Maps each value of ``users.ID`` to the JSON-decoded ``users2`` row for
        that ID, extended with a ``"Jobs"`` key holding up to ten
        ``{ID, Position, Score}`` records sorted by descending score.
    """
    candidate_info = {}
    for candidate_id in users.ID.values.tolist():
        # Round-trip through JSON so the profile contains only JSON-native values.
        profile = json.loads(users2.loc[str(candidate_id)].to_json())

        # Score against every job row; the notebook-level ``top`` is derived
        # from candidate positions and capped at 100, so it is not a valid
        # pool of job rows.
        candidate_scores = all_scores_user[indices_candidate[int(candidate_id)]]
        best_jobs = get_job_recommendation(jobs.index, jobs, candidate_scores).sort_values(by=['Score'], ascending=False)[:10]
        profile["Jobs"] = json.loads(best_jobs.to_json(orient="records"))

        candidate_info[candidate_id] = profile

    return candidate_info

In [23]:
# Candidate ID -> profile dict including the recommended "Jobs" records.
all_users_info = get_all_users()

### Response for recommended candidates for a job

In [24]:
def get_all_jobs():
    """Collect every job's details together with its ten best-scoring candidates.

    Reads the notebook-level ``jobs``, ``jobs2``, ``users``, ``indices_job``
    and ``all_scores_job`` objects.

    Returns
    -------
    dict
        Maps each value of ``jobs.ID`` to the JSON-decoded ``jobs2`` row for
        that ID, extended with a ``"candidate_names"`` key holding up to ten
        ``{ID, Name, Score}`` records sorted by descending score.
    """
    job_info = {}
    for job_id in jobs.ID.values.tolist():
        # Round-trip through JSON so the details contain only JSON-native values.
        details = json.loads(jobs2.loc[str(job_id)].to_json())

        # Score against every candidate row; the notebook-level ``top`` stops
        # after 100 positions and would exclude the remaining candidates.
        job_scores = all_scores_job[indices_job[int(job_id)]]
        best_candidates = get_candidate_recommendation(users.index, users, job_scores).sort_values(by=['Score'], ascending=False)[:10]
        details["candidate_names"] = json.loads(best_candidates.to_json(orient="records"))

        job_info[job_id] = details

    return job_info

In [25]:
# Job ID -> details dict including the recommended "candidate_names" records.
all_job_info = get_all_jobs()

### Dataset with jobs information

In [26]:
# One row per job ID: expose the ID as an ``Id`` column and rename the fields
# to the names used by the output schema.
left = (
    pd.DataFrame.from_dict(all_job_info, orient='index')
    .assign(Id=lambda frame: frame.index)
    .rename(columns={'Position': 'name', 'Location': 'address', 'Job_Description': 'job_description'})
)
job_columns = ['Id', 'name', 'candidate_names', 'address', 'job_description']
left1 = left[job_columns]

left1.head(10)

Unnamed: 0,Id,name,candidate_names,address,job_description
1000,1000,Graduate Intern (Summer 2017) - SAP BI / Big D...,"[{'ID': 68, 'Name': 'Roshan Sinha', 'Score': 0...","Plano, TX","Location: Plano, TX or Oklahoma City, OK Durat..."
1001,1001,Digital Marketing Manager,"[{'ID': 73, 'Name': 'Samyuktha Shivakumar', 'S...","Dallas, TX",The Digital Marketing Manager is the front lin...
1002,1002,"Manager, Pricing Management Information Systems","[{'ID': 121, 'Name': 'Aarti Pimplay', 'Score':...","Dallas, TX",Summary of Responsibilities:The Manager Pricin...
1003,1003,Treasury Services Analyst Internship,"[{'ID': 47, 'Name': 'Navjyot Singh Rathore', '...","Dallas, TX",ORGANIZATIONAL SUMMARY: As part of the natio...
1004,1004,"Intern, Sales Analyst","[{'ID': 121, 'Name': 'Aarti Pimplay', 'Score':...","Dallas, TX",Aviall is the world's largest provider of new ...
1005,1005,Human Resources Analyst Internship,"[{'ID': 56, 'Name': 'Prem Koshti', 'Score': 0....","Dallas, TX",ORGANIZATIONAL SUMMARY: As part of the natio...
1006,1006,Intern - Business Analytics,"[{'ID': 78, 'Name': 'Shaheen Unissa', 'Score':...","Dallas, TX",Atos is a leader in digital services with annu...
1007,1007,Intern - Business Analytics,"[{'ID': 78, 'Name': 'Shaheen Unissa', 'Score':...","Dallas, TX",Job Details Intern - Business Analytics US - T...
1008,1008,Digital Marketing Intern,"[{'ID': 73, 'Name': 'Samyuktha Shivakumar', 'S...","Dallas, TX",Job Descriptions:Downtown agency is looking fo...
1009,1009,Data Analyst - Intern,"[{'ID': 29, 'Name': 'Karthik GV', 'Score': 0.0...","Lewisville, TX",JOB TITLE Data Analyst - Intern JOB SUMMARY A...


### Dataset with candidate information

In [27]:
# One row per candidate ID: expose the ID as an ``Id`` column and rename the
# fields to the names used by the output schema.
right = (
    pd.DataFrame.from_dict(all_users_info, orient='index')
    .assign(Id=lambda frame: frame.index)
    .rename(columns={'Name': 'name', 'Location': 'address', 'Skills': 'candidate_skills', 'Profile': 'resume_link', 'Jobs': 'job_name'})
)
candidate_columns = ['Id', 'name', 'job_name', 'address', 'candidate_skills', 'resume_link']
right1 = right[candidate_columns]

right1.tail(10)

Unnamed: 0,Id,name,job_name,address,candidate_skills,resume_link
201,201,Akash Gulhane,"[{'ID': 1104, 'Position': 'INTERN, Information...",Amravati,"Database MSAccess Other Hardware Networking, ...",indeed.com/r/Akash-Gulhane/8b86faac48268d09
209,209,Vinay Singhal,"[{'ID': 1094, 'Position': 'Manager Franchise D...",NewDelhi,Smart Working Fast and keen learner Works we...,indeed.com/r/Vinay-Singhal/c15261079a9b5ae7
210,210,Pawan Nag,"[{'ID': 1030, 'Position': 'Intern - Mainframe ...",Delhi,Operating Systems,indeed.com/r/Pawan-Nag/e14493f28cb72022
211,211,Shivam Sharma,"[{'ID': 1101, 'Position': 'Senior Corporate Re...",Ghaziabad,"SQL 1 year, HTML Less than 1 year, INCIDENT MA...",indeed.com/r/Shivam-Sharma/8e4755830666f3b6
212,212,Gaikwad Dilip,"[{'ID': 1069, 'Position': 'Business Intelligen...",Pune,TALLY MICROSOFT DYNAMIC OPARATER MICIT,
213,213,Moumita Mitra,"[{'ID': 1097, 'Position': 'Human Resources Int...",,"Computer Proficient in Windows, Word, Excel, O...",indeed.com/r/Moumita-Mitra/d63c4dc9837860db
214,214,Suman Biswas,"[{'ID': 1000, 'Position': 'Graduate Intern (Su...",Bengaluru,"SAP HANA 4 years, SAP UI5Fiori 4 years, Angula...",indeed.com/r/Suman-Biswas/63db95fe3ae14910
216,216,Anil Kumar,"[{'ID': 1040, 'Position': 'SAP Academy for Pre...",Delhi,"DATA BACKUP 1 year, EXCHANGE 1 year, LAN 1 yea...",indeed.com/r/Anil-Kumar/96983a9dd7222ae5
217,217,Siddharth Choudhary,"[{'ID': 1083, 'Position': 'INTERN, Accounting'...",Hyderabad,"AUDITING Less than 1 year, CFA Less than 1 yea...",indeed.com/r/Siddharth-Choudhary/19d56a964e37fa1a
218,218,Valarmathi Dhandapani,"[{'ID': 1079, 'Position': '2017 Fall Internshi...",Bengaluru,"Excel 10+ years, Operations 7 years, Project m...",indeed.com/r/Valarmathi-Dhandapani/a2b3eb34006...


### Final dataset after merging the two datasets above

In [28]:
# Stack the candidate rows on top of the job rows; columns present in only one
# of the two frames are filled with missing values in the other's rows.
df = pd.concat(objs=[right1, left1], axis=0, join='outer')
df

Unnamed: 0,Id,name,job_name,address,candidate_skills,resume_link,candidate_names,job_description
0,0,Abhishek Jha,"[{'ID': 1011, 'Position': 'Data Scientist Inte...",Bengaluru,"Programming language C, C++, Java Oracle Peop...",indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a,,
1,1,Afreen Jamadar,"[{'ID': 1010, 'Position': 'Summer Scholar Anal...",Sangli,"Database Less than 1 year, HTML Less than 1 ye...",indeed.com/r/Afreen-Jamadar/8baf379b705e37c6,,
2,2,Akhil Yadav Polemaina,"[{'ID': 1000, 'Position': 'Graduate Intern (Su...",Hyderabad,Teradata,indeed.com/r/Akhil-Yadav-Polemaina/f6931801c51...,,
3,3,Alok Khandai,"[{'ID': 1000, 'Position': 'Graduate Intern (Su...",Bengaluru,Operating Environment Windows9598XPNT Databa...,indeed.com/r/Alok-Khandai/5be849e443b8f467,,
4,4,Ananya Chavan,"[{'ID': 1008, 'Position': 'Digital Marketing I...",Mumbai,"EARCH ENGINE MARKETING 2 years, SEM 2 years, A...",indeed.com/r/Ananya-Chavan/738779ab71971a96,,
...,...,...,...,...,...,...,...,...
1152,1152,Business Analyst Intern,,"Irving, TX",,,"[{'ID': 18, 'Name': 'Dipesh Gulati', 'Score': ...",Real-world Experience. Life-long Connections. ...
1153,1153,"Intern, Marketing Analysis -",,"Grapevine, TX",,,"[{'ID': 62, 'Name': 'Ramesh HP', 'Score': 0.07...",The Internship Program Our paid internship pr...
1154,1154,"Summer Scholar, Business Analyst - Human Capit...",,"Dallas, TX",,,"[{'ID': 47, 'Name': 'Navjyot Singh Rathore', '...",Are you an analytical thinker with a passion f...
1155,1155,"Intern, Turf Marketing -",,"Grapevine, TX",,,"[{'ID': 73, 'Name': 'Samyuktha Shivakumar', 'S...",The Internship Program Our paid internship pr...


In [29]:
# Serialise the merged table as a JSON array with one object per row.
df_json = df.to_json(orient='records')

In [30]:
pip install pymongo[srv]

Note: you may need to restart the kernel to use updated packages.


In [31]:
import os

import pymongo
from pymongo import MongoClient, InsertOne

# SECURITY: the connection string (which embeds the database user and
# password) is read from the environment instead of being written into the
# notebook. Any credentials that were previously committed should be rotated.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string.")

client = pymongo.MongoClient(mongo_uri)
try:
    # Database Name
    db = client["resumes"]

    # Collection Name
    collection = db["sample_resumes2"]

    # One insert request per record of the merged dataset.
    requesting = [InsertOne(jsonObj) for jsonObj in json.loads(df_json)]

    # bulk_write refuses an empty request list, so skip the call in that case.
    result = collection.bulk_write(requesting) if requesting else None
finally:
    # Release the connection even when the upload fails.
    client.close()

### Save to Excel and CSV

In [32]:
# ``to_excel`` no longer accepts an ``encoding`` keyword (it was unused and has
# been removed from pandas), so it is only passed to ``to_csv``.
df.to_excel("prepared_for_schema.xlsx", index=False)
df.to_csv("prepared_for_schema.csv", encoding='utf-8', index=False)

### Register the Dataset to the Datastore

In [33]:
# from azureml.core import Workspace, Datastore, Dataset

# ws = Workspace.from_config()

# datastore = ws.get_default_datastore()

# # Register the dataset
# Dataset.Tabular.register_pandas_dataframe(
#         dataframe = df, 
#         name = 'job_resume_recommendation', 
#         target = datastore
#     )