In [208]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv('staff.am_data_2020.csv')

In [209]:
def clean_str(s, condense=False):
    if isinstance(s, str):
        s = re.sub('[^0-9a-zA-Z]+', ' ', s)
        if condense:
            s = ''.join(s.split())
        else:
            s = ' '.join(s.split())
        s = s.lower()
        return s
    return clean_str(str(s))

def condense_str(s):
    return clean_str(s, True)

def to_list(csl):
    if isinstance(csl, str):
        words = csl.split(',')
        return [condensed_str(word) for word in words]
    elif isinstance(csl, list):
        return csl
    return ''

# Clean strings
df['title'] = df['title'].apply(condense_str)
df['employment_terms'] = df['employment_terms'].apply(condense_str)
df['job_type'] = df['job_type'].apply(condense_str)
df['category'] = df['category'].apply(condense_str)
df['required_qualifications'] = df['required_qualifications'].apply(clean_str)
df['responsibilities'] = df['responsibilities'].apply(clean_str)
df['candidate_level'] = df['candidate_level'].apply(condense_str)

# Convert comma-separated lists into Python lists
df['soft_skills'] = df['soft_skills'].apply(to_list)
df['prof_skills'] = df['prof_skills'].apply(to_list)

# Convert to date
df['deadline'] = df['deadline'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d', errors='ignore'))

In [210]:
df.head(10)

Unnamed: 0,id,title,employment_terms,job_type,deadline,category,required_qualifications,responsibilities,soft_skills,prof_skills,salary,candidate_level
0,YB5jCG9jORj5PDq5DQOK3,salesconsultant,permanent,fulltime,2020-04-09,salesservicemanagement,,,"[positiveattitude, customerserviceoriented, fl...",[salesserviceexperience],,junior
1,Bt7vbu3oBdF6fNH5SvIm3,managerhostess,permanent,fulltime,2020-08-26,tourismhospitalityhoreca,female higher education in linguistics economi...,manage the daily operations ensure that all st...,,"[english, russianlanguage]",,midlevel
2,oH7bOT6HPYs4xnr7JOQL4,marketingspecialist,permanent,fulltime,2020-08-31,marketingadvertising,,,,[initiative],,senior
3,zK7cxx6TSIV4lLp9sFba8,projectcoordinator,permanent,fulltime,2020-08-31,productprojectmanagement,,,,[b2bsales],,senior
4,yg7GdF9DMxQ0YuO0bcOZ6,networkandsystemadministrator,permanent,fulltime,2020-09-06,otherit,higher technical nbsp education at least 2 yea...,configures and maintains microsoft active dire...,"[abilitytoworkindependently, problemsolving, t...","[linuxos, tcpip, activedirectory]",,midlevel
5,Hs1rHE1aFSn5awY0Nujv6hEW7,sfasdfa,permanent,fulltime,2020-08-26,softwaredevelopment,,,[artisticskills],,,junior
6,sw1WyC2deDc2Qsb9hZCS0Rbj6,iosandandroiddeveloper,permanent,fulltime,2020-07-31,softwaredevelopment,experience in developing native ios app s avai...,,,"[androidndk, iossdk, java, androidsdk]",,notdefined
7,hh1PkS3HjoI4KhT0XPCO8psb3,,permanent,fulltime,2020-08-28,networkadministration,proven experience as a system administrator ne...,manage network servers and technology tools se...,"[teamwork, senseofresponsibility, enterprising]",,,senior
8,Zi1VzG4lDLH7qYH6wVsa5xnP3,sdffe,permanent,fulltime,2020-08-18,qualityassurancecontrol,,,,[phpzendframework],,junior
9,FO1lSO5ZBtg3wdA3OnZQ3UUJ7,digitalproductowner,permanent,fulltime,2020-02-10,otherit,bachelor rsquo s degree in computer science bu...,take the lead of a scrum team as a technical p...,"[analyticalskills, teamwork, productoriented]",,,notdefined


In [211]:
def rep(s, t=1):
    return (s + ' ') * t

def create_soup(x):
    classes = rep(x['title']) + rep(x['employment_terms'], 1) + rep(x['job_type'], 1) + rep(x['category']) + rep(x['candidate_level'])
    description = x['required_qualifications'] + ' ' + x['responsibilities'] + ' ' + ' '.join(x['soft_skills']) + ' ' + ' '.join(x['prof_skills'])
    return classes + description

df['soup'] = df.apply(create_soup, axis=1)

df.iloc[0]['soup']

'salesconsultant permanent fulltime salesservicemanagement junior nan nan positiveattitude customerserviceoriented flexible detailoriented presentationskills teamwork salesserviceexperience'

In [212]:
# from sklearn.feature_extraction.text import CountVectorizer

# count = CountVectorizer(stop_words='english')
# count_mat = count.fit_transform(df['soup'])

# count_mat.shape

In [213]:
# count2 = CountVectorizer(vocabulary=count.vocabulary_.keys())
# count2_mat = count2.fit_transform([test_job['soup']])

# count2_mat.shape

In [214]:
# from sklearn.metrics.pairwise import cosine_similarity

# cosine_sim = cosine_similarity(count2_mat, count_mat)
# df = df.reset_index(drop=True)
# indices = pd.Series(df.index, index=df['id'])

# cosine_sim.shape

In [215]:
# sim_scores = list(enumerate(np.squeeze(np.asarray(cosine_sim))))
# sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# sim_scores = sim_scores[0:10]

# candidate_idxs = [i[0] for i in sim_scores]
# df.iloc[candidate_idxs]

In [216]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
count_matrix.shape

(6422, 13630)

In [217]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

df = df.reset_index()
indices = pd.Series(df.index, index=df['id'])

def get_recommendations(id, cosine_sim=cosine_sim2):
    idx = indices[id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    job_indices = [i[0] for i in sim_scores]
    return df.iloc[job_indices]

get_recommendations('YB5jCG9jORj5PDq5DQOK3', cosine_sim2)

Unnamed: 0,index,id,title,employment_terms,job_type,deadline,category,required_qualifications,responsibilities,soft_skills,prof_skills,salary,candidate_level,soup
751,751,NT1pkG6GhZj9Vdq9BWnJ2LSL6,,permanent,fulltime,2020-01-31,salesservicemanagement,,,"[positiveattitude, timemanagement, customerser...",,,midlevel,permanent fulltime salesservicemanagement mid...
1050,1050,sG1WXX7heiT3jGO1FfbJ3ZDV8,,permanent,fulltime,2020-02-17,salesservicemanagement,,,"[positiveattitude, timemanagement, customerser...",,,midlevel,permanent fulltime salesservicemanagement mid...
1834,1834,Jo1PWW8lRRg1CCI5BrpW7Kug7,,permanent,fulltime,2020-03-08,salesservicemanagement,,,"[positiveattitude, timemanagement, customerser...",,,midlevel,permanent fulltime salesservicemanagement mid...
2070,2070,cR1Alw8sRrf4DFQ1bCRr1bTI5,idealsystemservicemanagerofijevanbranch,permanent,fulltime,2020-04-02,salesservicemanagement,,,"[customerserviceoriented, flexible, detailorie...",,,midlevel,idealsystemservicemanagerofijevanbranch perman...
2309,2309,If1jzW8UqFd6OXD6KkAl5GWg1,,permanent,fulltime,2020-03-23,salesservicemanagement,,,"[positiveattitude, timemanagement, customerser...",,,midlevel,permanent fulltime salesservicemanagement mid...
3918,3918,FI2eWQ5TnYc4iLQ4QgHT9QLe5,idealsystemservicemanagerofijevanbranch,permanent,fulltime,2020-07-01,salesservicemanagement,,,"[customerserviceoriented, flexible, detailorie...",,,midlevel,idealsystemservicemanagerofijevanbranch perman...
4631,4631,Ma2KjS6XrYA4IIi7uYFl8DNl3,idealsystemservicemanagerofkapanbranch,permanent,fulltime,2020-08-01,salesservicemanagement,,,"[customerserviceoriented, flexible, detailorie...",,,midlevel,idealsystemservicemanagerofkapanbranch permane...
5486,5486,Nw2gyW7cDGC6wqB3pLwA1yph7,idealsystemservicemanagerofkapanbranch,permanent,fulltime,2020-09-03,salesservicemanagement,,,"[customerserviceoriented, flexible, detailorie...",,,midlevel,idealsystemservicemanagerofkapanbranch permane...
4277,4277,SH2tjh6NGqz0cSd1QqnJ0lTi5,servicemanageridealsystemmasisbranch,permanent,fulltime,2020-07-17,salesservicemanagement,,,"[customerserviceoriented, flexible, detailorie...",[msoffice],,midlevel,servicemanageridealsystemmasisbranch permanent...
1105,1105,TI1Ozz7zItS3vPK7JKrU0HdQ5,salesandservicespecialistnightshift,permanent,fulltime,2020-02-29,salesservicemanagement,,,"[teamwork, communicationskills]",[salesserviceexperience],,junior,salesandservicespecialistnightshift permanent ...


Unnamed: 0,index,id,title,employment_terms,job_type,deadline,category,required_qualifications,responsibilities,soft_skills,prof_skills,salary,candidate_level,soup
0,0,YB5jCG9jORj5PDq5DQOK3,salesconsultant,permanent,fulltime,2020-04-09,salesservicemanagement,,,"[positiveattitude, customerserviceoriented, fl...",[salesserviceexperience],,junior,salesconsultant permanent fulltime salesservic...
