In [150]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv('staff.am_data_2020.csv')

In [151]:
def clean_str(s, condense=False):
    if isinstance(s, str):
        s = re.sub('[^0-9a-zA-Z]+', ' ', s)
        if condense:
            s = ''.join(s.split())
        else:
            s = ' '.join(s.split())
        s = s.lower()
        return s
    return clean_str(str(s))

def condense_str(s):
    return clean_str(s, True)

def to_list(csl):
    if isinstance(csl, str):
        words = csl.split(',')
        return [condensed_str(word) for word in words]
    elif isinstance(csl, list):
        return csl
    return ''

# Clean strings
df['title'] = df['title'].apply(condense_str)
df['employment_terms'] = df['employment_terms'].apply(condense_str)
df['job_type'] = df['job_type'].apply(condense_str)
df['category'] = df['category'].apply(condense_str)
df['required_qualifications'] = df['required_qualifications'].apply(clean_str)
df['responsibilities'] = df['responsibilities'].apply(clean_str)
df['candidate_level'] = df['candidate_level'].apply(condense_str)

# Convert comma-separated lists into Python lists
df['soft_skills'] = df['soft_skills'].apply(to_list)
df['prof_skills'] = df['prof_skills'].apply(to_list)

# Convert to date
df['deadline'] = df['deadline'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d', errors='ignore'))

In [152]:
df.head(10)

Unnamed: 0,id,title,employment_terms,job_type,deadline,category,required_qualifications,responsibilities,soft_skills,prof_skills,salary,candidate_level
0,YB5jCG9jORj5PDq5DQOK3,salesconsultant,permanent,fulltime,2020-04-09,salesservicemanagement,,,"[positiveattitude, customerserviceoriented, fl...",[salesserviceexperience],,junior
1,Bt7vbu3oBdF6fNH5SvIm3,managerhostess,permanent,fulltime,2020-08-26,tourismhospitalityhoreca,female higher education in linguistics economi...,manage the daily operations ensure that all st...,,"[english, russianlanguage]",,midlevel
2,oH7bOT6HPYs4xnr7JOQL4,marketingspecialist,permanent,fulltime,2020-08-31,marketingadvertising,,,,[initiative],,senior
3,zK7cxx6TSIV4lLp9sFba8,projectcoordinator,permanent,fulltime,2020-08-31,productprojectmanagement,,,,[b2bsales],,senior
4,yg7GdF9DMxQ0YuO0bcOZ6,networkandsystemadministrator,permanent,fulltime,2020-09-06,otherit,higher technical nbsp education at least 2 yea...,configures and maintains microsoft active dire...,"[abilitytoworkindependently, problemsolving, t...","[linuxos, tcpip, activedirectory]",,midlevel
5,Hs1rHE1aFSn5awY0Nujv6hEW7,sfasdfa,permanent,fulltime,2020-08-26,softwaredevelopment,,,[artisticskills],,,junior
6,sw1WyC2deDc2Qsb9hZCS0Rbj6,iosandandroiddeveloper,permanent,fulltime,2020-07-31,softwaredevelopment,experience in developing native ios app s avai...,,,"[androidndk, iossdk, java, androidsdk]",,notdefined
7,hh1PkS3HjoI4KhT0XPCO8psb3,,permanent,fulltime,2020-08-28,networkadministration,proven experience as a system administrator ne...,manage network servers and technology tools se...,"[teamwork, senseofresponsibility, enterprising]",,,senior
8,Zi1VzG4lDLH7qYH6wVsa5xnP3,sdffe,permanent,fulltime,2020-08-18,qualityassurancecontrol,,,,[phpzendframework],,junior
9,FO1lSO5ZBtg3wdA3OnZQ3UUJ7,digitalproductowner,permanent,fulltime,2020-02-10,otherit,bachelor rsquo s degree in computer science bu...,take the lead of a scrum team as a technical p...,"[analyticalskills, teamwork, productoriented]",,,notdefined


In [167]:
def rep(s, t=100):
    return (s + ' ') * t

def create_soup(x):
    classes = rep(x['title']) + rep(x['employment_terms'], 1) + rep(x['job_type'], 1) + rep(x['category']) + rep(x['candidate_level'])
    description = x['required_qualifications'] + ' ' + x['responsibilities'] + ' ' + ' '.join(x['soft_skills']) + ' ' + ' '.join(x['prof_skills'])
    return classes + description

df['soup'] = df.apply(create_soup, axis=1)

df.iloc[0]['soup']

'salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesconsultant salesco

In [168]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_mat = count.fit_transform(df['soup'])

count_mat.shape

(6422, 13630)

In [169]:
test_job = {
    'id': 'test_job',
    'title': condense_str('Marketing Specialist'),
    'employment_terms': condense_str('Permanent'),
    'job_type': condense_str('Full time'),
    'deadline': condense_str('2020-12-31'),
    'category': condense_str('Marketing/Advertising'),
    'required_qualifications': clean_str("""Education background does not matter; we are looking for talent, experience, passion and creativity.
        At least 1 year of professional full-time experience in social media marketing is preferred.
        Experience with running social media pages such as Facebook is a must, experience with Linkedin, Insta and YouTube is a plus.
        Excellent in creative content writing, 
        Experience and knowledge of Adobe Photoshop (knowledge of other tools is a plus), for creating social media postings, based on visual templates provided by the designer.
        Strong written and verbal communication in Armenian and English languages, Russian is desirable.
        Positive attitude, detail and customer oriented with good multitasking and organisational ability."""),
    'responsibilities': clean_str("""Develop original and exciting SMM campaigns and content on a daily basis (e.g. social media posts, website content, etc).
        Coordinate with marketing and design teams to generate relevant marketing content, 
        Manage staff.am's & HireBee's social media presence on Facebook, Linkedin, Telegram, Instagram and YouTube.
        Prepare successful email marketing campaigns with well-structured content.
        Maintain appropriate tone of voice through social media and other digital channels.
        Suggest and implement other marketing activities to boost awareness and increase website traffic and app installs.
        Complete other tasks related to Marketing as required."""),
    'soft_skills': to_list('Written communication skills,Positive attitude,Time management,Team player'),
    'prof_skills': to_list('Adobe Photoshop,SMM,Email Marketing,Content marketing'),
    'salary': condense_str('NaN'),
    'candidate_level': condense_str('Mid level')
}

test_job['soup'] = create_soup(test_job)
test_job['soup']

'marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist marketingspecialist

In [170]:
count2 = CountVectorizer(vocabulary=count.vocabulary_.keys())
count2_mat = count2.fit_transform([test_job['soup']])

count2_mat.shape

(1, 13630)

In [171]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_mat, count2_mat)
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['id'])

cosine_sim.shape

(6422, 1)

In [173]:
sim_scores = list(enumerate(np.squeeze(np.asarray(cosine_sim))))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[0:10]

candidate_idxs = [i[0] for i in sim_scores]
df.iloc[candidate_idxs]

Unnamed: 0,id,title,employment_terms,job_type,deadline,category,required_qualifications,responsibilities,soft_skills,prof_skills,salary,candidate_level,soup
2793,ub1XiD9Imwg2SoD2qMNJ2kxg8,admissionscounselor,permanent,fulltime,2020-05-01,administrativeofficework,knowledge undergraduate degree one year of exp...,counseling 1 accurately and clearly communicat...,"[writtencommunicationskills, positiveattitude]",,,notdefined,admissionscounselor admissionscounselor admiss...
314,xJ1oTn6KFvk5bGF3banD2YtO7,cassandraadministrator,permanent,fulltime,2020-02-12,softwaredevelopment,experience with cassandra administration famil...,developing and supporting services that provid...,"[problemsolving, teamwork, communicationskills]","[docker, cassandra]",Highly competitive.,,cassandraadministrator cassandraadministrator ...
2533,fO1vkH8GCKF9Tmq0OxEE8XnT6,cassandraadministrator,permanent,fulltime,2020-04-15,softwaredevelopment,experience with cassandra administration famil...,developing and supporting services that provid...,"[problemsolving, teamwork, communicationskills]","[docker, cassandra]",Highly competitive.,senior,cassandraadministrator cassandraadministrator ...
312,Ky1vAV6QpEB5HQb3sKxc0NED3,dataplatformengineer,permanent,fulltime,2020-02-12,softwaredevelopment,solid knowledge and skills in programming pyth...,developing and supporting services that provid...,"[problemsolving, teamwork, communicationskills]","[python, cassandra, databasepostgresql, golang]",Highly competitive.,,dataplatformengineer dataplatformengineer data...
2536,TA1WZF8wyaA9oCo1uaOe1Eit8,dataplatformengineeronlineretailer,permanent,fulltime,2020-04-15,softwaredevelopment,solid knowledge and skills in programming pyth...,developing and supporting services that provid...,"[problemsolving, teamwork, communicationskills]","[python, cassandra, databasepostgresql, golang]",Highly competitive.,senior,dataplatformengineeronlineretailer dataplatfor...
1452,Fg1RUR7rdNG7ISp4qCQY1oei6,devopsengineer,permanent,fulltime,2020-03-15,softwaredevelopment,2 years of devops experience experience with a...,deploying tomcat and react applications releas...,"[teamwork, fastlearningability, communications...","[gitgithub, nginx, saas, docker, tomcat, aws]",,notdefined,devopsengineer devopsengineer devopsengineer d...
4771,Hp2umH6Nugw6vbp5tMut3UuP7,angular6developerseniormiddle,permanent,fulltime,2020-08-07,softwaredevelopment,nbsp at least 2 years nbsp experience with ang...,employ the latest technologies frameworks and ...,,[angularjs],,senior,angular6developerseniormiddle angular6develope...
4148,HB2KNL5Mhuo8nOu1btsT6ovi3,3dgeneralist,permanent,fulltime,2020-07-10,artdesignarchitecture,proven experience as animator nbsp degree in c...,read scripts and storylines to understand anim...,"[creativity, continuouslearningdrive]",,,notdefined,3dgeneralist 3dgeneralist 3dgeneralist 3dgener...
4946,Lv2bYq6HOjf8Qxd9koPj6aFW9,3dartistanimator,permanent,fulltime,2020-08-15,webgraphicdesign,must have a degree in fine arts animation or e...,,,"[3dmodeling, 3dsmax, maya, unity]",,junior,3dartistanimator 3dartistanimator 3dartistanim...
4237,ar2RHU5UHLr9RfK4tTeB6hPl2,seniorprocurementassociate,contract,fulltime,2020-06-29,procurementlogisticscourier,standard minimum qualifications education nbsp...,1 provide operational coordination and guidanc...,,[procurement],,notdefined,seniorprocurementassociate seniorprocurementas...


In [174]:
# df.loc[df['id'] == 'Bt7vbu3oBdF6fNH5SvIm3'].responsibilities