In [1]:
import pandas as pd
import numpy as np
import random
from py2neo import Graph
from sklearn.manifold import TSNE
import seaborn as sns

In [2]:
df = pd.read_csv('New_LinkedIn_users.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,memberUrn,ageEstimate,companyName,companyStaffCount,companyUrl,mbrTitle,startDate,endDate,ESCO_job_title,predictionConfidence
0,0,urn:li:member:10013136,41,Commonwealth Bank,32905.0,http://www.commbank.com.au/,Portfolio Executive at Commonwealth Bank,2007-01-02,01/07/2008,executive assistant,86.0
1,1,urn:li:member:10013136,41,CommSec,619.0,http://www.commsec.com.au,Portfolio Executive at Commonwealth Bank,2008-01-08,01/12/2012,executive assistant,86.0
2,2,urn:li:member:10013136,41,Commonwealth Bank,32905.0,http://www.commbank.com.au/,Portfolio Executive at Commonwealth Bank,2013-01-11,01/06/2014,executive assistant,86.0
3,3,urn:li:member:10013136,41,Commonwealth Bank,32905.0,http://www.commbank.com.au/,Portfolio Executive at Commonwealth Bank,2014-01-07,,executive assistant,86.0
4,4,urn:li:member:100186032,30,Jigsaw Clothing,447.0,http://www.jigsaw-online.com,"Senior Marketing Manager, PayPal",2009-01-01,01/12/2009,marketing manager,90.0


In [3]:
print(df[df['memberUrn']=='urn:li:member:10013136']['ESCO_job_title'].apply(pd.Series).stack().tolist())

['executive assistant', 'executive assistant', 'executive assistant', 'executive assistant']


In [4]:
unique_urns = list(set(df['memberUrn'].apply(pd.Series).stack().tolist()))
print(len(unique_urns))

6853


In [5]:
urn_jobhist = []
for urn in unique_urns:
    jobhist = df[df['memberUrn']==urn]['ESCO_job_title'].apply(pd.Series).stack().tolist()
    urn_jobhist.append([urn,jobhist])
urn_jobhist_df = pd.DataFrame(urn_jobhist,columns = ["urn", "ESCO_job_history"])

In [6]:
urn_jobhist_df.head()

Unnamed: 0,urn,ESCO_job_history
0,urn:li:member:57480318,"[hospitality revenue manager, hospitality reve..."
1,urn:li:member:12654189,"[hospitality revenue manager, hospitality reve..."
2,urn:li:member:214981121,[mechanical engineer]
3,urn:li:member:241270390,"[tobacco shop manager, tobacco shop manager, t..."
4,urn:li:member:43156854,"[digital forensics expert, digital forensics e..."


In [7]:
job_skills_df = pd.read_csv('title_uri_skills.csv')
job_skills_df.head()

Unnamed: 0,job_title,uri,essential_skills,optional_skills
0,3D animator,http://data.europa.eu/esco/occupation/52df9d56...,"['3D texturing', 'create 3D environments', 'pa...","['manage schedule of tasks', 'create animated ..."
1,3D modeller,http://data.europa.eu/esco/occupation/bab5fa79...,"['use polygonal modelling', 'develop creative ...","['C++', 'Smalltalk (computer programming)', 'c..."
2,3D printing technician,http://data.europa.eu/esco/occupation/4cf7be91...,"['use technical drawing software', 'create sol...","['CADD software', 'manual draughting technique..."
3,abrasive blasting operator,http://data.europa.eu/esco/occupation/3c0af499...,"['wear appropriate protective gear', 'remove i...","['set up the controller of a machine', 'monito..."
4,absorbent pad machine operator,http://data.europa.eu/esco/occupation/c706886d...,"['adhesives', 'supply machine', 'troubleshoot'...","['monitor paper reel', 'operate compression ro..."


In [8]:
def assign_skills(df_in):
    df = df_in.copy()
    essential_percentage = 0.5
    optional_percentage = 0.2
    skills_list = []
    for line in df.iterrows():
        skills = set()
        essential_percentage_per_occupation = essential_percentage# / len(line[1]['ESCO_job_history'])
        optional_percentage_per_occupation = optional_percentage# / len(line[1]['ESCO_job_history'])
        for job in line[1]['ESCO_job_history']:
            #the try-except block are necessary, since some occupations do not have any associated skills, these would throw an error otherwise
            try:
                for e_skill in job_skills_df[job_skills_df['job_title']==job]['essential_skills'].item().replace('[','').replace(']','').replace("'",'').split(','):
                    if np.random.uniform() < essential_percentage_per_occupation:
                        skills.add(e_skill.strip())
            except:
                pass
            try:
                for o_skill in job_skills_df[job_skills_df['job_title']==job]['optional_skills'].item().replace('[','').replace(']','').replace("'",'').split(','):
                    if np.random.uniform() < optional_percentage_per_occupation:
                        skills.add(o_skill.strip())
            except:
                pass
        skills_list.append(list(skills))
    df['skills'] = skills_list
    return df

In [9]:
df_with_skills = assign_skills(urn_jobhist_df)
df_with_skills.head()

Unnamed: 0,urn,ESCO_job_history,skills
0,urn:li:member:57480318,"[hospitality revenue manager, hospitality reve...","[forecast occupancy demand, manage hospitality..."
1,urn:li:member:12654189,"[hospitality revenue manager, hospitality reve...","[conduct financial audits, forecast occupancy ..."
2,urn:li:member:214981121,[mechanical engineer],"[firmware, debug software, medical device regu..."
3,urn:li:member:241270390,"[tobacco shop manager, tobacco shop manager, t...","[supervise merchandise displays, obtain releva..."
4,urn:li:member:43156854,"[digital forensics expert, digital forensics e...","[WhiteHat Sentinel, perform security vulnerabi..."


There are a couple of way to assign jobs to these people. A first one that I will explore is to make a graph embedding of the people (urns) and the occupations based on the skills they have/require. We use node2vec to create the ebedding.

To do this, we first have to enter these people in the database and do some more preprocessing. We again create weighted edges between people or people and jobs depending on the skills they share.

In [10]:
#first, remove all the people that did not get assigned any skills
for i,line in enumerate(df_with_skills.iterrows()):
    if not line[1]['skills']:
        df_with_skills['skills'][i] = np.nan
df_with_skills_cleaned = df_with_skills.dropna()
df_with_skills_cleaned.head()

Unnamed: 0,urn,ESCO_job_history,skills
0,urn:li:member:57480318,"[hospitality revenue manager, hospitality reve...","[forecast occupancy demand, manage hospitality..."
1,urn:li:member:12654189,"[hospitality revenue manager, hospitality reve...","[conduct financial audits, forecast occupancy ..."
2,urn:li:member:214981121,[mechanical engineer],"[firmware, debug software, medical device regu..."
3,urn:li:member:241270390,"[tobacco shop manager, tobacco shop manager, t...","[supervise merchandise displays, obtain releva..."
4,urn:li:member:43156854,"[digital forensics expert, digital forensics e...","[WhiteHat Sentinel, perform security vulnerabi..."


In [11]:
expanded_list = []
for i, row in df_with_skills_cleaned.iterrows():
    urn = row['urn']
    for skill in row['skills']:
        if skill:
            expanded_list.append([urn, skill])
expanded_df = pd.DataFrame(expanded_list, columns=['urn','skill'])
expanded_df.to_csv('expanded_df_with_skills.csv',index=False)
expanded_df.head()

Unnamed: 0,urn,skill
0,urn:li:member:57480318,forecast occupancy demand
1,urn:li:member:57480318,manage hospitality revenue
2,urn:li:member:57480318,comply with food safety and hygiene
3,urn:li:member:57480318,produce statistical financial records
4,urn:li:member:57480318,inspect data


In [12]:
expanded_list = []
for i, row in df_with_skills_cleaned.iterrows():
    urn = row['urn']
    for job in row['ESCO_job_history']:
        if job:
            expanded_list.append([urn, job])
expanded_df = pd.DataFrame(expanded_list, columns=['urn','job'])
expanded_df.to_csv('expanded_df_with_jobhist.csv',index=False)
expanded_df.head()

Unnamed: 0,urn,job
0,urn:li:member:57480318,hospitality revenue manager
1,urn:li:member:57480318,hospitality revenue manager
2,urn:li:member:57480318,hospitality revenue manager
3,urn:li:member:57480318,hospitality revenue manager
4,urn:li:member:57480318,hospitality revenue manager


In [13]:
len(df_with_skills_cleaned)

6222

In [14]:
with open('credentials.txt','r') as f:
    ls = f.read().split(', ')

DN_HOST = 'localhost'
DB_PORT = 7687
DB_USERNAME = ls[0]
DB_PW = ls[1]
DB_NAME = 'neo4j'

graph = Graph(
    host=DN_HOST,
    port=DB_PORT,
    user=DB_USERNAME,
    password=DB_PW,
    name=DB_NAME
)

In [None]:
graph.run("""
    CALL gds.beta.node2vec.write({
    nodeQuery:'MATCH (n) WHERE n:Person or n:Occupation RETURN id(n) AS id',
    relationshipQuery:'MATCH (n)-[a]->(:skill)<-[b]-(m)
        RETURN id(n) AS source, id(m) AS target, sum(a.weight*b.weight) AS weight',
    validateRelationships:false,
    writeProperty:'embedding',
    relationshipWeightProperty:'weight',
    dimension:64})
""")

In [None]:
embed_df = graph.run("""
    MATCH (n) WHERE n:Person or n:Occupation RETURN n.embedding AS node_embedding, labels(n) AS label, n as nodes
""").to_data_frame()
node_embed_2d = TSNE().fit_transform(np.array([x for x in embed_df['node_embedding']]))
sns.scatterplot(x=[x[0] for x in node_embed_2d],y=[x[1] for x in node_embed_2d],hue = [x[0] for x in embed_df['label']])

In [39]:
embed_df = graph.run("""
    MATCH (n) WHERE n:Person or n:Occupation RETURN n.embedding AS node_embedding, labels(n)[0] AS label, n.urn as urn, n.title as title
""").to_data_frame()
np_embed = []
for embed in embed_df['node_embedding']:
    np_embed.append(np.array(embed))
embed_df['np_node_embedding'] = np_embed
occupation_df = embed_df.dropna(subset=['title','np_node_embedding']).reset_index()[['np_node_embedding','label','title']]
person_df = embed_df.dropna(subset=['urn','np_node_embedding']).reset_index()[['np_node_embedding','label','urn']]
print(len(occupation_df))
row_list = []
for _,row in occupation_df.iterrows():
    if row['np_node_embedding']:
        row_list.append('row')
occupation_df = pd.DataFrame(row_list,columns = ['np_node_embedding','label','title'])
print(len(occupation_df))
row_list = []
for _,row in person_df.iterrows():
    if row['np_node_embedding']:
        row_list.append('row')
person_df = pd.DataFrame(row_list,columns = ['np_node_embedding','label','urn'])
person_df.head()

Unnamed: 0,np_node_embedding,label,urn
0,"[-0.4160683751106262, 0.2690389156341553, -0.3...",Person,urn:li:member:63253473
1,"[0.0015685982070863247, 0.3663915991783142, -0...",Person,urn:li:member:47270882
2,"[0.16561155021190643, 0.49607256054878235, 0.5...",Person,urn:li:member:122496914
3,"[-0.548977792263031, 0.15332390367984772, -0.5...",Person,urn:li:member:162583050
4,"[-0.4022606611251831, 0.3237892985343933, -0.4...",Person,urn:li:member:258253757


In [40]:
def get_top_k_occupations(embedding,k):
    distances = []
    for i,embed in enumerate(occupation_df['np_node_embedding']):
        distance = np.linalg.norm(embedding-embed)
        distances.append(distance)
    distances = np.array(distances)
    idx = np.argsort(distances)
    top_k_distances = distances[idx[:k]]
    top_k_occupations = occupation_df['title'][idx[:k]]
    transposed = []
    for a,b in zip(top_k_occupations,top_k_distances):
        transposed.append([a,b])
    return pd.DataFrame(transposed,columns = ['occupation','distance'])

def get_top_k_people(embedding,k):
    distances = []
    for i,embed in enumerate(person_df['np_node_embedding']):
        distance = np.linalg.norm(embedding-embed)
        distances.append(distance)
    distances = np.array(distances)
    idx = np.argsort(distances)
    top_k_distances = distances[idx[:k]]
    top_k_people = person_df['urn'][idx[:k]]
    transposed = []
    for a,b in zip(top_k_people,top_k_distances):
        transposed.append([a,b])
    return pd.DataFrame(transposed,columns = ['urn','distance'])

def get_person_history(urn):
    return df_with_skills_cleaned[df_with_skills_cleaned['urn']==urn]['ESCO_job_history'].item()

def get_person_skills(urn):
    return df_with_skills_cleaned[df_with_skills_cleaned['urn']==urn]['skills'].item()

In [41]:
#given a person, find its most applicable occupations
urn_list = df_with_skills_cleaned['urn'].tolist()
rand_urn = random.sample(urn_list,1)[0]
k=10
print(f'We find the best suited jobs for person with urn {rand_urn}.')
print('Their job history consist of:')
for job in get_person_history(rand_urn):
    print(f'\t{job}')
print('')
print('Their skills are:')
for skill in get_person_skills(rand_urn):
    print(f'\t{skill}')
print('')
print(f'Top {k} recommended jobs are:')
occ_df = get_top_k_occupations(person_df[person_df['urn']==rand_urn]['np_node_embedding'].item(),k)
occ_df

We find the best suited jobs for person with urn urn:li:member:269949215.
Their job history consist of:
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager
	hospitality revenue manager

Their skills are:
	conduct financial audits
	forecast occupancy demand
	comply with food safety and hygiene
	produce statistical financial records
	plan medium to long term objectives
	inspect data
	monitor financial accounts
	apply numeracy skills
	perform market research
	analyse booking patterns
	handle customer complaints
	develop working procedures
	manage staff
	implement sales strategies
	ensure price competitiveness
	implement marketing strategies
	think analytically
	develop business case
	ensure cross-department cooperation
	develop revenue generation strategies
	coach employees
	develop financial statistics reports


Unnamed: 0,occupation,distance
0,tourist information centre manager,1.889223
1,travel agent,1.918077
2,hospitality revenue manager,1.953595
3,rooms division manager,2.026944
4,ICT help desk manager,2.035051
5,bed and breakfast operator,2.04088
6,quick service restaurant team leader,2.05421
7,hospitality establishment receptionist,2.075506
8,housekeeping supervisor,2.09179
9,tourism product manager,2.09573


In [None]:
#given a job find the most applicable people
job_list = job_skills_df['job_title'].tolist()
k=5
check = float('inf')
while check>1:
    try:
        rand_job = random.sample(job_list,1)[0]
        per_df = get_top_k_people(occupation_df[occupation_df['title']==rand_job]['np_node_embedding'].item(),k)
        check = per_df['distance'][0]
    except:
        print(rand_job)
        print(occupation_df[occupation_df['title']==rand_job]['np_node_embedding'])
print(f'We find the best suited people for job {rand_job}.')
print(f'Top {k} recommended people are:')
per_df

wood technology engineer
Series([], Name: np_node_embedding, dtype: object)
civil engineer
Series([], Name: np_node_embedding, dtype: object)
electromechanical engineer
2908    [-0.2983192503452301, -0.026757122948765755, 0...
Name: np_node_embedding, dtype: object
stone polisher
1665    [-0.09390656650066376, 0.16817142069339752, -0...
Name: np_node_embedding, dtype: object
civil engineer
Series([], Name: np_node_embedding, dtype: object)
market research interviewer
797    [-0.08001680672168732, -0.008976626209914684, ...
Name: np_node_embedding, dtype: object
rental manager
1373    [-0.15669380128383636, 0.20945511758327484, -0...
Name: np_node_embedding, dtype: object
import export specialist in textiles and textile semi-finished and raw materials
533    [-0.011233081109821796, 0.1542537361383438, -0...
Name: np_node_embedding, dtype: object
bulldozer operator
2396    [-0.38600319623947144, -0.13696900010108948, 0...
Name: np_node_embedding, dtype: object
groom
298    [-0.4544178247

In [38]:
#who are these people?
print('for these people, we check out their job history and skills.')
for i,person in enumerate(per_df['urn']):
    print(f'person {person}:')
    print('job history:')
    for job in get_person_history(person):
        print(f'\t{job}')
    print('')
#    print('skills:')
#    for skill in get_person_skills(person):
#        print(f'\t{skill}')
#    print('')
#    print('')

for these people, we check out their job history and skills.
person urn:li:member:369215358:
job history:
	trade regional manager
	trade regional manager
	trade regional manager
	trade regional manager
	trade regional manager
	trade regional manager

person urn:li:member:2540568:
job history:
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager
	shoe and leather accessories shop manager

person urn:li:member:107779829:
job history:
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager
	tobacco shop manager

person urn:li:member:17340979:
job history:
	project manager
	project manager
	project manager
	project manager

pers

In [205]:
#supervised learning to predict embedding based on skills.
skill_list = list(graph.run("""
    MATCH (a:skill)
    RETURN a.name as skill
""").to_data_frame()['skill'])
def get_skill_id(skill):
    for i,sk in enumerate(skill_list):
        if sk==skill:
            return i
        
def one_hot(skills):
    out = np.zeros(len(skill_list))
    for skill in skills:
        out[get_skill_id(skill)]=1
    return out

In [210]:
in_out = []
for i,row in person_df.iterrows():
    in_ = one_hot(get_person_skills(row['urn']))
    out = row['np_node_embedding']
    in_out.append([in_,out])
test_train_df = pd.DataFrame(in_out,columns=['input','target'])
test_train_df.head()

Unnamed: 0,input,target
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.18186289072036743, -0.31261634826660156, -..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.005847466178238392, -0.4316405951976776, -..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.19794590771198273, -0.846646785736084, -0.0..."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.021608319133520126, -0.5021878480911255, 0...."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.11355864256620407, -0.2245677262544632, -0...."


In [212]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(test_train_df)
df_train.head()

Unnamed: 0,input,target
382,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.23980486392974854, 0.2894839346408844, 0.05..."
4319,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.4427393078804016, -0.05583968758583069, -0..."
3561,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.29256486892700195, -0.2215687334537506, -0...."
4563,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.27869537472724915, -0.0892919972538948, -0..."
3131,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.294991672039032, -0.2395581752061844, -0.16..."


In [None]:
#
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
param_grid = [
    {'n_estimators':[30,100,300,1000], 'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators': [100,300,1000], 'max_features': [2,3,4]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
    
grid_search.fit(df_train['input'].tolist(),df_train['target'].tolist())

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'],cvres['params']):
    print(np.sqrt(-mean_score),params)
grid_search.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error
pred = grid_search.best_estimator_.predict(df_test['input'].tolist())
target = df_test['target'].tolist()
mse = mean_squared_error(pred,target)
print(mse)