# Dependencies 

In [2]:
from models import InferSent
import torch

import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re

from scipy import spatial

import requests
import urllib.parse

import geopy.distance
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

# Downloading the pretrained model and vectors

In [None]:
#saving the trained model and pre-trained GLoVe word vectors
! mkdir encoder
! curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl #glove model
! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl #fasttext model
  
! mkdir GloVe
! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
! unzip GloVe/glove.840B.300d.zip -d GloVe/

# Reading the data

In [3]:
# Load the candidate table (columns shown below: id, job_title, location, connection, fit)
# and display it as the cell output.
df = pd.read_csv('potential-talents.csv')
df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [4]:
#using openstreetmap api to get the latitude and longitude of the location - which will be later used for ranking
def location(city_name):
    """Geocode a free-text place name with the OpenStreetMap Nominatim search API.

    Parameters
    ----------
    city_name : str
        Place name to look up (e.g. ``'Houston, Texas'``).

    Returns
    -------
    tuple of (float, float)
        ``(lat, lon)`` of the first search result.

    Raises
    ------
    requests.RequestException
        If the request fails, times out or returns an HTTP error status.
    ValueError
        If the API returns no result for ``city_name``.
    """
    # The query goes in the `q` parameter (requests handles the URL encoding) rather than
    # being spliced into the URL path.
    response = requests.get(
        'https://nominatim.openstreetmap.org/search',
        params={'q': city_name, 'format': 'json'},
        # NOTE(review): the public Nominatim service asks clients to identify themselves;
        # replace this placeholder with a project-specific identifier.
        headers={'User-Agent': 'potential-talents-notebook'},
        timeout=10,  # never hang the notebook on an unresponsive server
    )
    response.raise_for_status()
    results = response.json()
    if not results:
        # An empty list means the place name was not recognised; fail with a clear message
        # instead of an opaque IndexError.
        raise ValueError('No geocoding result for location: %r' % (city_name,))
    best_match = results[0]
    return float(best_match['lat']), float(best_match['lon'])

In [5]:
#testing if all location is recognized by openstreetmap api
# Reuse location() instead of repeating its request/parsing logic, and catch Exception
# rather than using a bare except so that interrupting the kernel still works.
locs = df.location.unique()
for loc in locs:
    try:
        location(loc)
    except Exception:
        print(loc,': No')
    else:
        print(loc,': Yes')

Houston, Texas : Yes
Kanada : Yes
Raleigh-Durham, North Carolina Area : Yes
Denton, Texas : Yes
İzmir, Türkiye : Yes
Greater New York City Area : No
San Francisco Bay Area : Yes
Greater Philadelphia Area : Yes
Lake Forest, California : Yes
Houston, Texas Area : Yes
Atlanta, Georgia : Yes
Chicago, Illinois : Yes
Austin, Texas Area : Yes
Jackson, Mississippi Area : Yes
Greater Grand Rapids, Michigan Area : No
Virginia Beach, Virginia : Yes
Monroe, Louisiana Area : Yes
Greater Boston Area : Yes
San Jose, California : Yes
New York, New York : Yes
Dallas/Fort Worth Area : Yes
Amerika Birleşik Devletleri : Yes
Baton Rouge, Louisiana Area : Yes
Myrtle Beach, South Carolina Area : Yes
Chattanooga, Tennessee Area : Yes
Los Angeles, California : Yes
Highland, California : Yes
Gaithersburg, Maryland : Yes
Baltimore, Maryland : Yes
Milpitas, California : Yes
Greater Atlanta Area : Yes
Greater Chicago Area : Yes
Torrance, California : Yes
Long Beach, California : Yes
Bridgewater, Massachusetts : Yes

Three entries are not recognized by the API:
- 'Greater New York City Area' 
- 'Greater Grand Rapids, Michigan Area' 
- 'Greater Los Angeles Area'

These are replaced in the original dataframe with the following, respectively:
- New York 
- Michigan
- Los Angeles

In [6]:
# Swap each location string the API could not resolve for a simpler name it does recognise.
location_fixes = {
    'Greater New York City Area': 'New York',
    'Greater Grand Rapids, Michigan Area': 'Michigan',
    'Greater Los Angeles Area': 'Los Angeles',
}
df['location'] = df['location'].replace(location_fixes)

In [7]:
#sentence preprocessing function
def sentence_transform(sentences):
    """Normalise one sentence for embedding.

    Every character that is not an ASCII letter is replaced by a space, the text is
    lower-cased and tokenised, English stop words are dropped, the remaining words are
    lemmatised, and the result is joined back with single spaces.

    Parameters
    ----------
    sentences : str
        A single sentence (despite the plural name).

    Returns
    -------
    str
        The cleaned sentence.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once per call; the original rebuilt it for every token.
    stop_words = set(stopwords.words('english'))
    letters_only = re.sub('[^a-zA-Z]', ' ', sentences) #substituting all non-alphabets in a sentence with space
    tokens = nltk.word_tokenize(letters_only.lower()) #lowercase, then word tokenization
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] #lemmatization
    return ' '.join(lemmas) #joining the words to construct the sentence

In [11]:
#ranking function
def ranking(df, job_query, location_query = None):
    """Score every row of ``df`` against a job query and, optionally, a location query.

    The function adds/overwrites these columns on ``df`` **in place** and returns the same
    object:

    * ``job_title_ranking`` - cosine similarity between the InferSent embedding of the
      preprocessed job title and that of the preprocessed ``job_query``.
    * ``location_ranking`` (only when ``location_query`` is given) - geodesic distance in km
      between the candidate location and ``location_query``, min-max scaled and inverted so
      that the closest candidate gets 1 and the farthest gets 0.
    * ``fit`` - the final score (see the ranking logic below).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``job_title`` and, when ``location_query`` is given, ``location``.
    job_query : str
        Free-text description of the wanted job title.
    location_query : str, optional
        Place name to rank candidates by proximity to. ``None`` ranks on job title only.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns described above.
    """
    #loading model and word embeddings
    V = 1
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    model = InferSent(params_model)
    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    model.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)

    #sentence pre-processing (each title is preprocessed once and reused below)
    sentences = [sentence_transform(title) for title in df['job_title']]
    query_sentence = sentence_transform(job_query)

    #build the vocab
    # The query is included so that query words which appear in no job title still
    # receive a word vector instead of being left out of the vocabulary.
    model.build_vocab(sentences + [query_sentence], tokenize=True)

    #query embeddings
    query_vec = model.encode([query_sentence])[0]

    #calculating cosine similarity
    # Titles are encoded one at a time, exactly as before, and duplicates are encoded once.
    # NOTE(review): encoding in a single batch may change the values because of padding
    # inside the encoder - confirm against the InferSent implementation before batching.
    similarity_by_sentence = {}
    for sentence in sentences:
        if sentence not in similarity_by_sentence:
            title_vec = model.encode([sentence])[0]
            similarity_by_sentence[sentence] = 1 - spatial.distance.cosine(query_vec, title_vec)
    # Whole-column assignment is positional, so it works for any index and avoids the
    # chained `df[col].loc[idx] = ...` writes that pandas does not guarantee to apply.
    df['job_title_ranking'] = [similarity_by_sentence[sentence] for sentence in sentences]

    #ranking logic
        #if job title ranking is greater than 0.5, both job title and location have equal contribution to the fit rank
        #if job title ranking is less than 0.5, then job title accounts for 90% to the fit rank
    if location_query is not None:
        #calculating geographical distance using openstreetmap api
        # Geocode the query once and every distinct candidate location once, instead of
        # issuing two HTTP requests per row.
        query_coords = location(location_query)
        coords_by_location = {place: location(place) for place in df['location'].unique()}
        df['location_ranking'] = [
            geopy.distance.distance(query_coords, coords_by_location[place]).km
            for place in df['location']
        ]

        min_max_scaler = MinMaxScaler()
        scaled_distance = min_max_scaler.fit_transform(df[['location_ranking']]).ravel()
        df['location_ranking'] = 1 - scaled_distance #ranking the geographical distance b/w 0 and 1

        job_score = df['job_title_ranking']
        location_score = df['location_ranking']
        df['fit'] = np.where(
            job_score > 0.5,
            0.5 * job_score + 0.5 * location_score,
            0.9 * job_score + 0.1 * location_score,
        )
    else:
        df['fit'] = df['job_title_ranking'] #if location is not included in query

    return df

In [13]:
#initial ranking
job_query = 'seeking human resources'
location_query = 'texas'
# ranking() writes its score columns into the frame it receives, so hand it a copy:
# the raw dataframe loaded above stays untouched and this cell can be re-run safely.
initial_rank = ranking(df.copy(), job_query, location_query)
initial_rank

Found 172(/181) words with w2v vectors
Vocab size : 172


Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.451804,0.393391,9.775191e-01
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.292748,0.248240,6.933263e-01
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.813427,0.800410,8.264443e-01
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,9.886667e-01
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.273993,0.304437,1.110223e-16
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.721292,0.534105,9.084785e-01
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.838018,0.849592,8.264443e-01
101,102,Business Intelligence and Analytics at Travelers,New York,49,0.647383,0.519763,7.750032e-01
102,103,Always set them up for Success,Los Angeles,500+,0.399766,0.349830,8.491880e-01


In [14]:
#top 50 candidates of the initial ranking, best fit first
initial_rank.sort_values(by='fit', ascending=False).head(50)

Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.904189,0.943102,0.865276
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.904189,0.943102,0.865276
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.900825,0.929863,0.871787
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.855082,0.735719,0.974446
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.838018,0.849592,0.826444
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.836049,0.697651,0.974446
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.836049,0.697651,0.974446
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.833085,0.80041,0.86576
93,94,Seeking Human Resources Opportunities. Open t...,Amerika Birleşik Devletleri,415,0.81461,0.701601,0.92762
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.813427,0.80041,0.826444


In [15]:
#candidates ranked 50-100 in the initial ranking
initial_rank.sort_values(by='fit', ascending=False).iloc[50:100]

Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,0.988667
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,0.988667
33,34,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,0.988667
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,0.988667
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.530772,0.479895,0.988667
69,70,"Retired Army National Guard Recruiter, office ...","Virginia Beach, Virginia",82,0.521476,0.490478,0.800463
55,56,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.51109,0.470254,0.878619
64,65,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.51109,0.470254,0.878619
42,43,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.51109,0.470254,0.878619
12,13,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.51109,0.470254,0.878619


In [16]:
#suppose from the above we notice that id 82 is the ideal candidate (19th in top 50)
job_query = initial_rank.loc[81, 'job_title']
location_query = initial_rank.loc[81, 'location']
# ranking() overwrites the score columns of the frame it is given; pass a copy so that
# initial_rank keeps the initial scores and re_rank is an independent result.
re_rank = ranking(initial_rank.copy(), job_query, location_query)
re_rank

Found 172(/181) words with w2v vectors
Vocab size : 172


Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.825222,0.672960,9.774830e-01
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.492106,0.472771,6.661207e-01
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.824122,0.826465,8.217797e-01
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.761336,0.553975,9.686977e-01
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.376484,0.418315,1.110223e-16
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.813631,0.731053,8.962081e-01
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.721848,0.621916,8.217797e-01
101,102,Business Intelligence and Analytics at Travelers,New York,49,0.679043,0.589998,7.680892e-01
102,103,Always set them up for Success,Los Angeles,500+,0.416930,0.373040,8.119406e-01


In [17]:
#top 50 candidates after re-ranking, best fit first
re_rank.sort_values(by='fit', ascending=False).head(50)

Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
81,82,Aspiring Human Resources Professional | An ene...,"Austin, Texas Area",174,1.0,1.0,1.0
65,66,Experienced Retail Manager and aspiring Human ...,"Austin, Texas Area",57,0.896681,0.793362,1.0
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.889009,0.803448,0.974569
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.882551,0.790533,0.974569
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.882551,0.790533,0.974569
71,72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.859758,0.775664,0.943851
66,67,"Human Resources, Staffing and Recruiting Profe...","Jackson, Mississippi Area",500+,0.849796,0.784002,0.91559
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.839688,0.826465,0.852911
78,79,Liberal Arts Major. Aspiring Human Resources A...,"Baton Rouge, Louisiana Area",7,0.829781,0.720125,0.939438
14,15,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.825222,0.67296,0.977483


In [18]:
#candidates ranked 50-100 after re-ranking
re_rank.sort_values(by='fit', ascending=False).iloc[50:100]

Unnamed: 0,id,job_title,location,connection,fit,job_title_ranking,location_ranking
42,43,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.731207,0.588008,0.874407
64,65,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.731207,0.588008,0.874407
55,56,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.731207,0.588008,0.874407
12,13,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.731207,0.588008,0.874407
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.728119,0.676215,0.7800236
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.728119,0.676215,0.7800236
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.728119,0.676215,0.7800236
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.728119,0.676215,0.7800236
85,86,Information Systems Specialist and Programmer ...,"Gaithersburg, Maryland",4,0.727795,0.6573,0.7982895
70,71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.724642,0.627505,0.8217797
