In [1]:
from flask import Flask,  jsonify,request
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import string

In [2]:
original_data_loaded = pd.read_csv('jobs_skills.csv') #original data loaded
original_data_loaded.columns = ['id','title', 'jobFunction', 'industry', 'skills'] # cloumns we have

enhanced_data=pd.read_csv('jobs_skills.csv') #enhanced data will be saved here
enhanced_data.columns = ['id','title', 'jobFunction', 'industry', 'skills'] # cloumns we have

enhanced_data.sample(10)# discover data we have

Unnamed: 0,id,title,jobFunction,industry,skills
64976,5da165a0e43fd1a7ab67b604,Biotechnology Executive - Fresh Graduate,"['Marketing/PR/Advertising', 'Sales/Retail']",['Healthcare and Medical Services'],"['Medical', 'Sales', 'Sales Target', 'Biotechn..."
70314,5da1700ae43fd1a7ab67cadc,Call Center Agent - Collection,['Customer Service/Support'],['Healthcare and Medical Services'],"['Accounting', 'Finance', 'Startup', 'Collecti..."
44230,5da13ddae43fd1a7ab6764f8,Customer Support Specialist,"['Customer Service/Support', 'Logistics/Supply...",['Manufacturing'],"['Customer Support', 'Logistics', 'Sales', 'Cu..."
561,5da0eea4e43fd1a7ab66ba63,Electromechanical Engineer,['Engineering - Mechanical/Electrical'],['Real Estate/Property Management'],"['Electrical Engineering', 'Design', 'Mechanic..."
38503,5da13320e43fd1a7ab674e9b,Quotation Engineer,['Engineering - Mechanical/Electrical'],"['Manufacturing', 'Energy and Utilities', 'Eng...","['Quotation Engineering', 'Electrical Engineer..."
32061,5da12726e43fd1a7ab67356f,Senior General Ledger Accountant - Hurghada,['Accounting/Finance'],['Real Estate/Property Management'],"['Auditing', 'ACCA', 'Accounting', 'Business A..."
1535,5da0f04fe43fd1a7ab66be32,Senior Planning Specialist,['Engineering - Construction/Civil/Architectur...,['Construction - Residential & Commercial/Offi...,"['Planning Engineering', 'Planning', 'Civil En..."
18747,5da10ec2e43fd1a7ab67016d,Warehouse Keeper,['Logistics/Supply Chain'],['Manufacturing'],"['Logistics', 'Microsoft Word', 'Warehousing',..."
38144,5da13273e43fd1a7ab674d32,Architect - Working Drawing + BIM,['Engineering - Construction/Civil/Architecture'],"['Consulting Services', 'Engineering Services'...","['NFPA', 'Tendering', 'REVIT', 'ADA', 'Civil E..."
67423,5da16a62e43fd1a7ab67bf90,Real Estate Sales Specialist,"['Customer Service/Support', 'Sales/Retail']",['Real Estate/Property Management'],"['Customer Support', 'Sales', 'Property', 'Rea..."


In [3]:
#removing special characters
enhanced_data['title']=enhanced_data['title'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['jobFunction']=enhanced_data['jobFunction'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['industry']=enhanced_data['industry'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['skills']=enhanced_data['skills'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data.sample(5)

Unnamed: 0,id,title,jobFunction,industry,skills
54327,5da15122e43fd1a7ab678c68,Mathematics Content Developer,"['Writing/Editorial', 'Education/Teaching']","['Education', 'E-Learning']","['Mathematics', 'Educational Content', 'Design..."
78622,5da17fcde43fd1a7ab67eb4f,Accountant,['Accounting/Finance'],"['Accounting and Auditing Services', 'Manufact...","['IFRS', 'Accounting', 'Financial Analysis', '..."
39863,5da135ace43fd1a7ab6753e9,Sales Executive,['Sales/Retail'],['Real Estate/Property Management'],"['Marketing Plans', 'Sales', 'Sales Target', '..."
82497,5da18717e43fd1a7ab67fa73,Customer Service Manager,['Customer Service/Support'],['Real Estate/Property Management'],"['Customer Support', 'Customer Service', 'Comp..."
9025,5da0fd22e43fd1a7ab66db73,Admin Assistant,['Administration'],['Information Technology Services'],"['Business Administration', 'Administration', ..."


In [4]:
#tokenization will done here too
#lower case all letters
enhanced_data['title'] = enhanced_data['title'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
#enhanced_data['jobFunction'] = enhanced_data['jobFunction'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
#enhanced_data['industry'] = enhanced_data['industry'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
enhanced_data['skills'] = enhanced_data['skills'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
enhanced_data

Unnamed: 0,id,title,jobFunction,industry,skills
0,5da0edb0e43fd1a7ab66b833,"[property, consultant]",['Sales/Retail'],['Real Estate/Property Management'],"[sales, retail, real, estate, sales, target, i..."
1,5da0edb0e43fd1a7ab66b834,"[sales, representative, real, estate]","['Customer Service/Support', 'Sales/Retail']",['Real Estate/Property Management'],"[sales, real, estate, sales, target, customer,..."
2,5da0edb1e43fd1a7ab66b835,[receptionist],['Administration'],['Real Estate/Property Management'],"[admin, work, office, management, administrati..."
3,5da0edafe43fd1a7ab66b831,"[senior, property, consultant]",['Sales/Retail'],['Real Estate/Property Management'],"[sales, real, estate, sales, target, computer,..."
4,5da0edaee43fd1a7ab66b82f,"[senior, seo, specialist]","['Marketing/PR/Advertising', 'Media/Journalism...","['Real Estate/Property Management', 'Marketing...","[marketing, campaigns, e, marketing, digital, ..."
...,...,...,...,...,...
99995,5da1a8f7e43fd1a7ab683ecd,"[applications, unlimited, sales, representativ...","['IT/Software Development', 'Sales/Retail']","['Information Technology Services', 'Computer ...","[peoplesoft, sales, computer, science, outdoor..."
99996,5da1a8f8e43fd1a7ab683ece,"[senior, netsuite, solution, engineer, with, a...","['IT/Software Development', 'Engineering - Tel...","['Information Technology Services', 'Computer ...","[computer, science, saas, erp, telecommunicati..."
99997,5da1a8f9e43fd1a7ab683ed1,"[sales, representative, alexandria]",['Sales/Retail'],"['Education', 'Business Services - Other', 'Tr...","[customer, service, customer, care, sales, ski..."
99998,5da1a8fae43fd1a7ab683ed2,"[english, instructor, 6th, of, october, part, ...","['Training/Instructor', 'Education/Teaching']",['Education'],"[translation, linguistics, iqp, education, tra..."


In [5]:

lis=list()
lis= "['sales', 'retail', 'real', 'estate', 'sales', 'target', 'indoor', 'sales', 'sales', 'skills', 'property', 'sales']"
print(lis)

print(enhanced_data['skills'][0])

if (enhanced_data['skills'][0] == lis):
    print('yes')
else:
    print('no')

['sales', 'retail', 'real', 'estate', 'sales', 'target', 'indoor', 'sales', 'sales', 'skills', 'property', 'sales']
['sales', 'retail', 'real', 'estate', 'sales', 'target', 'indoor', 'sales', 'sales', 'skills', 'property', 'sales']
no


In [6]:
data= pd.read_csv("job_skills.csv")
data

Unnamed: 0.1,Unnamed: 0,id,title,jobFunction,industry,skills
0,0,5da0edb0e43fd1a7ab66b833,"['property', 'consultant']",['Sales/Retail'],['Real Estate/Property Management'],"['sales', 'retail', 'real', 'estate', 'sales',..."
1,1,5da0edb0e43fd1a7ab66b834,"['sales', 'representative', 'real', 'estate']","['Customer Service/Support', 'Sales/Retail']",['Real Estate/Property Management'],"['sales', 'real', 'estate', 'sales', 'target',..."
2,2,5da0edb1e43fd1a7ab66b835,['receptionist'],['Administration'],['Real Estate/Property Management'],"['admin', 'work', 'office', 'management', 'adm..."
3,3,5da0edafe43fd1a7ab66b831,"['senior', 'property', 'consultant']",['Sales/Retail'],['Real Estate/Property Management'],"['sales', 'real', 'estate', 'sales', 'target',..."
4,4,5da0edaee43fd1a7ab66b82f,"['senior', 'seo', 'specialist']","['Marketing/PR/Advertising', 'Media/Journalism...","['Real Estate/Property Management', 'Marketing...","['marketing', 'campaigns', 'e', 'marketing', '..."
...,...,...,...,...,...,...
99995,99995,5da1a8f7e43fd1a7ab683ecd,"['applications', 'unlimited', 'sales', 'repres...","['IT/Software Development', 'Sales/Retail']","['Information Technology Services', 'Computer ...","['peoplesoft', 'sales', 'computer', 'science',..."
99996,99996,5da1a8f8e43fd1a7ab683ece,"['senior', 'netsuite', 'solution', 'engineer',...","['IT/Software Development', 'Engineering - Tel...","['Information Technology Services', 'Computer ...","['computer', 'science', 'saas', 'erp', 'teleco..."
99997,99997,5da1a8f9e43fd1a7ab683ed1,"['sales', 'representative', 'alexandria']",['Sales/Retail'],"['Education', 'Business Services - Other', 'Tr...","['customer', 'service', 'customer', 'care', 's..."
99998,99998,5da1a8fae43fd1a7ab683ed2,"['english', 'instructor', '6th', 'of', 'octobe...","['Training/Instructor', 'Education/Teaching']",['Education'],"['translation', 'linguistics', 'iqp', 'educati..."


In [16]:
data['skills'][15]

"['analysis', 'business', 'administration', 'reporting', 'packages', 'data', 'collection', 'business', 'analysis']"

In [7]:
data['skills'][5]

"['customer', 'support', 'sales', 'property', 'real', 'estate', 'customer', 'service', 'customer', 'care']"

# Doc2Vec

In [8]:
# Create the tagged document needed for Doc2Vec
import gensim
from gensim.models import Word2Vec
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

train_data = list(create_tagged_document(enhanced_data['skills']))

print(train_data[2156])
#> [TaggedDocument(words=['anarchism', 'originated', ... 'social', 'or', 'emotional'], tags=[0])]

TaggedDocument(['medical', 'veterinary', 'calves', 'veterinary', 'medical'], [2156])


In [9]:
# Init the Doc2Vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=60, min_count=0, epochs=10)

# Build the Volabulary
model.build_vocab(train_data)

# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
print(model.infer_vector(['Sales', 'Real Estate', 'Sales Target']))

[-2.0712570e-03  2.4500662e-03  3.2033210e-03  5.7607316e-03
  4.2410325e-03 -8.8395190e-04 -3.0653812e-03 -7.8364508e-03
 -2.6085405e-03 -7.0888799e-04 -3.4298692e-03 -9.6072030e-04
  3.1590736e-03  5.4945964e-03 -7.3045488e-03 -8.1713693e-03
 -7.7010258e-03 -5.5852588e-03  4.8673004e-03  2.4651806e-03
  8.7771466e-04  3.6154406e-03  6.5855281e-03  4.5196693e-03
 -6.0599054e-05  6.3070650e-03 -7.7849901e-03 -7.7988505e-03
 -1.0094794e-03 -1.0779860e-03  1.2444756e-03  6.7080455e-03
  7.3749870e-03  3.8634592e-03 -5.7165180e-03  2.6038187e-03
 -7.3319660e-03  4.6930085e-03  5.6940871e-03 -7.7121858e-03
 -4.7379034e-03 -2.5826681e-04  8.2526216e-03  7.2660861e-03
 -6.3481233e-03  6.5678176e-03 -7.5653628e-03  4.6532722e-03
  5.7943785e-03  3.9861002e-03  5.6279949e-03 -3.8594827e-03
 -4.3370519e-03  6.2152254e-03 -3.8127892e-03  3.9333063e-03
 -4.4246987e-03  2.6482590e-03 -4.6684360e-03 -1.0147508e-03]


In [11]:
from gensim.matutils import softcossim
from gensim import corpora

In [21]:
model.n_similarity(['analysis', 'business', 'administration', 'reporting', 'packages', 'data', 'collection', 'business', 'analysis'],['sales', 'veterinary', 'calves', 'veterinary', 'medical'])

  """Entry point for launching an IPython kernel.


0.4709833

In [13]:
#function get name of job 
def job(skill = [], *args):
    for idx in enhanced_data.index:
        if skill==enhanced_data['skills'][idx]:
            print(enhanced_data['title'][idx])
            break
            
job(['administration', 'biling'])

['billing', 'specialist']


# recommendation using n_similarity

In [14]:
def recommended(skill = [] ,*args):
    li=list()
    for idx in enhanced_data.index:
        sim=model.n_similarity(skill,enhanced_data['skills'][idx])
        #title=job(enhanced_data['skills'][idx])
        if sim>0.97:
            print("Recommended jobs: ")
            print(job(enhanced_data['skills'][idx]))
            print(sim)
            
        
    
    #for i in  li:
      #  print(li[i])
    

In [15]:
recommended(['training', 'real', 'estate', 'sales'])

  after removing the cwd from sys.path.


Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0
Recommended jobs: 
['sales', 'trainer']
None
1.0


In [19]:
#  gap analysis results
def gap_analysis(skill = [] ,*args):
    li=list()
    for idx in enhanced_data.index:
        sim=model.wv.n_similarity(skill,enhanced_data['skills'][idx])
        #result=model.infer_vector(skill) - model.infer_vector(enhanced_data['skills'][idx])
        gap=list(set(skill).symmetric_difference(enhanced_data['skills'][idx]))
        #title=job(enhanced_data['skills'][idx])
        if sim > 0.93:
            print("Recommended job and gap analysis: ")
            print(job(enhanced_data['skills'][idx]))
            print(gap)
            
        
    
   


In [20]:
gap_analysis(['training', 'real', 'estate', 'sales'])

Recommended job and gap analysis: 
['sales', 'trainer']
None
[]
Recommended job and gap analysis: 
['business', 'development', 'executive', 'real', 'estate']
None
['business', 'development']
Recommended job and gap analysis: 
['business', 'development', 'executive', 'real', 'estate']
None
['business', 'development']
Recommended job and gap analysis: 
['sales', 'trainer']
None
[]
Recommended job and gap analysis: 
['business', 'development', 'executive', 'real', 'estate']
None
['business', 'development']
Recommended job and gap analysis: 
['sales', 'trainer']
None
[]
Recommended job and gap analysis: 
['business', 'development', 'executive', 'real', 'estate']
None
['business', 'development']
Recommended job and gap analysis: 
['sales', 'trainer']
None
[]
Recommended job and gap analysis: 
['sales', 'trainer']
None
[]
Recommended job and gap analysis: 
['business', 'development', 'executive', 'real', 'estate']
None
['business', 'development']
Recommended job and gap analysis: 
['sales', 