# Text traitement with spacy/NLTK to extract skills

## Librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import feature_extraction
import nltk
from nltk import sent_tokenize 
from nltk.tokenize import word_tokenize,MWETokenizer 
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy import displacy
from IPython.display import HTML, display
import gensim
# nltk.download('all')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher

In [3]:
# nlp = spacy.load("en_core_web_sm")

In [4]:
nlp = spacy.load("en_core_web_lg")
stop_words=set(stopwords.words('english'))

## Read data

In [5]:
job_description = pd.read_csv("data/unbalanced.csv")
job_description.head()

Unnamed: 0,Content,source,job_title,label
0,Infoserv LLC\nData Scientist\nRemote\nEmployer...,glassdoor,Data Scientist,job_description
1,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description
2,eBay Inc.\n4.1\nData Scientist/Applied Researc...,glassdoor,Data Scientist,job_description
3,"TikTok\n3.7\nData Scientist, University Gradua...",glassdoor,Data Scientist,job_description
4,"Mastercard\n4.3\nData Scientist, AI Services -...",glassdoor,Data Scientist,job_description


In [6]:
examples1 = job_description['Content'][0]
type(examples1)

str

In [7]:
examples = list(job_description[job_description['source']=="assan"]['Content'])

## Tokenisation with nltk

### Remove punctuation

In [251]:
def remove_punct(text):
    """
    Remove punctuation from text
    input: text
    output: text
    """
    text_tok = word_tokenize(text)
    l=[]
    for word in text_tok:
        if  word not in string.punctuation:
            l.append(word)
    resultat=" ".join(l)        
    return resultat,text_tok

In [9]:
results = remove_punct(examples1)
nltk_tokens = results[1]
text = results[0]

In [10]:
type(text)

str

### Remove stopwords

In [252]:
def remove_stopword(text):
    stop=set(stopwords.words('english'))
#     text_tok = word_tokenize(text)
    l=[]
    for word in text.split():
        if not word in stop:
            l.append(word)
    resultat=" ".join(l) 
    return resultat

In [12]:
tokens_stopwords = remove_stopword(text)
tokens_stopwords

'Infoserv LLC Data Scientist Remote Employer Provided Salary 30.00 35.00 Per Hour Only W2 candidates right unable work employer Programming Skills – knowledge statistical programming languages like R Python database query languages like SQL Hive Pig desirable Familiarity Scala Java C++ added advantage Job Type Contract Pay 30.00 35.00 per hour Schedule 8 hour shift Supplemental pay types Bonus pay Work Location Remote'

In [13]:
def tokenizer(text):
    """
    transform text to a list of word
    input: text/str
    output: list of tuples containing the individual words in the sentence and their associated part-of-speech
    """
    tokenizer = MWETokenizer([('Machine', 'Learning'), ('Data','Scientist'),('Data','Engineer'),('Data','Analyst'),
                              ('Cloud','Computing'),('Artificial','Intelligence'),('Business','Managers'),
                             ('Big', 'Data'),('Big', 'Data','Developer'),('Data','Driven'),('Big','Query'),('data','lake'),
                             ('data','sources'),('job','description')],separator=' ')
    for t in sent_tokenize(text):
        x=tokenizer.tokenize(t.split())
        x = nltk.pos_tag(x)
    return(x)

In [100]:
nlp_token = tokenizer(tokens_stopwords)
# nlp_token

* Find ngrams
http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/

In [15]:
def generate_ngrams(words, ngram=2):
    return [words[i:i+ngram] for i in range(len(words)-ngram+1)]

In [16]:
words = examples1.split()
# words

In [17]:
res = generate_ngrams(words, ngram=2)
# res

In [18]:
import re
from nltk.util import ngrams

s = examples1.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 2))
# output

## Tokenisation with spacy

In [19]:
spacy_token = nlp(str(examples))
# spacy_token

* NER-Tagging — (Named Entity Recognition)

a subtask of information extraction that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.
* The Part Of Speech (POS) explains how a word is used in a sentence. 

There are eight main parts of speech — nouns, pronouns, adjectives, verbs, adverbs, prepositions, conjunctions, and interjections.

In [20]:
# for token in spacy_token:
#     print(token.text, token.ent_type_)

In [21]:
# for token in spacy_token:
#     print(token.text, token.pos_, token.tag_)

In [22]:
entities =[]
labels = []
pos_start = []
pos_end = []
for ent in spacy_token.ents:
    entities.append(ent)
    labels.append(ent.label_)
tokens_data = pd.DataFrame({"Entities":entities,"Labels":labels })
tokens_data

Unnamed: 0,Entities,Labels
0,"(Data, Engineer)",ORG
1,(GCP-),PRODUCT
2,(C2C),GPE
3,"(Wednesday, ,, April, 20, ,, 2022)",DATE
4,(10:36:49),TIME
...,...,...
1254,(99),CARDINAL
1255,(308),MONEY
1256,(Iselin),GPE
1257,(which\nimplies),DATE


 The magic of spaCy — just like that, we’ve managed to get rid of stopwords, punctuation markers, and added lemmatized words

In [23]:
from collections import Counter

In [24]:
items = [x.text for x in spacy_token.ents]
Counter(items).most_common(3)

[('IDC Technologies', 34), ('8/24/22', 20), ('408', 19)]

Only tokennize the text raise words that may be are not skills. To have a better input for our model, we need to extract skills from the text iand use it as input for our model

To extract skills 
IDEA 1 
Edward Ross's 3 part series
 * part 1: https://skeptric.com/extract-skills-1-noun-phrase/
 * part 2: https://skeptric.com/extract-skills-2-adpositions/
 * part 3: https://skeptric.com/extract-skills-3-conjugations/
 
 * Finding Types of Experience: https://skeptric.com/notebooks/Parsing%20Experience%20from%20Adzuna%20Job%20Ads.html
 
 * GitHub page of Edward Ross https://github.com/EdwardJRoss/job-advert-analysis/blob/master/notebooks/Extracting%20Role%20Title%20Words.ipynb
 

IDEA 2, after WWWC
use a pretrained or word2Vec model that you trained on the corpus to get a list of technical skills close to a list 
you could get from the internet. For each datapoint you could have a column of the most similar skills
Cluster unigrams and bigrams until you get a technical skills cluster
Some ideas
 * https://datascience.stackexchange.com/questions/30057/to-extract-the-skills-required-for-the-job-given-the-job-description/30066

Detecting the Skills

* Tokenize your raw text into words and expressions
* Remove stop words
* Encode your tokens using an embedding (Word2Vec, FastText, etc)
* Use the list from the previous step to add labels to your data (anything on the list is True, other as False)
* Train a binary classifier (Naive Bayes classifier should be good enough)
* Evaluate your model, feature set, and labels. If needed, refine and repeat.

We could look to extract:

* a series of nouns before the word experience (e.g. "subsea cable engineering experience")
* orexperience as/in something (e.g "experience as a Chef de Partie")

we'll do this using Spacy's Rule Based Matcher

### Skills extraction

#### Highlight experiences and synonyms terms  

In [25]:
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
#             if word != l.name():
            synonyms.append(l.name())
    return synonyms

In [129]:
def highlight_terms(terms, texts):
    for doc in nlp.pipe(texts):
        for sentence in set([tok.sent for tok in doc if tok.lower_ in terms]):
            text = sentence.text.strip()
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>', text)
            display(HTML(markup))
   

In [113]:
# highlight_terms(['experience','skills','title','knowlege','degree','expertise'],examples)

with the highlighting functions, we can see that skills are often before or after word experience or skill or synonyms like expertise exposure...

#### Extract words in the right of words designating experinces or skills

In [86]:
def get_conjugations(tok):
    """
    take a token and append token that are separated to it by a conjonction or a comma(its children)
    input: tok, a text  that is already tokenised
    output: list of token that are children of tok"""
    new = [tok] # list of token
    while new:
        tok = new.pop()
        yield tok
        for child in tok.children:
            if( child.dep_ == 'conj')|( child.dep_ == 'prep'):# chek if the Syntactic dependency relation equal is conjunction.
                new.append(child)

In [87]:
def get_left_span(tok, label='', include=True):
    offset = 1 if include else 0
    idx = tok.i
    while idx > tok.left_edge.i:
        if tok.doc[idx - 1].pos_ in ('NOUN', 'PROPN', 'ADJ', 'X'):
            idx -= 1
        else:
            break
    return label, idx, tok.i+offset

In [196]:
EXP_TERMS = ['experience','knowledge',"skills","language","background","familiarity","passion","degree"]
def extract_adp_conj_experience(doc, label='SKILLS'):
    for tok in doc:
        if tok.lower_ in EXP_TERMS:
            #loop into wold just after EXP_TERMS
            for child in tok.rights:
                if child.dep_ == 'prep': # if the word is tagged as a preposition
                    for obj in child.children: # take the wold after the preposition(children)
                        if obj.dep_ == 'pobj': # If  children are tagged as pobj
                            for conj in get_conjugations(obj):# take children with their conjugated(separated by a comma/conjonction)
                                yield get_left_span(conj, label)

In [197]:
def show_extraction(examples, *extractors):
    seen = set()
    for doc in nlp.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ == 'experience':
                sentence = tok.sent
                if sentence.text in seen:
                    continue
                seen.update([sentence.text])
                if not sentence.ents:
                    doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
                displacy.render(sentence, style='ent', options = {'colors': {'MISSING': 'pink',
                                                                            'EXPERIENCE': 'lightgreen',
                                                                            "SKILLS": "red"}})
                

In [198]:
# show_extraction(examples, extract_adp_conj_experience)

In [199]:
def extract_adp_conj_experience(doc,label):
    extacted_words = []
    extacted_word = []
    for tok in doc:
        if tok.lower_ ==label:
            for child in tok.rights:
                if (child.dep_ == 'prep')|(child.pos_ =="ADP"):
                    for obj in child.children:
                        if obj.dep_ == 'pobj':
                            for conj in get_conjugations(obj):
                                extacted_word.append(get_left_span(conj, label))
#                                 extacted_word.append(obj)
        extacted_words.append(extacted_word)
    return extacted_word                             

In [223]:
def extract_adp_conj_experience(doc,labels):
    extacted_words = []
    extacted_word = []
    for label in labels:
        for tok in doc:
            if tok.lower_ ==label:
                for child in tok.rights:
                    if (child.dep_ == 'prep')|(child.pos_ =="ADP"):
                        for obj in child.children:
                            if obj.dep_ == 'pobj':
                                for conj in get_conjugations(obj):
                                    extacted_word.append(conj)
#                                     extacted_word.append(child)
    return extacted_word                         

In [224]:
extract_adp_conj_experience(nlp(examples1),EXP_TERMS)

[languages, like, Scala]

In [225]:
examples1

'Infoserv LLC\nData Scientist\nRemote\nEmployer Provided Salary:$30.00 - $35.00 Per Hour\n\n Only W2 candidates right now we are unable to work with the employer\nProgramming Skills – knowledge of statistical programming languages like R, Python, and database query languages like SQL, Hive, Pig is desirable. Familiarity with Scala, Java, or C++ is an added advantage.\nJob Type: Contract\nPay: $30.00 - $35.00 per hour\nSchedule:\n8 hour shift\nSupplemental pay types:\nBonus pay\nWork Location: Remote'

In [238]:
# str(examples)

In [193]:
x = extract_adp_conj_experience(nlp(examples1),EXP_TERMS)
y = [str(i).strip() for i in x]
type(y)

list

In [194]:
highlight_terms(y[:100],examples)

#### Extract words in the left of words designating experinces or skills

In [230]:
def extract_noun_phrase_experience(doc):
    data = []
    for np in doc.noun_chunks:
        if (np[-1].lower_ == 'experience'):
            if len(np) > 1:
                data.append(np[0])
#                 yield 'EXPERIENCE', np[0].i, np[-1].i
    return data

In [250]:
x = list(nlp(examples1)) #list(nlp(str(examples)).noun_chunks)
str(x)

'[Infoserv, LLC, \n, Data, Scientist, \n, Remote, \n, Employer, Provided, Salary:$30.00, -, $, 35.00, Per, Hour, \n\n , Only, W2, candidates, right, now, we, are, unable, to, work, with, the, employer, \n, Programming, Skills, –, knowledge, of, statistical, programming, languages, like, R, ,, Python, ,, and, database, query, languages, like, SQL, ,, Hive, ,, Pig, is, desirable, ., Familiarity, with, Scala, ,, Java, ,, or, C++, is, an, added, advantage, ., \n, Job, Type, :, Contract, \n, Pay, :, $, 30.00, -, $, 35.00, per, hour, \n, Schedule, :, \n, 8, hour, shift, \n, Supplemental, pay, types, :, \n, Bonus, pay, \n, Work, Location, :, Remote]'

In [254]:
remove_punct(str(x))

('Infoserv LLC Data Scientist Remote Employer Provided Salary 30.00 35.00 Per Hour Only W2 candidates right now we are unable to work with the employer Programming Skills – knowledge of statistical programming languages like R Python and database query languages like SQL Hive Pig is desirable Familiarity with Scala Java or C++ is an added advantage Job Type Contract Pay 30.00 35.00 per hour Schedule 8 hour shift Supplemental pay types Bonus pay Work Location Remote',
 ['[',
  'Infoserv',
  ',',
  'LLC',
  ',',
  ',',
  'Data',
  ',',
  'Scientist',
  ',',
  ',',
  'Remote',
  ',',
  ',',
  'Employer',
  ',',
  'Provided',
  ',',
  'Salary',
  ':',
  '$',
  '30.00',
  ',',
  '-',
  ',',
  '$',
  ',',
  '35.00',
  ',',
  'Per',
  ',',
  'Hour',
  ',',
  ',',
  'Only',
  ',',
  'W2',
  ',',
  'candidates',
  ',',
  'right',
  ',',
  'now',
  ',',
  'we',
  ',',
  'are',
  ',',
  'unable',
  ',',
  'to',
  ',',
  'work',
  ',',
  'with',
  ',',
  'the',
  ',',
  'employer',
  ',',
  ',',
 

In [239]:
extract_noun_phrase_experience(nlp(str(examples)))

[#,
 some,
 strong,
 industry,
 overall,
 ER,
 overall,
 Knowledge,
 advantage\n\n-,
 Python.\n\n-,
 migration,
 \nPrior,
 3,
 Prior,
 Strong,
 Prior,
 Strong,
 \nSN,
 Prior,
 Strong,
 Prior,
 3,
 Prior,
 Strong,
 Prior,
 industry,
 and\nSaaS,
 security,
 data,
 large\nscale,
 AI,
 software,
 \n,
 \n,
 hands,
 \nAI,
 (,
 least,
 not,
 least,
 not,
 This,
 relevant,
 This,
 all,
 IT,
 the,
 relevant,
 and\ncausal,
 working,
 Applied,
 least,
 new,
 base,
 any,
 3,
 Prior,
 Strong,
 Prior,
 AI,
 customer,
 etc.\n•,
 Redshift\n•,
 etc.\n•,
 industry,
 hands,
 This]

In [227]:
# show_extraction(examples, extract_noun_phrase_experience)

In [38]:
def get_extractions(examples, *extractors):
    # Could use context instead of enumerate
    for idx, doc in enumerate(nlp.pipe(examples, batch_size=100, disable=['ner'])):
        for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

In [39]:
def extract_df(*extractors, n_max=None, **kwargs):
    if n_max is None:
        n_max = len(df)
    ent_df = pd.DataFrame(list(get_extractions(job_description[:n_max].Content, *extractors)),
                          columns=['text', 'docidx', 'start', 'end', 'skill', 'sent_start', 'sent_end'])
    return ent_df.merge(job_description, how='left', left_on='docidx', right_index=True)

In [40]:
extract_exps = [extract_adp_conj_experience,]
extract_exps

[<function __main__.extract_adp_conj_experience(doc, label='SKILLS')>]

In [41]:
df_ents = extract_df(*extract_exps, n_max=1000000)

In [42]:
ent_df = extract_df(extract_noun_phrase_experience, n_max=1000000)

In [50]:
all_skills = pd.concat([df_ents,ent_df],axis=0)
all_skills

Unnamed: 0,text,docidx,start,end,skill,sent_start,sent_end,Content,source,job_title,label,Id
0,Scala,0,59,60,SKILLS,57,71,Infoserv LLC\nData Scientist\nRemote\nEmployer...,glassdoor,Data Scientist,job_description,
1,multivariate statistics,1,493,495,SKILLS,490,547,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,
2,shell scripting,1,613,615,SKILLS,547,619,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,
3,following,1,624,625,SKILLS,619,646,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,
4,databases,1,658,659,SKILLS,646,665,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,
...,...,...,...,...,...,...,...,...,...,...,...,...
1904,management information Essential,1033,140,143,EXPERIENCE,91,146,Database Analyst ****K Wetherby This role sha...,Kaggle,Data Analyst,job_description,1904.0
1905,Posses solid commercial,1034,51,54,EXPERIENCE,51,69,Data Analyst / Data Analysis / Modelling / SQL...,Kaggle,Data Analyst,job_description,1905.0
1906,Solid,1034,482,483,EXPERIENCE,446,510,Data Analyst / Data Analysis / Modelling / SQL...,Kaggle,Data Analyst,job_description,1906.0
1907,desirable Data/market or modeling/intelligence,1034,499,507,EXPERIENCE,446,510,Data Analyst / Data Analysis / Modelling / SQL...,Kaggle,Data Analyst,job_description,1907.0


In [43]:
# right_skills = df_ents.groupby(by=["docidx","Content","source","job_title","label"]).agg({"text":"sum"})
# right_skills.reset_index(inplace=True)

In [44]:
# left_skills = ent_df.groupby(by=["docidx","Content","source","job_title","label"]).agg({"text":"sum"})
# left_skills.reset_index(inplace=True)

In [62]:
skills = all_skills.groupby(by=["docidx","Content","source","job_title","label"]).agg({"text":"sum"})
skills.reset_index(inplace=True)

In [63]:
skills

Unnamed: 0,docidx,Content,source,job_title,label,text
0,0,Infoserv LLC\nData Scientist\nRemote\nEmployer...,glassdoor,Data Scientist,job_description,Scala
1,1,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,multivariate statisticsshell scriptingfollowin...
2,2,eBay Inc.\n4.1\nData Scientist/Applied Researc...,glassdoor,Data Scientist,job_description,computer sciencemathScalaR
3,3,"TikTok\n3.7\nData Scientist, University Gradua...",glassdoor,Data Scientist,job_description,Software DevelopmentComputer ScienceComputer E...
4,4,"Mastercard\n4.3\nData Scientist, AI Services -...",glassdoor,Data Scientist,job_description,sideleadership
...,...,...,...,...,...,...
851,1031,Are you an experienced Data Analyst? Are you e...,Kaggle,Data Analyst,job_description,Data Analysismodelling techniquesData Processi...
852,1032,"Data Analyst Data extraction, Storage, Back u...",Kaggle,Data Analyst,job_description,data extractionstoragebackup methodologiesemai...
853,1033,Database Analyst ****K Wetherby This role sha...,Kaggle,Data Analyst,job_description,management information Essential
854,1034,Data Analyst / Data Analysis / Modelling / SQL...,Kaggle,Data Analyst,job_description,developerMicrosoft AccessSQL ServerdatabasesPo...


In [60]:
skills["text"] = skills["text"].str.split()

In [61]:
# left_skills["text"]
skills.head()

Unnamed: 0,docidx,Content,source,job_title,label,text
0,0,Infoserv LLC\nData Scientist\nRemote\nEmployer...,glassdoor,Data Scientist,job_description,[Scala]
1,1,"ExxonMobil\n3.1\nData Scientist\nClinton, NJ\n...",glassdoor,Data Scientist,job_description,"[multivariate, statisticsshell, scriptingfollo..."
2,2,eBay Inc.\n4.1\nData Scientist/Applied Researc...,glassdoor,Data Scientist,job_description,"[computer, sciencemathScalaR]"
3,3,"TikTok\n3.7\nData Scientist, University Gradua...",glassdoor,Data Scientist,job_description,"[Software, DevelopmentComputer, ScienceCompute..."
4,4,"Mastercard\n4.3\nData Scientist, AI Services -...",glassdoor,Data Scientist,job_description,[sideleadership]


In [56]:
skills["text"][3]

'Software DevelopmentComputer ScienceComputer EngineeringFinal yearrecent graduatewithSoftware DevelopmentComputer ScienceComputer Engineeringrelated technical discipline'