In [187]:
import spacy
from spacy.lang.en.examples import sentences
import pandas as pd
import nltk
# For Text Classification
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density

# Load the English language model in Spacy
# run `python -m spacy download en_core_web_sm` on the command line if you receive an error message
nlp = spacy.load("en_core_web_sm")
# Semantic Parsing
nltk.download('treebank')
nltk.download('propbank')

[nltk_data] Downloading package treebank to /Users/bpayne/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package propbank to /Users/bpayne/nltk_data...
[nltk_data]   Package propbank is already up-to-date!


True

# Syntactic Parsing

In [188]:
# Syntactic Parsing (NLTK implementation)
sentence = "The dog saw the man in the park"

## Syntactic categories are used to reflect grammatical structures within a sentence.

i. S = sentence
ii. NP = Noun phrase
iii. VP = Verb phrase
iv. PP = Prepositional Phrase
v. Det = Determiner
vi.	N = Noun
vii. V = Verb
viii. P = Preposition


In [189]:
# A sentence for parsing (with known ambiguity)
# "The dog saw the man in the park"

# Define a grammar in Chomsky Normal Form
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N | NP PP
    VP -> V NP | VP PP
    PP -> P NP
    Det -> 'the' | 'The'
    N -> 'dog' | 'man' | 'park'
    V -> 'saw'
    P -> 'in'
""")

# Create a parser using the grammar
parser = nltk.ChartParser(grammar)

# Tokenize the sentence
t = nltk.word_tokenize(sentence)

# Parse the sentence
for tree in parser.parse(t):
    tree.pretty_print()


                 S                          
      ___________|_______                    
     |                   VP                 
     |            _______|_______            
     |           VP              PP         
     |        ___|___         ___|___        
     NP      |       NP      |       NP     
  ___|___    |    ___|___    |    ___|___    
Det      N   V  Det      N   P  Det      N  
 |       |   |   |       |   |   |       |   
The     dog saw the     man  in the     park

                 S                          
      ___________|_______                    
     |                   VP                 
     |        ___________|___                
     |       |               NP             
     |       |        _______|___            
     |       |       |           PP         
     |       |       |        ___|___        
     NP      |       NP      |       NP     
  ___|___    |    ___|___    |    ___|___    
Det      N   V  Det      N   P  Det      N  

### Syntactic Parsing (explained)
The two trees above reflect the ambiguity in the sentence

Structural ambiguity is demonstrated when sentences can be interpreted in more than one way, for instance, when a PP (Prepositional Phrase) can be attached to either a NP (Noun Phrase) or a VP (Verb Phrase), which changes the meaning of the sentence. An example sentence presented in [1] is “The dog saw the man in the park” which can either mean that the man was in the park when the dog saw him, or the dog was in the park when it saw the man.

References:
[1] Bird, S., Klein, E., Loper, E. “Natural Language Processing with Python” https://www.nltk.org/book/ (accessed June 19, 2023)

# Dependency Parsing
## Noun Chunks

In [190]:
# Parse (Spacy Implementation)
doc = nlp(sentences[2])

# Check whether the object has been parsed
print("Parsed: ", doc.has_annotation("DEP"), "\n")

# Print the sentence
print ("Full Text: ", doc.text, '\n')

# Noun Chunks
for chunk in doc.noun_chunks:
    print("Text Chunk: ", chunk.text, "\n", "Root Text: ", chunk.root.text, "\n", "Root Dep: ", chunk.root.dep_, "\n", "Root Head Text: ",chunk.root.head.text)


Parsed:  True 

Full Text:  San Francisco considers banning sidewalk delivery robots 

Text Chunk:  San Francisco 
 Root Text:  Francisco 
 Root Dep:  nsubj 
 Root Head Text:  considers
Text Chunk:  sidewalk delivery robots 
 Root Text:  robots 
 Root Dep:  dobj 
 Root Head Text:  banning


# Dependency Parsing (continued)
## Navigating the Parse Tree

In [191]:
# Navigate the Parse Tree
for token in doc:
        print("Token: ", token.text, "\n", "Token Dep: ", token.dep_, "\n", "Token Head Text: ", token.head.text, "\n", "Token Head POS: ", token.head.pos_, "\n", [child for child in token.children])

Token:  San 
 Token Dep:  compound 
 Token Head Text:  Francisco 
 Token Head POS:  PROPN 
 []
Token:  Francisco 
 Token Dep:  nsubj 
 Token Head Text:  considers 
 Token Head POS:  VERB 
 [San]
Token:  considers 
 Token Dep:  ROOT 
 Token Head Text:  considers 
 Token Head POS:  VERB 
 [Francisco, banning]
Token:  banning 
 Token Dep:  xcomp 
 Token Head Text:  considers 
 Token Head POS:  VERB 
 [robots]
Token:  sidewalk 
 Token Dep:  compound 
 Token Head Text:  delivery 
 Token Head POS:  NOUN 
 []
Token:  delivery 
 Token Dep:  compound 
 Token Head Text:  robots 
 Token Head POS:  NOUN 
 [sidewalk]
Token:  robots 
 Token Dep:  dobj 
 Token Head Text:  banning 
 Token Head POS:  VERB 
 [delivery]


In [192]:
# Visualizing dependencies
from spacy import displacy

displacy.render(doc, style='dep')

## Dependency Parsing (explained)

When the sentence "San Francisco considers banning sidewalk delivery robots" is parsed to display dependency parsing, each line reflects a single token in that sentence with the text, the dependency label and its head (or the token it depends on).

In [201]:
# Beginning of Text Classification

data=pd.read_csv('../../data/resume-kaggle.csv')
resume_data=data.drop(columns=["ID","Resume_html"])
resume_data.head(10)


Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR
5,HR GENERALIST Summary Dedic...,HR
6,HR MANAGER Summary HUMAN RES...,HR
7,HR MANAGER Professional Summa...,HR
8,HR SPECIALIST Summary Posses...,HR
9,HR CLERK Summary Translates ...,HR


# Stemming

In [202]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

pstemmer = PorterStemmer()

stemmer = SnowballStemmer(language='english')
toks = ['calculate', 'calculator', 'calculated', 'calculating']

# Porter
print("Porter Stemmer", "\n")
for tok in toks:
    print(tok + ' ----> ' + pstemmer.stem(tok))

print("\n", "Snowball Stemmer", "\n")
# Snowball
for tok in toks:
    print(tok + ' ---> ' + stemmer.stem(tok))



Porter Stemmer 

calculate ----> calcul
calculator ----> calcul
calculated ----> calcul
calculating ----> calcul

 Snowball Stemmer 

calculate ---> calcul
calculator ---> calcul
calculated ---> calcul
calculating ---> calcul


In [204]:
## Lemmatization

# Tokenization
resume_data['tokenized_text'] = resume_data['Resume_str'].apply(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
resume_data['filtered_text'] = resume_data['tokenized_text'].apply(lambda x: [word.lower() for word in x if word.lower() not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
resume_data['lemmatized_text'] = resume_data['filtered_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Convert lemmatized text back to string
resume_data['processed_text'] = resume_data['lemmatized_text'].apply(lambda x: ' '.join(x))

resume_data.head()

Unnamed: 0,Resume_str,Category,tokenized_text,filtered_text,lemmatized_text,processed_text
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR,"[HR, ADMINISTRATOR/MARKETING, ASSOCIATE, HR, A...","[hr, administrator/marketing, associate, hr, a...","[hr, administrator/marketing, associate, hr, a...",hr administrator/marketing associate hr admini...
1,"HR SPECIALIST, US HR OPERATIONS ...",HR,"[HR, SPECIALIST, ,, US, HR, OPERATIONS, Summar...","[hr, specialist, ,, us, hr, operations, summar...","[hr, specialist, ,, u, hr, operation, summary,...","hr specialist , u hr operation summary versati..."
2,HR DIRECTOR Summary Over 2...,HR,"[HR, DIRECTOR, Summary, Over, 20, years, exper...","[hr, director, summary, 20, years, experience,...","[hr, director, summary, 20, year, experience, ...",hr director summary 20 year experience recruit...
3,HR SPECIALIST Summary Dedica...,HR,"[HR, SPECIALIST, Summary, Dedicated, ,, Driven...","[hr, specialist, summary, dedicated, ,, driven...","[hr, specialist, summary, dedicated, ,, driven...","hr specialist summary dedicated , driven , dyn..."
4,HR MANAGER Skill Highlights ...,HR,"[HR, MANAGER, Skill, Highlights, HR, SKILLS, H...","[hr, manager, skill, highlights, hr, skills, h...","[hr, manager, skill, highlight, hr, skill, hr,...",hr manager skill highlight hr skill hr departm...


# Text Classification

In [205]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(resume_data['processed_text'], resume_data['Category'], test_size=0.2, random_state=42)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train a LinearSVC model
lsvc = LinearSVC(C=0.1, dual=False, max_iter=1000)
lsvc.fit(X_train_tfidf, y_train)

# Train a Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

# Train a K-Nearest Neighbour
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train_tfidf, y_train)

# Train a Naive Bayes model
cnb = ComplementNB(alpha=0.1)
cnb.fit(X_train_tfidf, y_train)

# Train a Nearest Centroid model
nc = NearestCentroid()
nc.fit(X_train_tfidf, y_train)

# Train a Ridge Classifier model
rc = RidgeClassifier(alpha=1.0, solver="sparse_cg")
rc.fit(X_train_tfidf, y_train)

# Transform the testing data
X_test_tfidf = vectorizer.transform(X_test)

# Evaluate the models
lsvc_accuracy = lsvc.score(X_test_tfidf, y_test)
rf_accuracy = rf.score(X_test_tfidf, y_test)
knn_accuracy = knn.score(X_test_tfidf, y_test)
cnb_accuracy = cnb.score(X_test_tfidf, y_test)
nc_accuracy = nc.score(X_test_tfidf, y_test)
rc_accuracy = rc.score(X_test_tfidf, y_test)


In [206]:
# Print the results
print('Linear SVC Accuracy:', lsvc_accuracy)
print('Random Forest Accuracy', rf_accuracy)
print('KNN Accuracy', knn_accuracy)
print('Complement Naive Bayes Accuracy', cnb_accuracy)
print('Nearest Centroid Accuracy', nc_accuracy)
print('Ridge Classifier Accuracy', rc_accuracy)
print(f"    dimensionality: {lsvc.coef_.shape[1]}")
print(f"    density: {density(lsvc.coef_)}")


Linear SVC Accuracy: 0.6297786720321932
Random Forest Accuracy 0.613682092555332
KNN Accuracy 0.5633802816901409
Complement Naive Bayes Accuracy 0.5432595573440644
Nearest Centroid Accuracy 0.5814889336016097
Ridge Classifier Accuracy 0.6599597585513078
    dimensionality: 32685
    density: 1.0


# MaxEnt

In [207]:
# Maximum Entropy (Logistic Regression)
# "Maximum Entropy (ME) is a generalization of logistic regression for multi-class scenarios" Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent methods for logistic regression and maximum entropy models. Machine Learning 85(1-2):41-75. https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf
# Display the Categories for classification
resume_data.value_counts("Category")


Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ACCOUNTANT                118
ADVOCATE                  118
FINANCE                   118
ENGINEERING               118
CHEF                      118
FITNESS                   117
AVIATION                  117
SALES                     116
HEALTHCARE                115
CONSULTANT                115
BANKING                   115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

In [208]:
# Train a Logistic Regression model
lr=LogisticRegression(penalty='l2', dual=False, tol=0.0001, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', C=5, max_iter=1000, multi_class='multinomial')

lr.fit(X_train_tfidf, y_train)

lr_accuracy = lr.score(X_test_tfidf, y_test)

print('Logistic Regression Accuracy', lr_accuracy)

Logistic Regression Accuracy 0.6458752515090543


# Semantic Parsing

In [209]:
# "As full semantic parsing is very hard to achieve at this time, NLP researchers turn to some “shallow” semantic parsing techniques. Semantic role labelling (SRL) models and algorithms have been developed to automatically label semantic arguments (or frame elements) associated with the predicate (or semantic frame) of a sentence" https://comp659r1.athabascau.ca/unit8/section2.php (accessed June 20, 2023)

# 1. Identify the predicate
# 2. Perform word sense disambiguation
# 3. Identify semantic arguments in the sentence

from nltk.corpus import propbank
from nltk.corpus import treebank

# PropBank Corpus provides predicate-argument annotation for the Penn Treebank
pb_instances = propbank.instances()
inst = pb_instances[193]
# Sentence is "South Korea's economic boom which began in 1986 stopped this year because of labor disputes, trade conflicts and sluggish exports"

print("Roleset: ", inst.roleset, "\n")
print("Predicate:", inst.predicate, "\n")
print("Arguments: \n", inst.arguments, "\n")

tree = inst.tree
assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum]
print ("\n Tree: ", tree, "\n")

# Identify the predicate:
print("Predicate Select: ", inst.predicate.select(tree), "\n")


# Arguments:
print("Arguments: \n")
for(argloc, argid) in inst.arguments:
    print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50]))

# Frameset files, which define the argument labels
print("\n Frameset: \n")
expose_01 = propbank.roleset('expose.01')
turn_01 = propbank.roleset('turn.01')
print(turn_01)
for role in turn_01.findall("roles/role"):
    print(role.attrib['n'], role.attrib['descr'])


Roleset:  stop.01 

Predicate: 12:0 

Arguments: 
 ((PropbankTreePointer(0, 3), 'ARG1'), (PropbankTreePointer(13, 1), 'ARGM-TMP'), (PropbankTreePointer(15, 1), 'ARGM-CAU')) 


 Tree:  (S
  (NP-SBJ
    (NP
      (NP (NNP South) (NNP Korea) (POS 's))
      (JJ economic)
      (NN boom))
    (, ,)
    (SBAR
      (WHNP-12 (WDT which))
      (S
        (NP-SBJ (-NONE- *T*-12))
        (VP (VBD began) (PP-TMP (IN in) (NP (CD 1986))))))
    (, ,))
  (VP
    (VBD stopped)
    (NP-TMP (DT this) (NN year))
    (PP-PRP
      (IN because)
      (IN of)
      (NP
        (NP (VBN prolonged) (NN labor) (NNS disputes))
        (, ,)
        (NP (NN trade) (NNS conflicts))
        (CC and)
        (NP (JJ sluggish) (NNS exports)))))
  (. .)) 

Predicate Select:  (VBD stopped) 

Arguments: 

ARG1       (NP-SBJ (NP (NP (NNP South) (NNP Korea) (POS 's)) 
ARGM-TMP   (NP-TMP (DT this) (NN year))
ARGM-CAU   (PP-PRP (IN because) (IN of) (NP (NP (VBN prolonge

 Frameset: 

<Element 'roleset' at 0x14017fab0>
