# Connect to Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read the entity-ruler patterns,
# the LDA model and the sample CV from paths under this mount point.
# (`google.colab` is only importable inside a Colab runtime.)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# imports

In [None]:
!pip install pdfminer.six
!python -m spacy download en_core_web_lg

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'

In [None]:
# extract text from pdf
from pdfminer.high_level import extract_text
# clean text
import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK stopword corpus so that stopwords.words() can read it.
nltk.download('stopwords')
# English stopwords kept as a set for fast membership tests in remove_stopwords().
stop_words = set(stopwords.words('english'))
# info extraction
import spacy
from spacy import displacy
# topic modeling
from gensim import corpora, models

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# functions

## extract text from pdf

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    Thin wrapper that delegates to pdfminer's ``extract_text``.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text

## clean text

In [None]:
def remove_html_tags(text):
    """Replace every ``<...>`` span in ``text`` with a single space.

    The match is non-greedy and ``.`` does not match a newline, so a tag that
    is split across lines is left untouched.
    """
    tag_re = re.compile(r'<(.*?)>')
    return tag_re.sub(' ', text)

def remove_html_entities(text):
    """Replace HTML character references in ``text`` with a single space.

    Handles named (``&amp;``), decimal (``&#160;``) and hexadecimal
    (``&#xA0;``) references. The terminating ``;`` is consumed when present
    but is not required, so ``&nbsp`` is removed as well. A lone ``&`` that is
    not followed by a word character or ``#`` is left unchanged.
    """
    # Numeric forms are tried first; the optional ';' keeps the terminator from
    # being left behind in the output.
    pattern = r'&(?:#[xX][0-9a-fA-F]+|#\d+|\w+);?'
    return re.sub(pattern, ' ', text)

def replace_special_characters(text):
    """Replace ``;``, ``:`` and escaped line-break markers with a space.

    The pattern matches the literal two-character sequences backslash + ``r``
    and backslash + ``n`` (as found in text containing escaped line breaks);
    real carriage-return / newline characters are not touched here.
    """
    special_re = re.compile(r'[;:]|(\\r)|(\\n)')
    return special_re.sub(' ', text)

def remove_extra_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    Runs at the very end of ``text`` are collapsed too (a run is replaced no
    matter what follows it). A single whitespace character on its own is left
    as it is.
    """
    pattern = r'\s{2,}'
    return re.sub(pattern, ' ', text)

def replace_punctuation(text):
    """Replace each punctuation character in ``text`` with a space.

    The character set below does not contain ``+``, so tokens such as ``c++``
    or ``+1`` keep their plus signs.
    """
    punctuation = '!"#$%&\'()*,-./:;<=>?@[\\]^_`{|}~'
    to_space = {ord(symbol): ' ' for symbol in punctuation}
    return text.translate(to_space)

def remove_non_ascii(text):
    """Replace every character outside the ASCII range (code point >= 128) with a space."""
    pieces = []
    for ch in text:
        pieces.append(' ' if ord(ch) >= 128 else ch)
    return ''.join(pieces)

def clean_text(text):
    """Run the full cleaning pipeline on ``text`` and return it lower-cased.

    Steps, in order: strip HTML tags, strip HTML entities, replace special
    characters, replace punctuation, replace non-ASCII characters, collapse
    extra whitespace, then lower-case the result.
    """
    steps = (
        remove_html_tags,
        remove_html_entities,
        replace_special_characters,
        replace_punctuation,
        remove_non_ascii,
        remove_extra_spaces,
    )
    for step in steps:
        text = step(text)
    return text.lower()


In [None]:
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Tokens are compared in lower case; the remaining tokens are returned
    joined by single spaces.
    """
    kept = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept)

## extract information

In [None]:
# Load the spaCy model and add the entity ruler.
# The extraction functions below filter the resulting entities by the labels
# DEGREE, MAJOR and SKILL.
nlp = spacy.load("en_core_web_lg")
# NOTE(review): add_pipe() is called without a position argument, so the ruler is
# presumably appended after the model's built-in components — confirm this ordering
# relative to the statistical NER is intended.
ruler = nlp.add_pipe("entity_ruler")
# Patterns are read from a JSONL file on the mounted Drive.
ruler.from_disk("/content/drive/MyDrive/data/degree_major_skills_ruler.jsonl")

'''
note : the ruler has:
Number of degrees: 11
Number of majors: 268
Number of skills: 92983

'''


def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    Entities are listed in the order ``doc.ents`` yields them; repeated
    mentions are kept.
    """
    return [ent.text for ent in nlp(text).ents if ent.label_ == "SKILL"]

def unique_skills(x):
    """Return the distinct items of ``x`` as a list, keeping first-seen order.

    Using ``dict.fromkeys`` instead of ``set`` makes the result order
    deterministic across runs (set iteration order for strings varies with
    hash randomisation). Items must be hashable.
    """
    return list(dict.fromkeys(x))

def get_degree(text):
    """Return the text of every entity labelled ``DEGREE`` found in ``text``.

    Entities are listed in the order ``doc.ents`` yields them; repeated
    mentions are kept.
    """
    return [ent.text for ent in nlp(text).ents if ent.label_ == "DEGREE"]

def extract_sentence_after_degree(text):
    """Return the first education keyword in ``text`` plus the words that follow it.

    Keywords (case-insensitive): degree, certificate, diploma, bachelor,
    high school, associate, master, bsc, vocational, phd. The keyword must
    start at a word boundary, so it is not picked up from the middle of a
    longer word such as "webmaster". The match covers the keyword and up to
    31 following words.

    Returns:
        The matched substring, or ``None`` when ``text`` is empty/``None`` or
        no keyword followed by at least one word is found.
    """
    if not text:
        return None
    pattern = (
        r"(?i)\b(?:degree|certificate|diploma|bachelor|high\s*school|associate"
        r"|master|bsc|vocational|phd)"
        # up to 30 "word + separator" groups, then one final word
        r"\W+(?:\w+\W+){0,30}\w+"
    )
    match = re.search(pattern, text)
    return match.group(0) if match else None

def get_majors(text):
    """Return the text of every entity labelled ``MAJOR`` found in ``text``.

    ``text`` may be ``None`` or empty (for example when no education sentence
    could be extracted); in that case an empty list is returned without
    running the spaCy pipeline. Repeated mentions are kept.
    """
    if not text:
        return []
    return [ent.text for ent in nlp(text).ents if ent.label_ == "MAJOR"]




In [None]:
# visualization
def render_entities(text):
    """Render ``text`` with displaCy, highlighting DEGREE, MAJOR and SKILL entities.

    Runs ``nlp`` on ``text``, renders the document with the entity visualizer
    restricted to the three custom labels, then prints blank lines as a spacer.
    Returns ``None``.
    """
    doc = nlp(text)
    # Each entity span carries exactly one label, so no span can be both MAJOR and
    # SKILL; no relabelling pass is needed before rendering.
    colors = {"DEGREE": "#ff8080",  # Red for degree entities
              "MAJOR": "#80ffaa",  # Green for major entities
              "SKILL": "#d8edfe"}  # Blue for skill entities
    # Only the options the entity visualizer uses are passed: which labels to
    # show and their colours.
    options = {"ents": ["DEGREE", "MAJOR", "SKILL"],
               "colors": colors}
    # Render the entities
    displacy.render(doc, style="ent", options=options)
    print("\n\n")

## topic modeling

In [None]:
# Load the LDA model (a saved gensim LdaModel stored on the mounted Drive).
# NOTE(review): the "_25" suffix presumably refers to the number of topics — confirm.
lda_model = models.LdaModel.load('/content/drive/MyDrive/LDA_models/lda_model_25')

def classify_text(lda_model, text):
    """Return the id of the most probable topic for ``text``.

    Args:
        lda_model: Model exposing ``id2word.doc2bow`` and
            ``get_document_topics``.
        text: Either an iterable of tokens or a plain string. A string is
            split on whitespace first, because ``doc2bow`` works on token
            sequences rather than on a single string.

    Returns:
        The topic id with the highest probability.

    Raises:
        ValueError: If the model reports no topics for the input.
    """
    tokens = text.split() if isinstance(text, str) else text

    # Convert the tokens to a bag-of-words vector
    new_text_bow = lda_model.id2word.doc2bow(tokens)

    # List of (topic id, probability) tuples
    topic_probs = lda_model.get_document_topics(new_text_bow)
    if len(topic_probs) == 0:
        raise ValueError("LDA model returned no topics for the given text")

    # Pick the pair with the highest probability and return its topic id
    dominant_topic = max(topic_probs, key=lambda pair: pair[1])[0]

    return dominant_topic

# Example

## 1- extract text from PDF

In [None]:
# Read the sample CV from the mounted Drive and show its raw text.
# NOTE(review): this path uses "My Drive" while the model paths use "MyDrive" — confirm
# both resolve in the target runtime.
resume_text = extract_text_from_pdf("/content/drive/My Drive/CV.pdf")
print(resume_text)

John Doe

Anytown, USA 00000

+1 (555) 123-4567

PROFESSIONAL EXPERIENCE

Family Nurse Practitioner

Anytown Medical Center, Anytown, USA

January 2020 - Present

Provided comprehensive primary care services to patients of all ages.

Collaborated with interdisciplinary teams to optimize patient outcomes.

Registered Nurse

Cityville General Hospital, Cityville, USA

June 2018 - December 2019

Delivered direct patient care in a medical-surgical unit, ensuring adherence to treatment plans.

EDUCATION

Master of Science in Nursing

University of Science, Anytown, USA

September 2016 - May 2018

Bachelor of Science in Nursing

University of Science, Anytown, USA

September 2012 - May 2016

SKILLS

Clinical assessment and diagnosis

Medication administration

Patient education

Electronic health record (EHR) documentation

AWARDS

Nursing Excellence Award, Anytown Medical Center

LANGUAGES

English (Native)




## 2- Clean text

### remove unnecessary characters

In [None]:
# Strip markup, punctuation and non-ASCII characters, collapse whitespace, lower-case.
resume_clean = clean_text(resume_text)

print(resume_clean)

john doe anytown usa 00000 +1 555 123 4567 professional experience family nurse practitioner anytown medical center anytown usa january 2020 present provided comprehensive primary care services to patients of all ages collaborated with interdisciplinary teams to optimize patient outcomes registered nurse cityville general hospital cityville usa june 2018 december 2019 delivered direct patient care in a medical surgical unit ensuring adherence to treatment plans education master of science in nursing university of science anytown usa september 2016 may 2018 bachelor of science in nursing university of science anytown usa september 2012 may 2016 skills clinical assessment and diagnosis medication administration patient education electronic health record ehr documentation awards nursing excellence award anytown medical center languages english native 




### remove stopwords

In [None]:
# Drop English stopwords; the result is the input for all extraction steps below.
resume_nonstop = remove_stopwords(resume_clean)

print(resume_nonstop)

john doe anytown usa 00000 +1 555 123 4567 professional experience family nurse practitioner anytown medical center anytown usa january 2020 present provided comprehensive primary care services patients ages collaborated interdisciplinary teams optimize patient outcomes registered nurse cityville general hospital cityville usa june 2018 december 2019 delivered direct patient care medical surgical unit ensuring adherence treatment plans education master science nursing university science anytown usa september 2016 may 2018 bachelor science nursing university science anytown usa september 2012 may 2016 skills clinical assessment diagnosis medication administration patient education electronic health record ehr documentation awards nursing excellence award anytown medical center languages english native


## 3- Information extraction

### Education extraction

#### Degree extraction

In [None]:
# Collect every DEGREE entity found in the cleaned resume.
degree = get_degree(resume_nonstop)
print(degree)

['master', 'bachelor']


#### Major extraction

In [None]:
# Extract a sentence following education-related terms from a cleaned resume
sentence = extract_sentence_after_degree(resume_nonstop)
# extract_sentence_after_degree() returns None when no education keyword is found;
# in that case there is nothing to analyse, so fall back to an empty list.
majors = get_majors(sentence) if sentence else []
print(majors)

['nursing', 'nursing']


### skill extraction

In [None]:
# Collect every SKILL entity found in the cleaned resume (duplicates are kept).
skills = get_skills(resume_nonstop)
print(skills)

['family', 'primary care', 'services', 'patient outcomes', 'direct patient care', 'medical surgical', 'adherence', 'education', 'clinical', 'assessment', 'medication administration', 'patient education', 'health', 'record', 'documentation', 'award', 'languages']


In [None]:
# Visualise the DEGREE / MAJOR / SKILL entities in the cleaned resume.
render_entities(resume_nonstop)






## 4- Topic modeling

In [None]:
# The list of extracted skill strings is passed as the token list for the LDA model.
topic = classify_text(lda_model,skills)

print(topic)

8


In [None]:
# Print the 20 highest-weighted words of the dominant topic.
# show_topic() yields (word, probability) pairs; only the word is printed.
topic_words = lda_model.show_topic(topic, topn=20)
print("Top words for topic :")
for word, prob in topic_words:
    print(word)

Top words for topic :
health
services
clinical
healthcare
hospital
family
plan
patient care
certification
support
pharmacy
education
case
children
hospitals
nurses
licensure
system
assessment
opportunities
