# Connect to Drive

In [None]:
# Mount Google Drive (Colab only). Every data, entity-ruler and model path
# used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# install & imports

In [None]:
!pip install pdfminer.six
!python -m spacy download en_core_web_lg

Collecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'

In [None]:
# to work with DataFrames
import pandas as pd
# extract text from pdf
from pdfminer.high_level import extract_text
# clean text
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# info extraction
import spacy
from spacy import displacy
# topic modeling
from gensim import corpora, models

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# functions

## extract text from pdf

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text content of the PDF at ``pdf_path``.

    Thin wrapper around ``pdfminer.high_level.extract_text``: the argument is
    passed through unchanged and the library's result is returned as-is.
    """
    return extract_text(pdf_path)

## clean text

In [None]:
def remove_html_tags(text):
    """Replace every ``<...>`` span in ``text`` with a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a ``<`` and
    its closing ``>`` must be on the same line for the span to be replaced.
    """
    tag_regex = re.compile(r'<(.*?)>')
    return tag_regex.sub(' ', text)

def remove_html_entities(text):
    """Replace HTML character references in ``text`` with a single space.

    Recognised forms are named (``&amp;``), decimal (``&#39;``) and
    hexadecimal (``&#x27;``) references. The terminating ``;`` is required and
    is consumed with the reference, so a bare ampersand inside ordinary
    tokens such as ``R&D`` or ``P&L`` is left untouched instead of having the
    letters after it deleted.

    Args:
        text: Input string.

    Returns:
        ``text`` with each character reference replaced by one space.
    """
    pattern = r'&(?:#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);'
    return re.sub(pattern, ' ', text)

def replace_special_characters(text):
    """Replace ``;``, ``:`` and escaped line-break sequences with a space.

    Note that ``\\r`` and ``\\n`` inside the raw pattern match the
    two-character sequences backslash+r and backslash+n (as found in text
    that was escaped before being stored), not real CR/LF characters.
    """
    separators = re.compile(r'[;:]|(\\r)|(\\n)')
    return separators.sub(' ', text)

def remove_extra_spaces(text):
    """Collapse runs of two or more whitespace characters into one space.

    Only runs that are followed by a non-whitespace character are collapsed,
    so a whitespace run at the very end of ``text`` is left as it is.
    """
    whitespace_run = re.compile(r'\s\s+?(?=\S)')
    return whitespace_run.sub(' ', text)

def replace_punctuation(text):
    """Replace each listed punctuation character in ``text`` with a space.

    The list does not contain ``+``, so that character is kept
    (presumably so tokens such as ``c++`` survive - confirm with the author).
    """
    punctuation = '!"#$%&\'()*,-./:;<=>?@[\\]^_`{|}~'
    # translation table: code point of each punctuation mark -> single space
    to_space = {ord(symbol): ' ' for symbol in punctuation}
    return text.translate(to_space)

def remove_non_ascii(text):
    """Replace every character whose code point is 128 or above with a space."""
    pieces = []
    for symbol in text:
        pieces.append(symbol if ord(symbol) <= 127 else ' ')
    return ''.join(pieces)

def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and lowercase it.

    Steps, in order: strip HTML tags, strip HTML entities, replace special
    characters, replace punctuation, replace non-ASCII characters, collapse
    extra whitespace, then lowercase the result.
    """
    pipeline = (
        remove_html_tags,
        remove_html_entities,
        replace_special_characters,
        replace_punctuation,
        remove_non_ascii,
        remove_extra_spaces,
    )
    for step in pipeline:
        text = step(text)
    return text.lower()


In [None]:
def remove_stopwords(text):
    """Drop stop words from ``text`` and return the rest joined by spaces.

    ``text`` is split on whitespace; a token is dropped when its lowercase
    form is in the module-level ``stop_words`` set.
    """
    kept = filter(lambda token: token.lower() not in stop_words, text.split())
    return ' '.join(kept)

## extract information

In [None]:
# Author's note on the size of the entity-ruler pattern sets:
#   number of degrees: 11
#   number of majors:  268
#   number of skills:  92983
def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    Uses the module-level ``nlp`` pipeline, which must be assigned before
    this function is called. Duplicates are kept, in document order.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == "SKILL"]

def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-seen order.

    Going through ``set`` yields an arbitrary order that, for strings, can
    differ from one interpreter run to the next; ``dict.fromkeys`` removes
    duplicates while keeping the result reproducible.

    Args:
        x: Iterable of hashable items (e.g. a list of skill strings).

    Returns:
        A new list with duplicates removed.
    """
    return list(dict.fromkeys(x))

def get_degree(text):
    """Return the text of every entity labelled ``DEGREE`` found in ``text``.

    Uses the module-level ``nlp`` pipeline, which must be assigned before
    this function is called. Duplicates are kept, in document order.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == "DEGREE"]

def extract_sentence_after_degree(text):
    """Return the first education keyword in ``text`` plus the words after it.

    The search is case-insensitive. A hit starts at one of the keywords
    (degree, certificate, diploma, bachelor, high school, associate, master,
    bsc, vocational, phd), which must be directly followed by a non-word
    character, and extends over at most 31 following words; at least one
    following word is required. There is no boundary check in front of the
    keyword, so it may also be the tail of a longer word.

    Returns:
        The matched substring, or None when nothing matches.
    """
    # keyword, then 0-30 "word + separator" groups, then one final word
    pattern = r"(?i)(?:degree|certificate|diploma|bachelor|high\s*school|associate|master|bsc|vocational|phd)\W+(?:\w+\W+){0,30}\w+"
    found = re.search(pattern, text)
    return found.group(0) if found else None

def get_majors(text):
    """Return the text of every entity labelled ``MAJOR`` found in ``text``.

    Uses the module-level ``nlp`` pipeline, which must be assigned before
    this function is called. Duplicates are kept, in document order.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == "MAJOR"]



## topic modeling

In [None]:
# Load the previously trained LDA topic model from Drive.
# NOTE(review): the "_25" suffix presumably means a 25-topic model - confirm.
# NOTE(review): gensim model files are unpickled on load, so only load files
# from a trusted location.
lda_model = models.LdaModel.load('/content/drive/MyDrive/LDA_models/lda_model_25')

def classify_text(lda_model, text):
    """Return the ID of the most probable LDA topic for a tokenised document.

    Args:
        lda_model: Topic model exposing ``id2word.doc2bow`` and
            ``get_document_topics`` (a gensim ``LdaModel`` in this notebook).
        text: The document as a sequence of tokens; it is handed to
            ``doc2bow`` unchanged.

    Returns:
        The topic ID with the highest probability, or None when the model
        reports no topics for the document (previously this case raised a
        ``ValueError`` from ``max()`` on an empty sequence).
    """
    # Bag-of-words vector built with the model's own dictionary
    bow_vector = lda_model.id2word.doc2bow(text)

    # Pairs of (topic ID, probability)
    topic_probs = lda_model.get_document_topics(bow_vector)

    # Pick the pair with the largest probability; no sorting is needed
    best_pair = max(topic_probs, key=lambda pair: pair[1], default=None)
    if best_pair is None:
        return None
    return best_pair[0]

# get datasets

In [None]:
# Load the three raw tables (resumes, applications, job postings) from Drive.
# The previews of these frames show an "Unnamed: 0" column, i.e. each file
# carries a saved index column that is read back as ordinary data.
# NOTE(review): consider index_col=0 if that column is not needed downstream.
users=pd.read_csv('/content/drive/MyDrive/data/chatgpt_datasets/users.csv')
apps=pd.read_csv('/content/drive/MyDrive/data/chatgpt_datasets/apps.csv')
jobs=pd.read_csv('/content/drive/MyDrive/data/chatgpt_datasets/jobs.csv')

In [None]:
users

Unnamed: 0,userID,Resume
0,1,Experienced software engineer with a degree in...
1,2,Marketing professional with expertise in socia...
2,3,Mechanical engineer specializing in product de...
3,4,Registered nurse with clinical experience in c...
4,5,Experienced elementary school teacher passiona...
...,...,...
495,496,Logistics manager with experience in supply ch...
496,497,UX/UI designer with a focus on user-centered d...
497,498,Corporate communications manager with expertis...
498,499,Chemical engineer specializing in process opti...


In [None]:
jobs

Unnamed: 0,jobID,Description,Requirement
0,1,Software Engineer,Bachelor's degree in Computer Science or relat...
1,2,Social Media Marketing Manager,"Bachelor's degree in Marketing, Communications..."
2,3,Human Resources Specialist,Bachelor's degree in Human Resources Managemen...
3,4,Graphic Designer,Bachelor's degree in Graphic Design or related...
4,5,Data Analyst,"Bachelor's degree in Statistics, Mathematics, ..."
...,...,...,...
495,496,User Experience Designer,"Bachelor's degree in Graphic Design, Human-Com..."
496,497,Public Relations Manager,"Bachelor's degree in Public Relations, Communi..."
497,498,Database Administrator,"Bachelor's degree in Computer Science, Informa..."
498,499,Email Marketing Specialist,"Bachelor's degree in Marketing, Communications..."


In [None]:
apps

Unnamed: 0,userID,jobID,Applied,Perfect Fit,Notes
0,1,1,Yes,Yes,The user is an experienced software engineer w...
1,1,6,Yes,Yes,With skills in cloud computing and full-stack ...
2,2,2,Yes,Yes,The user a marketing professional meets the re...
3,3,12,Yes,Yes,The mechanical engineer's expertise in CAD sof...
4,4,8,Yes,Yes,As a registered nurse with clinical experience...
...,...,...,...,...,...
972,451,496,Yes,Yes,Matches perfectly. Holds a degree in Logistics...
973,452,497,Yes,Yes,Matches perfectly. Holds a degree in Interacti...
974,454,498,Yes,Yes,Matches perfectly. Holds a degree in Communica...
975,455,499,Yes,Yes,Matches perfectly. Holds a degree in Chemical ...


# preprocess

## users

### clean text

In [None]:
# clean_text: resume after the cleaning pipeline (lowercased)
users['clean_text']=users['Resume'].apply(clean_text)
# non_stop_text: cleaned resume with NLTK English stop words removed;
# this column is the input for the entity extraction below
users['non_stop_text']=users['clean_text'].apply(remove_stopwords)

In [None]:
users.head()

Unnamed: 0,userID,Resume,clean_text,non_stop_text
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...


### extract information

In [None]:
# get_skills reads the module-level `nlp`, so the pipeline has to be built
# here before the apply. A fresh model is loaded so that the skill ruler is
# the only entity ruler in the pipeline.
nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk("/content/drive/MyDrive/data/entity_rulers/skill_ruler.jsonl")
users['skills']=users['non_stop_text'].apply(get_skills)
users.head()

Unnamed: 0,userID,Resume,clean_text,non_stop_text,skills
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...,"[software, computer science, java, python, sta..."
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...,"[marketing, social media, management, digital,..."
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...,"[product design, manufacturing processes, mech..."
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...,"[clinical, critical care, advanced, life support]"
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,"[student development, education, training, lit..."


In [None]:
# get_degree reads the module-level `nlp`; rebuild it with the degree ruler
# as the only entity ruler in the pipeline before extracting.
nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk("/content/drive/MyDrive/data/entity_rulers/degree_ruler.jsonl")
users['degrees']=users['non_stop_text'].apply(get_degree)
users.head()

Unnamed: 0,userID,Resume,clean_text,non_stop_text,skills,degrees
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...,"[software, computer science, java, python, sta...",[]
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...,"[marketing, social media, management, digital,...",[]
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...,"[product design, manufacturing processes, mech...",[]
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...,"[clinical, critical care, advanced, life support]",[bachelor]
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,"[student development, education, training, lit...",[]


In [None]:
# get_majors reads the module-level `nlp`; rebuild it with the majors ruler
# as the only entity ruler in the pipeline before extracting.
nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk("/content/drive/MyDrive/data/entity_rulers/majors_ruler.jsonl")
users['majors']=users['non_stop_text'].apply(get_majors)
users.head()

Unnamed: 0,userID,Resume,clean_text,non_stop_text,skills,degrees,majors
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...,"[software, computer science, java, python, sta...",[],[computer science]
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...,"[marketing, social media, management, digital,...",[],[]
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...,"[product design, manufacturing processes, mech...",[],"[mechanical engineering, software engineering]"
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...,"[clinical, critical care, advanced, life support]",[bachelor],[nursing]
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,"[student development, education, training, lit...",[],[]


### get dominant topic

In [None]:
# Assign each resume its dominant LDA topic. The extracted skills list is
# what classify_text receives as the document's tokens.
users['topic'] = users['skills'].apply(lambda x: classify_text(lda_model, x))
users.head()

Unnamed: 0,userID,Resume,clean_text,non_stop_text,skills,degrees,majors,topic
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...,"[software, computer science, java, python, sta...",[],[computer science],12
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...,"[marketing, social media, management, digital,...",[],[],19
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...,"[product design, manufacturing processes, mech...",[],"[mechanical engineering, software engineering]",7
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...,"[clinical, critical care, advanced, life support]",[bachelor],[nursing],8
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,"[student development, education, training, lit...",[],[],3


## jobs

In [None]:
# Build the raw text of each posting from its title and its requirements,
# then apply the cleaning and stop-word steps.
jobs['text'] = jobs['Description'] + " " + jobs['Requirement']
jobs['clean_text'] = jobs['text'].apply(clean_text)
jobs['non_stop_text'] = jobs['clean_text'].apply(remove_stopwords)

# Load the large spaCy model once for the whole cell. Only the entity ruler
# differs between the three extraction passes, so it is swapped in place
# instead of re-reading the model from disk for every pass.
# The extractor functions read the module-level `nlp`, which is this object.
nlp = spacy.load("en_core_web_lg")
extraction_passes = (
    ('degrees', "/content/drive/MyDrive/data/entity_rulers/degree_ruler.jsonl", get_degree),
    ('majors', "/content/drive/MyDrive/data/entity_rulers/majors_ruler.jsonl", get_majors),
    ('skills', "/content/drive/MyDrive/data/entity_rulers/skill_ruler.jsonl", get_skills),
)
for column, ruler_path, extractor in extraction_passes:
    # Drop the ruler of the previous pass so patterns never accumulate
    if nlp.has_pipe("entity_ruler"):
        nlp.remove_pipe("entity_ruler")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.from_disk(ruler_path)
    jobs[column] = jobs['non_stop_text'].apply(extractor)

# Dominant LDA topic per posting, computed from the extracted skills list
jobs['topic'] = jobs['skills'].apply(lambda x: classify_text(lda_model, x))
jobs.head()

Unnamed: 0,jobID,Description,Requirement,text,clean_text,non_stop_text,degrees,majors,skills,topic
0,1,Software Engineer,Bachelor's degree in Computer Science or relat...,Software Engineer Bachelor's degree in Compute...,software engineer bachelor s degree in compute...,software engineer bachelor degree computer sci...,[bachelor],[computer science],"[software, computer science, programming langu...",12
1,2,Social Media Marketing Manager,"Bachelor's degree in Marketing, Communications...",Social Media Marketing Manager Bachelor's degr...,social media marketing manager bachelor s degr...,social media marketing manager bachelor degree...,[bachelor],[communications],"[social media marketing, marketing, communicat...",19
2,3,Human Resources Specialist,Bachelor's degree in Human Resources Managemen...,Human Resources Specialist Bachelor's degree i...,human resources specialist bachelor s degree i...,human resources specialist bachelor degree hum...,[bachelor],[],"[human resources, human resources, management,...",15
3,4,Graphic Designer,Bachelor's degree in Graphic Design or related...,Graphic Designer Bachelor's degree in Graphic ...,graphic designer bachelor s degree in graphic ...,graphic designer bachelor degree graphic desig...,[bachelor],[],"[graphic design, adobe creative suite, print, ...",19
4,5,Data Analyst,"Bachelor's degree in Statistics, Mathematics, ...","Data Analyst Bachelor's degree in Statistics, ...",data analyst bachelor s degree in statistics m...,data analyst bachelor degree statistics mathem...,[bachelor],"[mathematics, computer science]","[data, statistics, mathematics, computer scien...",11


# prepare text for embeddings

In [None]:
def concatenate_features(row):
    """Build the text that is embedded for one user or job row.

    Each non-empty list among ``row['degrees']``, ``row['majors']`` and
    ``row['skills']`` becomes a section of the form ``"<label>: a b c . "``;
    empty lists contribute an empty string. The three sections are joined
    with single spaces and the result is stripped at both ends.
    """
    def _section(label, column):
        # one labelled section, or "" when the row has no values for it
        values = row[column]
        if not values:
            return ""
        return label + " ".join(values) + " . "

    sections = [
        _section("degree: ", 'degrees'),
        _section("majors: ", 'majors'),
        _section("skills: ", 'skills'),
    ]
    return " ".join(sections).strip()

In [None]:
# Row-wise: merge each user's degrees, majors and skills into one string
users['text_emb'] = users.apply(concatenate_features, axis=1)

In [None]:
users

Unnamed: 0,userID,Resume,clean_text,non_stop_text,skills,degrees,majors,topic,text_emb
0,1,Experienced software engineer with a degree in...,experienced software engineer with a degree in...,experienced software engineer degree computer ...,"[software, computer science, java, python, sta...",[],[computer science],12,majors: computer science . skills: software c...
1,2,Marketing professional with expertise in socia...,marketing professional with expertise in socia...,marketing professional expertise social media ...,"[marketing, social media, management, digital,...",[],[],19,skills: marketing social media management digi...
2,3,Mechanical engineer specializing in product de...,mechanical engineer specializing in product de...,mechanical engineer specializing product desig...,"[product design, manufacturing processes, mech...",[],"[mechanical engineering, software engineering]",7,majors: mechanical engineering software engine...
3,4,Registered nurse with clinical experience in c...,registered nurse with clinical experience in c...,registered nurse clinical experience critical ...,"[clinical, critical care, advanced, life support]",[bachelor],[nursing],8,degree: bachelor . majors: nursing . skills:...
4,5,Experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,experienced elementary school teacher passiona...,"[student development, education, training, lit...",[],[],3,skills: student development education training...
...,...,...,...,...,...,...,...,...,...
495,496,Logistics manager with experience in supply ch...,logistics manager with experience in supply ch...,logistics manager experience supply chain opti...,"[logistics, supply chain optimization, warehou...",[],[],16,skills: logistics supply chain optimization wa...
496,497,UX/UI designer with a focus on user-centered d...,ux ui designer with a focus on user centered d...,ux ui designer focus user centered design prin...,"[ux, ui, focus, user centered design, principl...",[],[],12,skills: ux ui focus user centered design princ...
497,498,Corporate communications manager with expertis...,corporate communications manager with expertis...,corporate communications manager expertise int...,"[corporate communications, internal external c...",[],"[communications, communications, communications]",19,majors: communications communications communic...
498,499,Chemical engineer specializing in process opti...,chemical engineer specializing in process opti...,chemical engineer specializing process optimiz...,"[process optimization, chemical engineering, p...",[],[chemical engineering],2,majors: chemical engineering . skills: proces...




In [None]:
# Row-wise: merge each job's degrees, majors and skills into one string
jobs['text_emb'] = jobs.apply(concatenate_features, axis=1)

In [None]:
jobs

Unnamed: 0,jobID,Description,Requirement,text,clean_text,non_stop_text,degrees,majors,skills,topic,text_emb
0,1,Software Engineer,Bachelor's degree in Computer Science or relat...,Software Engineer Bachelor's degree in Compute...,software engineer bachelor s degree in compute...,software engineer bachelor degree computer sci...,[bachelor],[computer science],"[software, computer science, programming langu...",12,degree: bachelor . majors: computer science ....
1,2,Social Media Marketing Manager,"Bachelor's degree in Marketing, Communications...",Social Media Marketing Manager Bachelor's degr...,social media marketing manager bachelor s degr...,social media marketing manager bachelor degree...,[bachelor],[communications],"[social media marketing, marketing, communicat...",19,degree: bachelor . majors: communications . ...
2,3,Human Resources Specialist,Bachelor's degree in Human Resources Managemen...,Human Resources Specialist Bachelor's degree i...,human resources specialist bachelor s degree i...,human resources specialist bachelor degree hum...,[bachelor],[],"[human resources, human resources, management,...",15,degree: bachelor . skills: human resources h...
3,4,Graphic Designer,Bachelor's degree in Graphic Design or related...,Graphic Designer Bachelor's degree in Graphic ...,graphic designer bachelor s degree in graphic ...,graphic designer bachelor degree graphic desig...,[bachelor],[],"[graphic design, adobe creative suite, print, ...",19,degree: bachelor . skills: graphic design ad...
4,5,Data Analyst,"Bachelor's degree in Statistics, Mathematics, ...","Data Analyst Bachelor's degree in Statistics, ...",data analyst bachelor s degree in statistics m...,data analyst bachelor degree statistics mathem...,[bachelor],"[mathematics, computer science]","[data, statistics, mathematics, computer scien...",11,degree: bachelor . majors: mathematics comput...
...,...,...,...,...,...,...,...,...,...,...,...
495,496,User Experience Designer,"Bachelor's degree in Graphic Design, Human-Com...",User Experience Designer Bachelor's degree in ...,user experience designer bachelor s degree in ...,user experience designer bachelor degree graph...,[bachelor],[],"[user experience, graphic design, human comput...",19,degree: bachelor . skills: user experience g...
496,497,Public Relations Manager,"Bachelor's degree in Public Relations, Communi...",Public Relations Manager Bachelor's degree in ...,public relations manager bachelor s degree in ...,public relations manager bachelor degree publi...,[bachelor],"[communications, communications]","[public relations, public relations, communica...",19,degree: bachelor . majors: communications com...
497,498,Database Administrator,"Bachelor's degree in Computer Science, Informa...",Database Administrator Bachelor's degree in Co...,database administrator bachelor s degree in co...,database administrator bachelor degree compute...,[bachelor],[computer science],"[database, computer science, information techn...",12,degree: bachelor . majors: computer science ....
498,499,Email Marketing Specialist,"Bachelor's degree in Marketing, Communications...",Email Marketing Specialist Bachelor's degree i...,email marketing specialist bachelor s degree i...,email marketing specialist bachelor degree mar...,[bachelor],[communications],"[marketing, marketing, communications, marketi...",19,degree: bachelor . majors: communications . ...




# change indexes

For mapping and graph creation, IDs must lie in the range 0 to len(df)-1, so they need to start at 0 rather than at 1 as they do in the original data.

In [None]:
# Shift the 1-based IDs to 0-based.
# Each shift is guarded by the current minimum of the owning table, so
# re-running this cell does not subtract a second time. `apps` is shifted
# together with the table it references so its keys stay aligned.
if users['userID'].min() == 1:
    users['userID'] = users['userID'] - 1
    apps['userID'] = apps['userID'] - 1
if jobs['jobID'].min() == 1:
    jobs['jobID'] = jobs['jobID'] - 1
    apps['jobID'] = apps['jobID'] - 1

In [None]:
apps

Unnamed: 0,userID,jobID,Applied,Perfect Fit,Notes
0,0,0,Yes,Yes,The user is an experienced software engineer w...
1,0,5,Yes,Yes,With skills in cloud computing and full-stack ...
2,1,1,Yes,Yes,The user a marketing professional meets the re...
3,2,11,Yes,Yes,The mechanical engineer's expertise in CAD sof...
4,3,7,Yes,Yes,As a registered nurse with clinical experience...
...,...,...,...,...,...
972,450,495,Yes,Yes,Matches perfectly. Holds a degree in Logistics...
973,451,496,Yes,Yes,Matches perfectly. Holds a degree in Interacti...
974,453,497,Yes,Yes,Matches perfectly. Holds a degree in Communica...
975,454,498,Yes,Yes,Matches perfectly. Holds a degree in Chemical ...


# save to csv

In [None]:
# Persist the processed tables next to the raw ones; index=False keeps the
# DataFrame index out of the files.
# NOTE(review): the list-valued columns (skills, degrees, majors) are written
# as their string representation, so a later read_csv returns strings, not
# lists - parse them when reloading.
users.to_csv('/content/drive/MyDrive/data/chatgpt_datasets/processed_users.csv',index=False)
jobs.to_csv('/content/drive/MyDrive/data/chatgpt_datasets/processed_jobs.csv',index=False)
apps.to_csv('/content/drive/MyDrive/data/chatgpt_datasets/processed_apps.csv',index=False)