In [1]:
#!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


In [27]:
from keybert import KeyBERT

from gensim.models import LdaModel
from gensim.test.utils import datapath

from gensim.corpora import Dictionary
import pyLDAvis.gensim_models

### Loading data

In [4]:
import os

def combine_text_files_to_list(input_directory):
    """Read every ``.txt`` file in a directory into a list of strings.

    Files are read in sorted path order so that the resulting corpus (and any
    per-document output produced from it) comes out in the same order on every
    run; ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        input_directory: Path of the directory to scan. Only its direct
            entries whose names end in ``.txt`` are considered.

    Returns:
        A list with one string per successfully read file, holding that
        file's entire content decoded as UTF-8. Files that cannot be read or
        decoded are reported on stdout and skipped.

    Raises:
        OSError: If ``input_directory`` itself cannot be listed.
    """
    txt_files = sorted(
        os.path.join(input_directory, name)
        for name in os.listdir(input_directory)
        if name.endswith(".txt")
    )
    corpus = []

    for txt_file in txt_files:
        try:
            # Read the entire file as a single string and add it to the corpus.
            with open(txt_file, 'r', encoding='utf-8') as file:
                corpus.append(file.read())
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"An error occurred while reading {txt_file}: {e}")

    return corpus

# Load every .txt file under ../Dataset/Parsed_Slides into `modules`, one string per file.
# Note: the message below is printed unconditionally; per-file read errors are
# reported separately by combine_text_files_to_list itself.
modules = combine_text_files_to_list("../Dataset/Parsed_Slides")
print("Corpus combined successfully as a list of strings.")

Corpus combined successfully as a list of strings.


In [5]:
# Sanity check: number of documents loaded into `modules`.
print(len(modules))

23


In [6]:
def create_list_from_csv(path, encoding='utf-8'):
    """Extract the job-description field from each line of a CSV-like file.

    Every physical line is treated as one record. Columns 1 and 2 hold the
    company name and the job title; both are expected to contain no commas and
    are discarded because they are not analysed. Everything after the second
    comma is the job description, which may itself contain commas, so the
    remaining pieces are glued back together with ``","``.

    The line is split by hand because ``csv.reader``'s ``quotechar`` handling
    was reported not to work on this data.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform-dependent default of ``open()``.

    Returns:
        A list with one description string per line of the file, with the
        line terminator and any leading/trailing double quotes removed.
    """
    corpus = []
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            # Drop the line terminator first: otherwise the closing quote is not
            # the last character and strip('"') leaves both it and the newline.
            columns = line.rstrip('\r\n').split(',')
            description = ",".join(columns[2:]).strip('"')
            corpus.append(description)

    return corpus

In [7]:
# Load the description field of every line in jobs.csv (path is relative to the notebook's working directory).
job_descriptions = create_list_from_csv('jobs.csv') 

In [8]:
# Sanity check: number of job descriptions loaded (one per line of the CSV).
print(len(job_descriptions))

262


### Keyword extraction

In [9]:
# Keyword extractor used by the extraction cells below; no model argument is
# passed, so KeyBERT falls back to its default embedding model.
model = KeyBERT()   # Here we use the default embedding model

In [10]:
# Print the extracted keyphrases for each module: phrases of one to three
# words, with English stop words removed.
for module in modules:
    keywords = model.extract_keywords(
        module,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
    )
    print(keywords)

[('computers represent data', 0.6239), ('data memory', 0.6137), ('data computer encoded', 0.5903), ('technologies storing data', 0.5883), ('data computer', 0.5762)]
[('analysis basics rstudio', 0.5192), ('write tests function', 0.4577), ('expect_error fahr_to_celsius', 0.4547), ('function make tests', 0.4541), ('error expect_error fahr_to_celsius', 0.4537)]
[('structure traversal trees', 0.7225), ('data structure traversal', 0.705), ('tree data structure', 0.7004), ('data structures looked', 0.6892), ('data structures', 0.6858)]
[('uc python testing', 0.6879), ('unittest python standard', 0.6413), ('unittest python', 0.6354), ('python testing', 0.6267), ('ubc python testing', 0.6235)]
[('web scraping vs', 0.6499), ('web scrapers web', 0.6407), ('scraping explain web', 0.6383), ('web data', 0.636), ('web scraping understand', 0.6294)]
[('sql declarative language', 0.6833), ('sql standard language', 0.6753), ('sql query languages', 0.666), ('language sql standard', 0.6473), ('sql standar

In [16]:
# Collect the distinct keywords found across all job descriptions.
jobs_dataset_keywords = set()

for desc in job_descriptions:
    # extract_keywords is called with its defaults here (no n-gram range or
    # stop-word arguments). Each result item is a tuple whose first element is
    # the keyword text, which is all we keep.
    job_keywords = {job_keyword[0] for job_keyword in model.extract_keywords(desc)}
    # Merge in place; union() would build a brand-new set on every iteration.
    jobs_dataset_keywords.update(job_keywords)

print(jobs_dataset_keywords)

{'databases', 'applying', 'microsoft', 'etls', 'developer', 'ideas', 'hdfs', 'mentor', 'insights', 'phd', 'agent', 'database', 'collaborate', 'engineering', 'supervised', 'debian', 'biologics', 'hospitality', 'linux', 'architecture', 'ml', 'visualizations', 'marketing', 'technologies', 'dashboards', 'pyspark', 'analyses', 'stakeholder', 'responsibilities', 'enterprise', 'sap', 'pipelines', 'gcp', 'crimes', 'gpu', 'administrative', 'healthcare', 'cuda', '5g', 'developing', 'schemas', 'advancements', 'vertexai', 'cloud', 'llms', 'technologist', 'data', 'databricks', 'affirm', 'arima', 'workflow', 'skills', 'forecasting', 'programming', 'gaming', 'collaborators', 'learning', 'internship', 'interns', 'software', 'bottlenecks', 'automated', 'sports', 'generative', 'acquisition', 'pricing', 'iot', 'tensorflow', 'productionizing', 'scrum', 'benchsci', 'communications', 'clio', 'amd', 'chatbots', 'researcher', 'prerequisite', 'development', 'spark', 'qualification', 'project', 'managing', 'mlf

In [17]:
from pprint import pprint
# Show how many distinct keywords were collected, then pretty-print the set
# (one keyword per line in the output) so it is easier to scan than the raw print.
print(len(jobs_dataset_keywords))
pprint(jobs_dataset_keywords)

287
{'3d',
 '5g',
 'academic',
 'acquisition',
 'administrative',
 'advancements',
 'advertising',
 'affirm',
 'agent',
 'agents',
 'agile',
 'ai',
 'aiml',
 'amd',
 'analyses',
 'analyst',
 'analysts',
 'analytics',
 'analyze',
 'analyzing',
 'anddataanalytics',
 'antibody',
 'applying',
 'architectural',
 'architecture',
 'architectures',
 'arima',
 'asr',
 'attributes',
 'automate',
 'automated',
 'automation',
 'autonomy',
 'aws',
 'azure',
 'azureml',
 'bachelors',
 'backend',
 'backtesting',
 'benchsci',
 'bi',
 'bigquery',
 'bioinformatics',
 'biologics',
 'boosting',
 'bottlenecks',
 'build',
 'business',
 'businesses',
 'capabilities',
 'catalog',
 'certification',
 'chatbot',
 'chatbots',
 'clients',
 'clio',
 'cloud',
 'cloudera',
 'cloudformation',
 'coinbase',
 'collaborate',
 'collaborating',
 'collaborators',
 'communications',
 'competencies',
 'consultants',
 'consulting',
 'consumer',
 'conversational',
 'cresta',
 'crime',
 'crimes',
 'cuda',
 'cv',
 'cybersecurity',

In [20]:
# Keywords retained for the next analysis step.
# NOTE(review): presumably curated by hand from jobs_dataset_keywords - confirm;
# entries are not validated against that set here.
important_keywords = ["agile", "architecture", "arima", "aws", "azure", "bachelors", "bigquery", "chatbots", "cloud", "collaborate", "dashboards", "database", "databricks", "cuda", "deploying", "development", "devops", "documentation", "ec2", "entrepreneurial", "etl", "excel", "experience", "forecasting", "gcp", "generative", "git", "gpu", "hadoop", "hive", "hiveql", "infrastructure", "insights", "kubernetes", "learning", "linux", "llms", "managing", "mapreduce", "masters", "mathematics", "mining", "ml", "mlops", "models", "modeling", "nlp", "nosql", "optimize", "phd", "pipelines", "postgresql", "predict", "programming", "python" ,"pyspark", "rdbms", "redis", "reports", "scala", "scalable", "schema", "scrum", "snowflake", "software", "spark", "sql", "sqlserver", "stakeholders", "supervised", "tableau", "tensorflow", "terraform", "training", "virtualization", "workflow"]

In [21]:
# Sanity check: number of entries in the important_keywords list.
print(len(important_keywords))

76


Next, we can try to find these keywords in the lectures corpus.