In [1]:
# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.test.utils import datapath

from gensim.models.phrases import Phrases, Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy
from textacy import extract

import re
import os

def create_list_from_csv(path):
    corpus = []
    with open(path, 'r') as file:
        lines = file.readlines()
        for line in lines:
            columns = line.split(',')   
            # Columns 1 and 2 contain the company name and the job title, both guaranteed to not include commas, and both separated by a comma. 
            # We are not analyzing this information, so we can safely discard the first two columns.
            # The third "column" contains the job description, but it may contain commas, so we use ",".join() to concatenate all the columns after the second one.
            # csv.reader()'s quotechar parameter does not seem to work for whatever reason, and this just seemed faster. 
            description = ",".join(columns[2:]).strip('"')      # strip('"') to remove leading and trailing quotes
            corpus.append(description)

    return corpus

In [3]:
data = create_list_from_csv('../jobs.csv') 

In [7]:
print(len(data))
print(len(data[-1]))
print(data[-1][:200])

262
2051
 "What You'll Do Apply statistical and machine learning techniques to process and analyze unstructured textual data Develop and finetune machine learning models for tasks such as entity recognition, c


In [8]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  

# Add custom stop words 
nlp.Defaults.stop_words |= {"experience", "preferred", "skill", "yelp", "strong", "work", "solutions", "drive", "insights", "use", "needs", "responsibilities", "do", "particularly", "related", "leak", "radio", }

In [9]:
def clean_without_ngrams(doc):

    spacy_doc = nlp(doc.lower())

    # Remove stopwords, punctuation, and numeric tokens
    tokens = [
        token.text 
        for token in spacy_doc 
        if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
            and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters       
            and not len(token.text) == 1                                                            # Remove single character words
            # and token.pos_ in ["NOUN", "ADJ", "VERB", "ADV"]                                        # Keep only nouns, adjectives, verbs, and adverbs
    ]    
                                                                           
    return tokens

In [22]:
def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    tokens = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        sent_tokens = []
        for token in doc: 
            if "_" in token.text:
                sent_tokens.append(token.text)
            else:
                if token.pos_ in allowed_postags:
                    sent_tokens.append(token.lemma_)
                    
        sent_tokens = [token.replace("datum", "data") for token in sent_tokens]
        tokens.append(sent_tokens)

    return tokens

In [10]:
data_words = [clean_without_ngrams(doc) for doc in data]

In [23]:
bigram = Phrases(data_words, min_count=10, threshold=20) 
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

data_words_bigrams = make_bigrams(data_words)

In [24]:
data_lemmatized = lemmatize(data_words_bigrams)

In [25]:
sum = 0
for doc in data_lemmatized:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 36738


In [26]:
print(len(data_lemmatized))
print(data_lemmatized[0][:20] + data_lemmatized[0][-20:])

262
['data', 'collection', 'cleaning', 'assist', 'collection', 'organization', 'data', 'data', 'clean', 'preprocessing', 'activity', 'ensure', 'data', 'accuracy', 'data', 'analysis', 'utilize', 'statistical', 'method', 'tool', 'familiarity', 'data', 'analysis', 'tool', 'power_bi', 'excel', 'attention_detail', 'analytical', 'skill', 'good', 'communication_skills', 'ability', 'collaboratively', 'team', 'eagerness', 'learn', 'adapt', 'new', 'technology', 'technique']


In [27]:
bigrams = []

for data_lemmatized_doc in data_lemmatized:    
    for word in data_lemmatized_doc:
        if "_" in word:
            bigrams.append(word)

In [28]:
print(f"Number of ngrams: {len(bigrams)}")
unique_bigrams = set(bigrams)
print(f"Number of unique ngrams: {len(unique_bigrams)}")
print(unique_bigrams)

Number of ngrams: 3008
Number of unique ngrams: 101
{'tensorflow_pytorch', 'industry_trends', 'open_source', 'product_managers', 'state_art', 'interpersonal_skills', 'apache_spark', 'communication_skills', 'deep_learning', 'comprehensive_documentation', 'continuous_integration', 'verbal_communication', 'best_practices', 'functional_teams', 'relational_databases', 'etl_elt', 'generative_ai', 'end_end', 'bachelor_master', 'updated_latest', 'computer_science', 'excellent_communication', 'high_quality', 'collect_clean', 'cloud_platforms', 'attention_detail', 'large_scale', 'qualifications_bachelor', 'collaborate_cross', 'stay_date', 'problem_solving', 'google_cloud', 'develop_maintain', 'decision_making', 'production_environments', 'excellent_written', 'non_technical', 'natural_language', 'solving_skills', 'mathematics_physics', 'apache_airflow', 'fine_tuning', 'gcp_services', 'docker_kubernetes', 'required_qualifications', 'degree_computer', 'solve_complex', 'cloud_composer', 'programming