In [2]:
# Checking which environment we are working in
# !conda info --envs # check which env we are in
# !conda activate HCP-env # Activate the right environment

In [3]:
import sys
import os
import codecs
import re
import copy
import collections

import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
import json


In [4]:
nltk.download('stopwords') # downloads all stopwords to avoid

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15713\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords 

In [6]:
# getting a list of unecessary words like 'the', 'or',... to avoid
additional_words = ['edu', 'transcript', 'hour', 'program', 'experience', 'work', 'must', 'provide']
with open('add_stopwords.txt') as f:
    lines = f.readlines()
add_stopwords = lines[0].split(',')

avoid = stopwords.words('english') + add_stopwords + additional_words# adding our own word to the list


In [7]:
nltk.download('punkt') # downloads the tokenizer we need

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15713\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Function to tokenize a text file into sentences or phrases
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def get_sentence_counter(path=None, text=None):
    if path:
        path = os.getcwd()
        for file in os.listdir():
            if file.endswith('.txt'):
                with codecs.open(file, 'r') as f: 
                    text = f.read()
    else:
        text = text
    # raw_tokens = tokenizer.tokenize(text)
    # tokens = []
    # for token in raw_tokens:
    #     if '\n' in token:
    #         tmp = token.split('\n')
    #         for sentence in tmp:
    #             tokens.append(sentence)

    # # tmp = [[x for x in token.split('\n')] for token in raw_sentences]

    return tokenizer.tokenize(text)

In [9]:
# Function to tokenize a text file into words
def remove_stopwords(text, avoid):
#     tokens = WordPunctTokenizer().tokenize(PorterStemmer().stem(text))
    try:
        tokens = WordPunctTokenizer().tokenize(text)
        tokens = list(map(lambda x: x.lower(), tokens)) # converting everything to lower-case
        
        # cleaning up tokens by getting rid of stopwords and finding word characters
        tokens = [token for token in tokens if token not in avoid]

        sentence = ''
        for token in tokens:
            sentence = sentence + token + ' '
    except:
        sentence= ''
    return sentence

In [11]:
################################################################################
################################################################################
################################################################################
# INDEED

In [12]:
# Working with 'Master_Indeed_Dataset'
path = r'C:\Users\15713\Desktop\Datasets\csv_xlsx\Master_Indeed_Dataset.xlsx'
df = pd.read_excel(path)

In [13]:
df.head(2)

Unnamed: 0,Title,Company,Location,Links,Salary,Description
0,JUNIOR PENETRATION TESTER,Elevate,Remote,https://indeed.com/viewjob?jk=0a12694d75919f0c,$35 - $50 an hour,JUNIOR PENETRATION TESTER\nwww.elevateconsult....
1,Junior Penetration Tester USA Remote,BreachLock,Floridaâ€¢Remote,https://indeed.com/viewjob?jk=1c98609eab9789cc,,The Penetration Testing professional should ha...


In [14]:
# Creating a dataframe with title as index and description as an attribute
title, description = df['Title'].to_list(), df['Description'].to_list()
indeed_df = pd.DataFrame(data={"Title": title,
                               "Description": description} 
                         )
# removing rows with 'None' for their Job-Title
# indeed_df = indeed_df[indeed_df.Title != 'None']

In [16]:
indeed_df.head(3)

Unnamed: 0,Title,Description
0,JUNIOR PENETRATION TESTER,JUNIOR PENETRATION TESTER\nwww.elevateconsult....
1,Junior Penetration Tester USA Remote,The Penetration Testing professional should ha...
2,Red Team - Penetration Tester,Who are we?\nBreachLock is a security startup ...


In [17]:
# Converting the above dataframe into dictionary for iteration {title: description}
indeed = {}
cnt,cnt_=0,0
for (t, d) in zip(title, description): # loops through the list title and description
    if t in indeed and t!='None': # if the job title already exists, we lump the descriptions together
        cnt_+=1
        indeed[f'{t}_{cnt_}'] = d
    elif t=='None':
        cnt+=1
        indeed[f'{t}_{cnt}'] = d 
    else:
        indeed[t] = d # sets the title as key and the description as the value


In [19]:
# iterating through the Job's to tokenize into sentences/phrases
indeed_tokenized_dict = {}
for key in indeed:
    # if key != 'None':
    text = indeed[key]
    # Tokenize into sentence or phrase
    sentence_count = get_sentence_counter(text=text)
    indeed_tokenized_dict[key] = sentence_count
    


In [22]:
# Putting in dataframe with keys/job titles as titles and the values as the tokenized sentences just for visual purposes
df_concat = pd.DataFrame()
for k,v in indeed_tokenized_dict.items():
    # if k != 'None':
    df = pd.DataFrame({k: v})
    df_concat = pd.concat([df_concat, df], axis=1)



In [23]:
# df_concat.T.head(13)

In [24]:
# Here I will try to create 'bag of words' that could be used to identify if a sentence is refering to 
# ability, skill, knowledge or other
bag_A = [' able ', ' ability', ' abilities', 'perform']
bag_S = ['skill']
bag_K = [' know', 'known', 'knowledge', 'Knowledge']
bag_C = ['certification']
general_bag = ['understand', 'perform','execute', 'basic', 'capable', 'conduct','responsible']
storage = general_bag + bag_A+bag_C+bag_K+bag_S+bag_A
print(storage)
# Helper function to check whether the flag words are present in the sentence
def check_word(word_list,sentence):
    for word in word_list:
        return True if re.findall(word, text.lower()) else False


# helper function to extract certs if they are listed out in job descriptions by using a reference list from NIST 
def find_cert(sentence, cert_list):
    for cert in cert_list:
        tmp = cert.lower().replace('certified', '')
        if tmp in sentence:
            return cert
# Loading a simplified list of certs found from NIST
path = r'C:\Users\15713\Desktop\Datasets\csv_xlsx\cert_list.xlsx' 
cert_list = pd.read_excel(path, sheet_name='names') 
cert_list = cert_list['Name'].to_list()


['understand', 'perform', 'execute', 'basic', 'capable', 'conduct', 'responsible', ' able ', ' ability', ' abilities', 'perform', 'certification', ' know', 'known', 'knowledge', 'Knowledge', 'skill', ' able ', ' ability', ' abilities', 'perform']


In [36]:
# Now I will sort out the sentences into Ability, skill, knowledge or Certs and save it as a nested dict
Master_indeed_dict = {}
pattern = '[\s|^|^\s][A-Z(][A-Z]+[\s|)|,]'
# I_dict = {}
for k in indeed_tokenized_dict.keys():
    # print(f'k = {k}')
    K,S,A,C,All = [],[],[],[],[]
    for sent in indeed_tokenized_dict[k]:
        # print(f"sent ={sent} +'\n'")
        if check_word(storage,sent):
            sentence = remove_stopwords(sent, avoid) # Removing stopwords and reconstructing sentence
            # print(f"sentence ={sentence} +'\n'")
            All.append(sentence)
            # print(f"All ={All} +'\n'")
            # break
        if check_word(bag_K,sent):
            sentence = remove_stopwords(sent, avoid) # Removing stopwords and reconstructing sentence
            K.append(sentence)
        if check_word(bag_S,sent):
            sentence = remove_stopwords(sent, avoid) # Removing stopwords and reconstructing sentence
            S.append(sentence)
        if check_word(bag_A,sent):
            sentence = remove_stopwords(sent, avoid) # Removing stopwords and reconstructing sentence
            A.append(sentence)
        
        # Here, I tried to use regex to find patterns matching a valid Certificate by first
        # finding a sentence that contains the flag words
        if check_word(bag_C, sent):
            # print(f'\nsentence: {sent}') #printing the sentence to be checked for valid certs
            match = re.findall(pattern, sent) #checking words matching our pattern
            # print('found certs...')
            for x in match:
                x = re.sub('\W', '', x) #removes any non-word characters (space,symbols)
                C.append(x) #adding cleaned up string to the certs list
                # print(f'-{x}')
        # break
  
    Master_indeed_dict[k] = {'All':All,
                             'K':K,
                             'S':S,
                             'A':A,
                             'C':list(set(C))
                            }
   
# Exporting dictionary as a json file for use during sentence vectorization
with open('Master_indeed_dict.json', 'w') as f:
    json.dump(Master_indeed_dict, f)
    

In [37]:
# For Visualization
I_df =  pd.DataFrame.from_dict({(i,j): Master_indeed_dict[i][j] 
                           for i in Master_indeed_dict.keys() 
                           for j in Master_indeed_dict[i].keys()},
                       orient='index')

In [38]:
I_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
"(JUNIOR PENETRATION TESTER, All)",junior penetration tester www . elevateconsult...,"consists self managed , high caliber professio...","value exceptional client , solving coaching cl...","looking motivated , experienced self managed j...",contract hire starts competitive hourly rate e...,elevate looking junior penetration tester ( â ...,junior penetration tester assessments clients ...,"ideal candidate think creatively , independent...",seek enjoy learning add skillset quickly probl...,"interacts frequently clients , engagement , fi...",...,,,,,,,,,,
"(JUNIOR PENETRATION TESTER, K)",junior penetration tester www . elevateconsult...,"consists self managed , high caliber professio...","value exceptional client , solving coaching cl...","looking motivated , experienced self managed j...",contract hire starts competitive hourly rate e...,elevate looking junior penetration tester ( â ...,junior penetration tester assessments clients ...,"ideal candidate think creatively , independent...",seek enjoy learning add skillset quickly probl...,"interacts frequently clients , engagement , fi...",...,,,,,,,,,,
"(JUNIOR PENETRATION TESTER, S)",junior penetration tester www . elevateconsult...,"consists self managed , high caliber professio...","value exceptional client , solving coaching cl...","looking motivated , experienced self managed j...",contract hire starts competitive hourly rate e...,elevate looking junior penetration tester ( â ...,junior penetration tester assessments clients ...,"ideal candidate think creatively , independent...",seek enjoy learning add skillset quickly probl...,"interacts frequently clients , engagement , fi...",...,,,,,,,,,,
"(JUNIOR PENETRATION TESTER, A)",junior penetration tester www . elevateconsult...,"consists self managed , high caliber professio...","value exceptional client , solving coaching cl...","looking motivated , experienced self managed j...",contract hire starts competitive hourly rate e...,elevate looking junior penetration tester ( â ...,junior penetration tester assessments clients ...,"ideal candidate think creatively , independent...",seek enjoy learning add skillset quickly probl...,"interacts frequently clients , engagement , fi...",...,,,,,,,,,,
"(JUNIOR PENETRATION TESTER, C)",PENETRATION,OS,ABOUT,CPT,GPEN,CMWAPT,LPT,CRTOP,TIA,IT,...,,,,,,,,,,


In [39]:
################################################################################
################################################################################
################################################################################
# USAJobs

# Working with 'Master_Indeed_Dataset'
path = r'C:\Users\15713\Desktop\Datasets\csv_xlsx\USA Jobs Dataset.xlsx'
df = pd.read_excel(path)

df.head(2)



Unnamed: 0,Title,Salary,Organization,Job Description
0,IT Cybersecurity Specialist,,DEPARTMENT OF HOMELAND SECURITY,Duties\nSummary\nWho May Be Considered:\n\nU.S...
1,IT Cybersecurity Specialist INFOSEC,"$92,143 to $141,548 per year",DEPARTMENT OF HOMELAND SECURITY,Duties\nSummary\nThis announcement is issued u...


In [40]:
# Creating a dataframe with title as index and description as an attribute
title, description = df['Title'].to_list(), df['Job Description'].to_list()
usaJ_df = pd.DataFrame(data={"Title": title,
                               "Description": description} 
                         )
usaJ_df.head(3)

Unnamed: 0,Title,Description
0,IT Cybersecurity Specialist,Duties\nSummary\nWho May Be Considered:\n\nU.S...
1,IT Cybersecurity Specialist INFOSEC,Duties\nSummary\nThis announcement is issued u...
2,IT Cybersecurity Specialist (ENTARCH),Duties\nSummary\nThis announcement is issued u...


In [43]:
# Converting the above dataframe into dictionary for iteration {title: description}
usaJ = {}
cnt=0
for (t, d) in zip(title, description): # loops through the list title and description
    if t in usaJ and t!='None': # if the job title already exists, we lump the descriptions together
        usaJ[t]+=d
    elif t=='None':
        cnt+=1
        usaJ[f'{t}_{cnt}'] = d # sets the title as key and the description as the value
    else:
        usaJ[t] = d


# iterating through the Job's to tokenize into sentences/phrases
usaJ_tokenized_dict = {}
for key in usaJ:
    # if key != 'None':
    text = usaJ[key]
    # Tokenize into sentence or phrase
    sentence_count = get_sentence_counter(text=text)
    usaJ_tokenized_dict[key] = sentence_count

In [44]:
len(usaJ_tokenized_dict.keys())

58

In [45]:
# Now I will sort out the sentences into Ability, skill, knowledge or Certs and save it as a nested dict
USAJ_tokenized_dict = {}
pattern = '[\s|^|^\s][A-Z(][A-Z]+[\s|)|,]'
U_dict = {}
for k in usaJ_tokenized_dict.keys():
    K,S,A,C,All = [],[],[],[],[]
    for sentence in usaJ_tokenized_dict[k]:
        # if check_word(storage,sentence):
        s = remove_stopwords(sentence, avoid) # Removing stopwords and reconstructing sentence
        All.append(s)
        if check_word(bag_K,sentence):
            s = remove_stopwords(sentence, avoid) # Removing stopwords and reconstructing sentence
            K.append(s)
        if check_word(bag_S,sentence):
            s = remove_stopwords(sentence, avoid) # Removing stopwords and reconstructing sentence
            S.append(s)
        if check_word(bag_A,sentence):
            s = remove_stopwords(sentence, avoid) # Removing stopwords and reconstructing sentence
            A.append(s)
        
        # Here, I tried to use regex to find patterns matching a valid Certificate by first
        # finding a sentence that contains the flag words
        if check_word(bag_C, sentence):
            # print(f'\nsentence: {sentence}') #printing the sentence to be checked for valid certs
            match = re.findall(pattern, sentence) #checking words matching our pattern
            # print('found certs...')
            C.append(find_cert(sentence,cert_list))
            # print(f'-{find_cert(sentence,cert_list)}')
            for x in match:
                x = re.sub('\W', '', x) #removes any non-word characters (space,symbols)
                C.append(x) #adding cleaned up string to the certs list
                # print(f'-{x}')

  
    USAJ_tokenized_dict[k] = {'All':All,
                                'K':K,
                                'S':S,
                                'A':A,
                                'C':list(set(C))
                                }
    #for visualization purose only:
#     U_dict[k] = K+S+A+C
with open('USAJ_tokenized_dict.json', 'w') as f:
    json.dump(USAJ_tokenized_dict, f)
    

In [47]:
# len(USAJ_tokenized_dict.keys())

In [None]:
################################################################################
################################################################################
################################################################################
# SYLLABI

In [48]:
#working with the syllabi data from NICCS
path = r'C:\Users\15713\Desktop\Datasets\csv_xlsx\syllabi_niccs(final).csv'
NICCS = pd.read_csv(path, encoding='cp1252')

In [None]:
# NICCS.tail(3)

In [49]:
# Converting Columns needed to list for easier itteration
CourseTitle = NICCS['CourseTitle'].to_list()
WorkRole = NICCS['WorkRole'].to_list()
WorkRoleID = NICCS['WorkRoleID'].to_list()
Tasks = NICCS['WorkRoleDesc'].to_list()
Knowledge = NICCS['Knowledge'].to_list()
Skill = NICCS['Skills'].to_list()
Ability = NICCS['Abilities'].to_list()

In [50]:
# Saving each KSA under each work role as a nested dictionary
Syllabi_dict = {}
niccs_dict = {}
pattern = '[S|A|K][0-9]+[\S][\s]'
for i,WR in enumerate(WorkRole):
    K,S,A,All =[], [], [],[]
    if WR not in Syllabi_dict.keys(): # Avoiding duplicates
        # splitting to get each KSA & IDs, while also removing the A####,K####, S#### tags
        step_1 = re.sub(pattern, '', Knowledge[i]).split(".")
        for sent in step_1: # first split by period
            K+= remove_stopwords(sent, avoid).split(";") # take the list and split each text with ';' and store in a list
            
        step_1 = re.sub(pattern, '', Skill[i]).split(".")
        for sent in step_1: # first split by period
            S+= remove_stopwords(sent, avoid).split(";") # take the list and split each text with ';' and store in a list

        step_1 = re.sub(pattern, '', Ability[i]).split(".")
        for sent in step_1: # first split by period
            A+= remove_stopwords(sent, avoid).split(";") # take the list and split each text with ';' and store in a list
       
        
        All = K+S+A
        
        Syllabi_dict[WR] = {'ID': WorkRoleID[i],
                            'All': All, 
                             'K': K, 
                             'S': S, 
                             'A': A} #Saving as a nested dict
        # for visualization purposes only:
        niccs_dict[WR] = [WorkRoleID[i]] + K + S + A
        
# Exporting dictionary as a json file for use during sentence vectorization
with open('syllabi_NICCS.json', 'w') as f:
    json.dump(Syllabi_dict, f)

In [None]:
# for wr in Syllabi_dict.keys():
#     for ksa in Syllabi_dict[wr].keys():
#         if (len(Syllabi_dict[wr][ksa]))<=3:
#             print(Syllabi_dict[wr][ksa])

In [51]:
niccs_concat = pd.DataFrame()
for k,v in niccs_dict.items():
    if k != 'None':
        df = pd.DataFrame({k: v})
        niccs_concat = pd.concat([niccs_concat, df], axis=1)

niccs_concat.T.tail(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169,170,171,172,173,174,175,176,177,178
Research and Development Specialist,SP-TRD-001,"computer networking concepts protocols , netwo...",risk processes ( e,g,", methods assessing mitigating risk )","laws , regulations , policies , ethics relate ...",cybersecurity privacy principles,cyber threats vulnerabilities,operational impacts cybersecurity lapses,application vulnerabilities,...,,,,,,,,,,
Cyber Legal Advisor,OV-LGA-001,"computer networking concepts protocols , netwo...",risk processes ( e,g,", methods assessing mitigating risk )","laws , regulations , policies , ethics relate ...",cybersecurity privacy principles,cyber threats vulnerabilities,operational impacts cybersecurity lapses,concepts practices processing digital forensic...,...,,,,,,,,,,
All Source Analyst,AN-ASA-001,"computer networking concepts protocols , netwo...",risk processes ( e,g,", methods assessing mitigating risk )","laws , regulations , policies , ethics relate ...",cybersecurity privacy principles,cyber threats vulnerabilities,operational impacts cybersecurity lapses,human computer interaction principles,...,,,,,,,,,,


In [52]:
syllabi_niccs_df =  pd.DataFrame.from_dict({(i,j): Syllabi_dict[i][j] 
                           for i in Syllabi_dict.keys() 
                           for j in Syllabi_dict[i].keys()},
                       orient='index')

In [53]:
syllabi_niccs_df.head(5)

Unnamed: 0,0
"(Network Operations Specialist, ID)",OM-NET-001
"(Network Operations Specialist, All)","[computer networking concepts protocols , netw..."
"(Network Operations Specialist, K)","[computer networking concepts protocols , netw..."
"(Network Operations Specialist, S)",[skill analyzing network traffic capacity perf...
"(Network Operations Specialist, A)","[operate network equipment hubs , routers , sw..."


In [None]:
################################################################################
################################################################################
################################################################################
# Certs

In [54]:
#working with the Certificate data from NICCS
path = r'C:\Users\15713\Desktop\Datasets\csv_xlsx\cert_list.xlsx' 
Certs = pd.read_excel(path) 

In [55]:
Certs.head(2)

Unnamed: 0,Cert Name,Cost,Time (hrs),Avg Salary of Recipient,Number of Holders,Knowledge,Skills,Abilities
0,CISSP (Certified Information Systems Security ...,699,40-50,122138.0,92976.0,K0001: Knowledge of computer networking concep...,S0011: Skill in conducting information searche...,A0002: Ability to match the appropriate knowle...
1,CISA (Certified Information Systems Auditor),595,112,132278.0,150000.0,K0001: Knowledge of computer networking concep...,S0027: Skill in determining how a security sys...,A0162: Ability to recognize the unique aspects...


In [None]:
cert_name = Certs['Cert Name'].to_list()
c_K = Certs['Knowledge'].to_list()
c_S = Certs['Skills'].to_list()
c_A = Certs['Abilities '].to_list()

In [None]:
# Saving each KSA under each Cert as a nested dictionary
Cert_dict = {}
c_dict = {}
pattern = '[S|A|K][0-9]+[\S][\s]' 
for i,cert in enumerate(cert_name):
    K,S,A,All= [],[],[], []
    if cert not in Cert_dict.keys(): # Avoiding duplicates
        k = 'None' if pd.isna(c_K[i]) else re.sub(pattern, '', c_K[i]).split("\n")
        s = 'None' if pd.isna(c_S[i]) else re.sub(pattern, '', c_S[i]).split("\n")
        a = 'None' if pd.isna(c_A[i]) else re.sub(pattern, '', c_A[i]).split("\n")
        
        for sentence in k:
            # Removing stopwords and reconstructing sentence
            K.append(remove_stopwords(sentence, avoid))
        for sentence in s:
            # Removing stopwords and reconstructing sentence
            S.append(remove_stopwords(sentence, avoid))
        for sentence in a:
            # Removing stopwords and reconstructing sentence
            A.append(remove_stopwords(sentence, avoid))
        All = K+S+A
        Cert_dict[cert] = {'All':All,
                           'K': K, 
                           'S': S, 
                           'A': A} 
        #Saving as a nested dict
        #for visualization purpose only:
        c_dict[cert] = All
# Exporting data
with open('Certs.json', 'w') as f:
    json.dump(Cert_dict, f)

In [None]:
# c_dict = pd.DataFrame()
# for k,v in Cert_dict.items():
#     if k != 'None':
#         df = pd.DataFrame({k: v})
#         c_dict = pd.concat([c_dict, df], axis=1)

# c_dict.T.head(5)

In [None]:
cert_niccs_df =  pd.DataFrame.from_dict({(i,j): Cert_dict[i][j] 
                           for i in Cert_dict.keys() 
                           for j in Cert_dict[i].keys()},
                       orient='index')

In [None]:
# Cert_dict['CISA (Certified Information Systems Auditor)']['A']

In [None]:
############## HERE I JUST WANTED TO TRY TO GROUP SIMILAR JOB-TITLES(JOB POSTINGS) AND JOB-ROLES(SYLLABI) TOGETHER
############## USING "THE MOST COMMON WORD" TECHNIQUE
# function that returns the number of common words in two sentences

def common_words_count(s1, s2):
    count = 0
    # Convert sentence to all lower case and split them into words
    try:
        l1 = s1.lower().split(' ') if s1==s1 else None
        l2 = s2.lower().split(' ') if s2==s2 else None

        #Lemmetization if necessary (but for now, i will try it without)

        if l1 is not None:
            for w1 in l1:
                if l2 is not None and w1 in l2:
                    count+=1
    except:
        pass
    return count



In [None]:
# Here I tried grouping similar jobs together just by looking at their titles
# using the common words they share. But as there are plenty of jobs that share
# similar names but have completely different KSAs, I didn't want to risk lumping different
# work roles together just because they shared a title. Therefore, We didn't follow up with this method
min_count = 4

similar = {}
others = []
for K in Master_indeed_dict.keys():
#     print(K)
#     print(sim)
    for k in WR:
        sim = []
        if k==k and K==K: # making sure that we don't have a nan 
            K_words = K.lower().split(' ')
            k_words = k.lower().split(' ')
            
                                 
            if len(K_words) > len(k_words):
                # assumption: We can not have more common words than the word count of the smallest job-title
                min_count = len(k_words) if min_count > len(k_words) else min_count
                for word in K_words:
                    if word in k_words:
                        sim.append(word)
            else:
                # assumption: We can not have more common words than the word count of the smallest job-title
                min_count = len(K_words) if min_count > len(K_words) else min_count
                for word in k_words:
                    if word in K_words:
                        sim.append(word)
            if len(sim) >= min_count:
                if k not in similar.keys():
                    similar[k] = []
                    similar[k].append(K)
                else:
                    similar[k].append(K)
            else: #if the job title doesn't fall under the 52 work roles
                others.append(K)
            
# Exporting dictionary as a json file for use during sentence vectorization
data = similar
with open('similar.json', 'w') as f:
    json.dump(data, f)       

In [None]:
# others
