# README.txt

Zip the `/data` folder from the GitHub repo (as `data.zip`), then drag and drop the zip file into the Colab file browser.

< - - - - -

In [1]:
# then run this and refresh directory...

# unzip datasets
!unzip data.zip

Archive:  data.zip
   creating: data/job-postings/
  inflating: data/job-postings/Data_Job_NY.csv  
  inflating: data/job-postings/Data_Job_SF.csv  
  inflating: data/job-postings/Data_Job_TX.csv  
  inflating: data/job-postings/Data_Job_WA.csv  
  inflating: data/README.txt         
   creating: data/resumes/
  inflating: data/resumes/kaggleResumes.csv  




---


# API

In [9]:
!pip install config

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting config
  Downloading config-0.5.1-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.1


In [35]:
import requests
import os
import config
# API Docs found here: https://developer.usajobs.gov/Tutorials/Search-Jobs

In [38]:
# Credentials are read from the environment so they are never stored in the notebook.
# Set them before running this cell, e.g. in Colab:
#   %env US_JOBS_API_KEY=<your USAJOBS API key>
#   %env EMAIL_ADDRESS=<the email address registered with that key>
# NOTE(review): a key was previously hardcoded in this cell; treat it as exposed and rotate it.
_missing_credentials = [
    name for name in ("US_JOBS_API_KEY", "EMAIL_ADDRESS") if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_credentials)
    )

config.US_JOBS_API_KEY = os.environ["US_JOBS_API_KEY"]
config.EMAIL_ADDRESS = os.environ["EMAIL_ADDRESS"]

Note: This API is presumably rate-limited, so avoid sending unnecessary requests; we don't want to lose access. -tyler

In [39]:
# Query the USAJOBS search endpoint and print one sample posting.
host = 'data.usajobs.gov'
# Both values are read from the `config` namespace populated in the cell above.
userAgent = config.EMAIL_ADDRESS
authKey = config.US_JOBS_API_KEY

base_url = "https://data.usajobs.gov/api/search"

# Search filters sent as query-string parameters.
parameters = {
    "JobCategoryCode": 2210,
    "Keyword": "Software Development",
    "LocationName": "Washington, DC"
}

# Request headers built from the host and the two credential values.
headers = {
    "Host": host,
    "User-Agent": userAgent,
    "Authorization-Key": authKey
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() turns an HTTP error (bad key, throttling, ...) into an
# explicit exception instead of a confusing KeyError further down.
resp = requests.get(base_url, headers=headers, params=parameters, timeout=30)
resp.raise_for_status()
result = resp.json()['SearchResult']['SearchResultItems']

# Index of the search hit to display (the second hit, as in the original demo).
SAMPLE_INDEX = 1

if len(result) > SAMPLE_INDEX:
    descriptor = result[SAMPLE_INDEX]['MatchedObjectDescriptor']
    # get Job Title
    print(descriptor['PositionTitle'])
    # get Job Summary
    print(descriptor['UserArea']['Details']['JobSummary'])
else:
    # Fewer hits than expected: report it rather than raising IndexError.
    print('Search returned', len(result), 'result(s); nothing to show at index', SAMPLE_INDEX)
# more parameters are found here: https://developer.usajobs.gov/API-Reference/GET-api-Search

Supervisory IT Program Manager (APPSW)/Assistant Director, Development Services
This position is located in Criminal Division's Office of Administration, Information Technology Management (ITM) unit and serves as the Assistant Director, Development Services. The Assistant Director has responsibility for the management of internal and external software services, including custom application development, intranet support, SharePoint, software service providers, and all related cyber security functions.




---


# KeyBERT Extraction Function

In [None]:
!pip install keybert

In [30]:
# Imports
from keybert import KeyBERT # pip install keybert (give it a minute...)

# Lazily-created KeyBERT model, shared by every call so the model is only
# constructed once per kernel session instead of once per document.
_KEYBERT_MODEL = None


def _getKeyBERTModel():
    """Return the shared KeyBERT model, constructing it on first use."""
    global _KEYBERT_MODEL
    if _KEYBERT_MODEL is None:
        _KEYBERT_MODEL = KeyBERT(model = 'all-mpnet-base-v2')
    return _KEYBERT_MODEL


def extractKeywordsBERT(normalized_corpus):
    """Extract keywords from a corpus with the KeyBERT keyword extraction tool.

    Prints a two-column table of the extracted keywords and their scores,
    then returns both columns.

    Author: Tyler Parks
    Created On: 10/30/22

    Parameters:
        normalized_corpus -- A single string containing all text of the
                             normalized corpus.

    Returns:
        keywords -- List of collected keywords
        scores -- List of those keywords' scores
        Both lists are empty when KeyBERT returns no keywords.

    References: https://maartengr.github.io/KeyBERT/#usage
    """
    print('---KeyBert Extraction---')
    print('------------------------\n')

    # reuse the shared language model
    language_model = _getKeyBERTModel()

    # extract those keywords!
    # NOTE(review): per the KeyBERT docs nr_candidates is only consulted when
    # use_maxsum=True, so it is presumably inert here; kept to preserve the call.
    data = language_model.extract_keywords( normalized_corpus,
                                            keyphrase_ngram_range=(1, 3),
                                            stop_words='english',
                                            use_maxsum=False,
                                            use_mmr=True,
                                            diversity=0.7,
                                            nr_candidates=20,
                                            top_n=15
                                        )

    # Split the (keyword, score) pairs into two parallel lists. Unlike
    # zip(*data)[0], this also works when `data` is empty.
    keywords = [keyword for keyword, _ in data]
    scores = [score for _, score in data]

    print('-Skill-'.ljust(40), '-Score-')
    for keyword, score in zip(keywords, scores):
        print(keyword.ljust(40), score)
    print()

    return keywords, scores



---


# CSO-Classifier Extraction Function

In [None]:
!python -m spacy download en_core_web_sm
!pip install cso-classifier

**Do not restart the runtime, even if the install output asks you to. Keep going.**

In [27]:
import nltk
nltk.download('stopwords')

import spacy
from cso_classifier import CSOClassifier      # import classifier tool

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# Update the classifier's resources to the most recent version. The output
# below shows the ontology being checked and the "cached" and "word2vec"
# models being downloaded.
# Requires CSOClassifier to be imported already (done in the import cell above).
CSOClassifier.update() 

# Define the model object. extractKeywordsCSO() calls CSO_Extractor.run() and
# reads the 'union' entry of its result.
# NOTE(review): modules/enhancement/explanation are cso-classifier options;
# presumably "both" enables the syntactic and semantic modules -- confirm
# against the cso-classifier documentation.
CSO_Extractor = CSOClassifier(modules = "both", enhancement = "first", explanation = False)


#     ONTOLOGY
The ontology is already up to date.

#     MODELS: CACHED & WORD2VEC
Updating the models: cached and word2vec
[██████████████████████████████████████████████████] 63M/63M
[*] Done!
[██████████████████████████████████████████████████] 349M/349M
[*] Done!
Models downloaded successfully.
Update completed.


In [31]:
def extractKeywordsCSO(normalized_corpus):
  """Run the module-level CSO_Extractor on a corpus and print/return its topics.

  Parameters:
      normalized_corpus -- The text passed to CSO_Extractor.run().

  Returns:
      The 'union' entry of the extractor's result; each of its items is
      printed on its own line beneath a banner.
  """
  # run the extraction first, so the banner only follows a completed run
  extraction = CSO_Extractor.run(normalized_corpus)

  for banner_line in ('-----CSO Extraction-----', '------------------------\n'):
    print(banner_line)

  union_topics = extraction['union']
  for topic in union_topics:
    print(topic)

  return union_topics



---


# Driver Code

In [15]:
# Imports
import pandas as pd     # pip install pandas. usage: loading data from csv files into dataframes

In [33]:
### Helper Functions

# Function to retrieve text data.
# (either 1 or more job postings or resumes)?
def getFileData(filename, dir):
    """Read the CSV file at data/<dir>/<filename> into a pandas DataFrame.

    Parameters:
        filename -- Name of the CSV file, e.g. 'Data_Job_TX.csv'.
        dir -- Name of the sub-folder of 'data' that holds the file.

    Returns:
        The DataFrame produced by pd.read_csv for that path.
    """
    csv_path = '/'.join(('data', dir, filename))
    return pd.read_csv(csv_path)

# Function to normalize text data.
# (some skill extraction tools will normalize text for us; however, if not, this function is here)
# includes removing stopwords, punctuation, dates, links, etc...
def normalizeCorpus(corpus):
    """Placeholder for text normalization; currently a pass-through.

    Parameters:
        corpus -- The text to normalize.

    Returns:
        The same `corpus` object, unchanged (no normalization is implemented yet).
    """
    # for now: hand the input straight back
    untouched = corpus
    return untouched

# Function to extract skill words from a given corpus.
def extractSkills(corpus):
    """Run every extraction method on `corpus` and return what each one found.

    Each extractor prints its own report as a side effect.

    Parameters:
        corpus -- The text handed to each extractor.

    Returns:
        A dict with three entries:
            'keybert'        -- keywords returned by extractKeywordsBERT
            'keybert_scores' -- scores returned by extractKeywordsBERT
            'cso'            -- value returned by extractKeywordsCSO
        Earlier versions returned None; callers that ignore the return
        value are unaffected.
    """
    # run KeyBert Extraction
    keywordsBERT, scoresBERT = extractKeywordsBERT(corpus)

    # run CSO Classifier Extraction
    keywordsCSO = extractKeywordsCSO(corpus)

    # extraction method 2
    # extraction method n...
    # keep going!

    # Hand the results back instead of discarding them.
    return {
        'keybert': keywordsBERT,
        'keybert_scores': scoresBERT,
        'cso': keywordsCSO,
    }

### Driver Code

def _extractFromSamples(label, corpus, num_samples):
    """Run normalization and skill extraction on the first `num_samples` documents.

    Parameters:
        label -- Name printed in each document's header, e.g. 'Job Posting'.
        corpus -- List of document texts.
        num_samples -- How many documents from the start of `corpus` to process.
    """
    for number, document in enumerate(corpus[:num_samples], start=1):
        print(label, '#', number)
        print()

        text = normalizeCorpus(document)
        extractSkills(text)

        # print lines, we are done with this document
        print('------------------------\n')


if __name__ == '__main__':

    # fetch the data
    job_posting_dataframe = getFileData('Data_Job_TX.csv'  , 'job-postings')
    resume_dataframe      = getFileData('kaggleResumes.csv', 'resumes'     )
    #---------------

    # To inspect the loaded data, print a small preview rather than the whole frame:
    #   print(job_posting_dataframe.head())
    #   print(resume_dataframe.head())

    # fetch the job descriptions and resumes by themselves
    jpCorpus = list(job_posting_dataframe['Job_Desc'])
    rCorpus  = list(resume_dataframe['Resume'])
    #----------------------

    # Number of both job posting and resume samples to view.
    # This is now the exact count processed; the earlier `i > NUM_SAMPLES`
    # check with NUM_SAMPLES = 0 processed one sample, so 1 keeps that output.
    NUM_SAMPLES = 1

    # for each JOB POSTING from the corpus
    _extractFromSamples('Job Posting', jpCorpus, NUM_SAMPLES)
    #----------------------

    # for each RESUME from the corpus
    _extractFromSamples('Resume', rCorpus, NUM_SAMPLES)
    #----------------------

    
# keep going!

# end of driver code
#---------------

Job Posting # 1

---KeyBert Extraction---
------------------------

-Skill-                                  -Score-
senior data scientist                    0.6052
products alarm monitoring                0.3292
summary brinks home                      0.281
forest regression designing              0.2549
join team trusted                        0.2381
azure aws google                         0.2262
acquisition customer retention           0.2061
sentiment analysis gradient              0.177
efforts requirements                     0.1714
clearly communicate model                0.151
true                                     0.1165
libraries thorough understanding         0.114
excellent                                0.1049
america                                  0.0347
action true responsive                   0.0209

-----CSO Extraction-----
------------------------

computer science
sales
business intelligence
smart homes
machine learning
communication
engineers
engineering
rando