# City of Los Angeles - Job Bulletins/Descriptions
- Helping the City of Los Angeles to Structure and Analyze its Job Descriptions Using NLTK.

In [161]:
# Libraries

# Install spaCy Library for Advanced Natural Language Processing in Python
!pip install spacy

In [225]:
# Dependencies & Setup

import os, glob, sys
import spacy
from spacy.matcher import PhraseMatcher

In [231]:
# Retrive Data

# Open All Files in Directory
all_files = os.listdir("data/Job Bulletins")
# print(all_files)

for files in all_files:
    if (files != '.ipynb_checkpoints'):
        try:
            f = open("data/Job Bulletins/" + files, "r")
            contents = f.read()
        except:
            break         
# print(contents)

In [228]:
for all_files in glob.glob('*.txt'):
    print(all_files)

In [260]:
# Get file names and store them
files = []
folder_path = "data/Job Bulletins"
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        x = 2
        try:
            text = f.read()
            files.append(filename)
        except:
            files.append('None')

# for file in files:
#     file = file[+19:]
#     print(file)

In [261]:
# Create NLP pipeline

# nlp = English()
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner')
else:
    nlp.get_pipe('ner')

In [270]:
# Create known lables and their entities

label = 'COLDEG'
matcher = PhraseMatcher(nlp.vocab)
for i in ["bachelor's degree", "university", "four-year college"]:
    matcher.add(label, None, nlp(i))

In [208]:
# Define the offest function to turn string indexes into item indexes 

def offsetter(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

In [266]:
# Warning ⚠️: Will take a while if used on every file, recommend 
# using a few for testing.
# Create docs and entities to train the model with the labels created

res = []
to_train_ents = []

for file_name in files:
    if (file_name != 'None'):
        with open(f'{file_name}') as jb:
            line = True
            while line:
                line = jb.readline()
                mnlp_line = nlp(line)
                matches = matcher(mnlp_line)
                res = [offsetter(label, mnlp_line, x)
                      for x
                      in matches]
                to_train_ents.append((line, 
                                      dict(entities=res)))

In [213]:
# Warning ⚠️: This will use a lot of computer resources to run and will take a while, recommend 
# running on 1 epoch for testing. Although have on 20 for complete model.
# Train the model

optimizer = nlp.begin_training()

other_pipes = [pipe
              for pipe
              in nlp.pipe_names
              if pipe != 'ner']

# Epoch setting
epoch = 20
with nlp.disable_pipes(*other_pipes): # Only train NER
    for itn in range(epoch):
        losses = {}
        random.shuffle(to_train_ents)
        for item in to_train_ents:
            nlp.update([item[0]],
                       [item[1]],
                       sgd=optimizer,
                       drop=0.35,
                       losses=losses)
print(losses)

{'ner': 2.7992655596875395}


In [271]:
# Test label-matcher

one = nlp("In order to apply for this job you need at least one bachelor's degree or a four-year college.\
          and/or university")
matches = matcher(one)
[match for match in matches]

[(8070860030552988276, 12, 15),
 (8070860030552988276, 17, 21),
 (8070860030552988276, 24, 25)]

In [272]:
# Test built-in label and entity matcher

to_analyze = ("Hello Code & Supply, my name is Josh, and tonight we're in Pittsburgh.")
doc = nlp(to_analyze)
ents = [(x.text, x.label_)
       for x in doc.ents]
print(ents)

[('Josh', 'PERSON'), ('tonight', 'TIME'), ('Pittsburgh', 'GPE')]


In [212]:
# Test docs

to_train_ents = to_train_ents[:-1]
for item in to_train_ents:
            print([item[0]])

["This film contains strong language, They reckon that, when you're drowning,\n"]
['you see your life flash before you.\n']
["I didn't.\n"]
['HEAVY BREATHING THROUGHOU - Not like that, dickhead!\n']
['- Sorry.\n']
["- I've not got anything.\n"]
["- S'OK, it's a safe day.\n"]
['- Monday?\n']
['No, knobhead, me cycle!\n']
['HEAVY BREATHING CONTINUES\n']
["Can't you go on top?\n"]
["I've just had a double cheeseburger. I'd hurl. Just get on with it!\n"]
["Hurry up, or I'll miss me bus!\n"]
['Shit. I broke me nail!\n']
['SPLASHING\n']
['CHEERING\n']
['Look, just calm down, will you, Jackie?\n']
["Look, I can't get a word in edgeways here!\n"]
["This isn't what I had in mind, that's all.\n"]
['Oh, look, I feel stupid.\n']
['Not only do I feel stupid, I look stupid!\n']
['All the kids are just taking the piss.\n']
["It's all right for you, I'd like to see you walking around like this.\n"]
['Oi! Come here, you, you little shit!\n']
["Get here, you little shit! I'll have you when I get hold of

In [214]:
# Test model

from spacy import displacy
for item in to_train_ents:
    displacy.render(one, style='dep')

# TO-DO:
- Data Cleaning:
    - Get rid of all unnecessary spaces in docs.
- Data Parsing:
    -
- Data Examination:
    - What biases (positive/negative) are we trying to find? **Important
    - How will we optimize each doc? (i.e. Skills, Pay, etc.) 
    - Are there important entities we should establish? (i.e. college degree, certain responsibilites, etc.)
    - What text analyis could make these docs better?