### Meta data for Jobs 

This notebook explores the jobs data to see how the texts can be cleaned, while also storing metadata to a file. This notebook should be converted to a script when running on the full data.

In [1]:
import zipfile

In [2]:
import xml.etree.cElementTree as ET
import os

In [3]:
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, stem_text

In [4]:
# 2018 US_XML_AddFeed_20180903_20180909.zip
# 2018 US_XML_AddFeed_20180604_20180610.zip
# 2018 US_XML_AddFeed_20181008_20181014.zip

In [5]:
# Which weekly feed archive to inspect: the year sub-directory and the zip file inside it.
folder, filename = (
    "2018",
    "US_XML_AddFeed_20181008_20181014.zip",
)

In [6]:
# Build the archive location from the configured folder and file name, then open it for reading.
zip_path = "/project2/jevans/BG/Text_Data/" + folder + "/" + filename
zfile_0 = zipfile.ZipFile(zip_path)

In [7]:
# Parse the first member of the archive and keep the root element, whose children are
# iterated as individual job records further down.
# The member is opened in a `with` block so its file handle is released as soon as the
# tree has been built, instead of staying open for the lifetime of the kernel.
with zfile_0.open(zfile_0.infolist()[0]) as feed:
    jobs = ET.parse(feed).getroot()

In [8]:
# Accumulators for the cleaned token lists of each posting, keyed by city and by
# organisation respectively. Two separate dict objects are created.
jobs_texts_cities, jobs_texts_orgs = dict(), dict()

In [26]:
# Frequent lemmas that are candidates for the custom stop-word list but have not been
# added to it. Nothing else in the visible part of the notebook reads this list.
maybes = [
    "service", "customer", "process", "ensure",
    "care", "knowledge", "project", "client",
    "report", "plan", "relate", "assist",
    "learn", "function", "understand", "practice",
    "organization", "partner", "relationship", "family",
    "appropriate", "insurance",
]

In [28]:
# Domain-specific stop words (lemmas) that are common to almost every job posting and
# therefore carry no signal. They are flagged as stop words in the spaCy vocabulary and
# are also compared against token lemmas in clean_text.
#
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (e.g. "employer" "follow" becomes "employerfollow"), which
# previously dropped ten intended words from the list. Duplicates were removed as well.
my_stop_words = [
    "hours", "job", "description", "employment", "work", "experience", "apply", "company",
    "skill", "include", "position", "year", "require", "team", "time", "ability", "provide",
    "need", "opportunity", "support", "requirement", "perform", "maintain",
    "location", "train", "duty", "responsibility", "application", "save", "prefer", "level", "benefit",
    "qualification", "staff", "office", "able", "employer",
    "follow", "strong", "member", "and/or", "status", "procedure",
    "disability", "degree", "department", "base", "applicant", "offer", "standard", "individual",
    "write", "complete", "career", "well", "candidate", "hire",
    "equal", "license", "test", "look", "like", "right", "activity", "group",
    "excellent", "date", "performance", "national", "document",
    "role", "contact", "minimum", "task", "issue", "qualify", "great", "general",
    "travel", "goal", "essential", "direct", "email", "field", "request", "clean",
    "term", "life", "view", "rate", "pay", "certification", "join", "current", "federal",
    "salary",
]

In [10]:
import spacy

In [11]:
# Load the spaCy English pipeline through the "en" shortcut name; the resulting `nlp`
# object is used for tokenising and lemmatising posting texts in clean_text.
# NOTE(review): the "en" shortcut is believed to be a spaCy 2.x convenience; newer releases
# expect a full package name such as "en_core_web_sm" — confirm against the installed version.
nlp = spacy.load("en")

In [12]:
# Mark every custom stop word on its vocabulary entry so that the `is_stop` check in
# clean_text can reject it.
for stopword in my_stop_words:
    nlp.vocab[stopword].is_stop = True

In [13]:
def clean_text(text, max_len=1500000):
    """Tokenise a posting and return the lemmas of the tokens worth keeping.

    The text is cut to at most ``max_len`` characters, lower-cased and run through
    the module-level ``nlp`` pipeline with the parser, tagger and NER components
    disabled. ``nlp.max_length`` is set to ``max_len`` on every call.

    A token is kept only when all of the following hold:
      * it is not flagged as a stop word, as punctuation or as number-like;
      * its text is longer than three characters;
      * its text contains none of: newline, carriage return, tab, vertical tab,
        non-breaking space, ')', '’', '@', 'https';
      * its lemma is not listed in ``my_stop_words``.

    Parameters
    ----------
    text : str
        Raw posting text.
    max_len : int, optional
        Maximum number of characters processed; longer input is truncated.

    Returns
    -------
    list of str
        Lemmas of the kept tokens, in document order.
    """
    # Any token whose text contains one of these fragments is discarded.
    rejected_fragments = ('\n', '’', ')', '\xa0', '\r', '\t', '\v', "@", "https")

    nlp.max_length = max_len
    if len(text) > max_len:
        text = text[:max_len]

    kept_lemmas = []
    for token in nlp(text.lower(), disable=["parser", "tagger", "ner"]):
        surface = token.text
        if token.is_stop or token.is_punct or token.like_num:
            continue
        if len(surface) <= 3:
            continue
        if any(fragment in surface for fragment in rejected_fragments):
            continue
        if token.lemma_ in my_stop_words:
            continue
        kept_lemmas.append(token.lemma_)
    return kept_lemmas

In [14]:
# Clean every usable posting and file its token list under its city and its organisation.
# Fields are addressed by child position: 3 = city, 7 = posting text, 13 = organisation;
# position 4 is presumably the country (it is compared against 'USA').
for job in jobs:
    # Tested first so that non-'USA' records are skipped before child 7 is accessed.
    if job[4].text != 'USA':
        continue
    # Skip postings without text or with a placeholder body.
    if job[7].text is None or job[7].text in ("jtext dummybgt", "None"):
        continue

    text = clean_text(job[7].text.lower())

    city = job[3].text
    if not (city is None or city == "None"):
        jobs_texts_cities.setdefault(city, []).append(text)

    org = job[13].text
    if not (org is None or org == "None"):
        jobs_texts_orgs.setdefault(org, []).append(text)

In [15]:
# Maps each lemma to the number of times it occurs across the collected postings.
word_count = dict()

In [16]:
# Tally lemma frequencies over every posting that was filed under a city; postings
# without a city are not part of jobs_texts_cities and so are not counted here.
for city, texts in jobs_texts_cities.items():
    for text in texts:
        for word in text:
            word_count[word] = word_count.get(word, 0) + 1

In [17]:
import operator

# (lemma, count) pairs ordered from least to most frequent; the sort is stable, so
# lemmas with equal counts keep their insertion order from word_count.
sorted_words = list(word_count.items())
sorted_words.sort(key=operator.itemgetter(1))

In [18]:
# Most frequent lemmas first.
# The ranking is rebuilt from word_count rather than reversed in place: an in-place
# reverse() flips the order again every time this cell is re-run, silently turning the
# "top words" listing below into the rarest words. Sorting ascending and then taking the
# reversed copy yields exactly the order a single reverse() produced on a fresh run
# (including the order of ties), but the cell is now safe to execute repeatedly.
sorted_words = sorted(word_count.items(), key=lambda item: item[1])[::-1]

In [20]:
# Show the first 200 entries of the ranking on one line, separated by spaces.
for word, _ in sorted_words[:200]:
    print(word, end=" ")

service require customer team time ability skill provide management business include need opportunity employee support requirement information sale work system perform maintain year process state program product ensure care knowledge project client report plan location train duty responsibility high environment manager application save prefer health review education level development benefit qualification patient relate type communication staff professional assist office datum able employer account follow medical quality schedule strong member area and/or post policy status procedure industry disability help site market degree manage department lead equipment technology meet responsible base applicant offer drive standard school individual write engineer associate build solution operation assign people design apply complete career job well store photo candidate hire learn function understand nurse equal license test look technical order like right develop veteran safety activity group 