## Import relevant modules

In [1]:
import os                       # module to interface with the underlying OS
import numpy as np              # linear algebra
import pandas as pd             # dataframe
import re                       # regular expression
import matplotlib.pyplot as plt # data visualization
%matplotlib inline
import toolkits as tk           # user-define module for efficiently reading files

## Get paths and names of files in each path

In [2]:
# # Path and list of jobs in Job Bulletins.
# # NOTE 1: These are raw data
# (raw_path, raw_jobs) = tk.get_raw_jobs() # tk is a user-define module

# Path and list of jobs in JobBulletins_cleaned
# NOTE 2: These are cleaned data
(cleaned_path, cleaned_jobs) = tk.get_cleaned_jobs()

In [3]:
len(cleaned_jobs)

682

In [4]:
# This is a helper function
def drivers_license_req(job):
    '''Returns DRIVERS_LICENSE_REQ(dlr)'''
    # Locate the information
    start = job.index('PROCESS NOTES') 
    temp  = job[start:]

    # Determine whether driver license is required for the job
    temp  = temp.split('\n')
    dlr   = []
    for possibly_contains_dlr in temp:
        if 'driver' in possibly_contains_dlr:
            DL_info = [e for e in possibly_contains_dlr.split('.') if len(e) > 3] # just some number
            for sentence in DL_info:
                if 'may' in sentence:
                    dlr.append('P')
                    break
                else:
                    dlr.append('R')
                    break
            break
    
    # Returns
    return dlr

In [97]:
# This is a helper function
def _get_dl_info(job):
    '''
    Helper function for DRIVERS_LICENSE_REQ and DRIV_LIC_TYPE fields. 
    Should not be used alone.
    '''
    # Locate the information
    start = job.index('PROCESS NOTES') 
    temp  = job[start:]

    # Determine if information about driver license is required for the job
    temp  = temp.split('\n')                                    # Ex: ['PROCESS NOTES', '', etc.]
    dl   = []                                                   # dl=information about driver's license
    for possibly_contains_dl in temp:
        if 'driver' in possibly_contains_dl:
            DL_info = ([e for e in 
                        possibly_contains_dl.split('.')         # Ex: 4., Some positions, etc.
                        if len(e) > 3])                         # split at period and kill itemizers
            for sentence in DL_info:
                if 'may' in sentence:                           # 'may' is a clear indicator for not required
                    dl.append(('P', np.nan))
                    break
                else:                                           # else, DL is required
                    start = possibly_contains_dl.index('^^^') + len('^^^')
                    end   = possibly_contains_dl.index('@@@')
                    info  = possibly_contains_dl[start:end]
                    if len(info) > 2:                           # ' ' has length 1. put 2 just to make sure
                        dl.append(('R', info))                  # Ex: ' A ', already 3 characters
                    else:
                        dl.append(('R', np.nan))
                    break
            break
    
    dl = pd.DataFrame(data=dl, columns=['DRIVERS_LICENSE_REQ', 'DRIV_LIC_TYPE'])
    
    # Returns
    return dl

In [99]:
# Get a job as an example
i = cleaned_jobs.index('APPARATUS OPERATOR 2121 071417 (1).txt')
job_path = cleaned_path + cleaned_jobs[i]
sa  = open(job_path, 'rt').read()

# Call the helper function on a job first
baseline = _get_dl_info(job=sa)
baseline

Unnamed: 0,DRIVERS_LICENSE_REQ,DRIV_LIC_TYPE
0,R,B|out of state commercial comparable


In [35]:
np.nan

nan

In [100]:
def drivers_license_req():
    '''Returns DRIVERS_LICENSE_REQ (dlr)'''
    dlr = list(baseline['DRIVERS_LICENSE_REQ'])
    return dlr

# Test
drivers_license_req()

['R']

In [102]:
def driv_lic_type():
    '''Returns DRIV_LIC_TYPE (dlt)'''
    dlt = list(baseline['DRIV_LIC_TYPE'])
    return dlt

# Test
driv_lic_type()

[' B|out of state commercial comparable ']

In [90]:
sa[sa.index('^^^')+len('^^^'):sa.index('@@@')]

' '

In [70]:
sa.index('^^^')+len('^^^')

2363

In [74]:
sa[2363]

' '

In [69]:
sa.index('@@@')

2364

In [91]:
_get_dl_info(job=sa)

[('R', nan)]

In [92]:
pd.DataFrame(_get_dl_info(job=sa))

Unnamed: 0,0,1
0,R,


In [88]:
pd.DataFrame([(1, 2, None)])

Unnamed: 0,0,1,2
0,1,2,


In [54]:
sa

'MARINE ENVIRONMENTAL MANAGER\n\nClass Code:       9437\nOpen Date:  06-06-14\n(\n\nANNUAL SALARY\n\n$110,371 to 137,139 and $122,690 to $152,444\nCandidates from the eligible list are normally appointed to vacancies in the lower pay grade positions.\nThe salary in the Department of Water and Power is\n\nNOTES:\n\nNOTES\n\nDUTIES\n\nA Marine Environmental Manager directs or assists in directing the activities of the Environmental Management Division of the Harbor Department; directs and manages the development and implementation of specialized environmental policies and programs through a professional environmental staff; and advises management of issues of environmental policy; and does related work.\n\nREQUIREMENTS/MINIMUM QUALIFICATIONS\n\n1. Eighteen months of full-time paid professional experience in a class at least at the level of either Marine Environmental Supervisor or Environmental Supervisor supervising a professional staff performing environmental activities.\n\nPROCESS NO

In [45]:
u = pd.DataFrame([1, None , 2])
u

Unnamed: 0,0
0,1.0
1,
2,2.0


In [46]:
u.isnull().sum()

0    1
dtype: int64

In [81]:
# Second, make sure the word 'REQUIREMENTS/MINIMUM QUALIFICATIONS' can be found in the job postings.
# Do an index() in the try clause, not find(), to catch the errors
k = []
for file_name in cleaned_jobs:
    job_path = cleaned_path + file_name        # define path to file_name
    cleaned_job  = open(job_path, 'rt').read() # read in job as a string
    try:
        print(_get_dl_info(job=cleaned_job))
    except:                                # do some pretty printings here to help our eyes from pain
        ## define some useful variables
        border_line = '##############################################################################################'
        how_many    = int((len(border_line) - len(job_path))/2)
        print(border_line)
        ## do pretty printings
        print('#'*how_many + job_path + '#'*how_many)
        print(border_line)

[]
[('P', None)]
[]
[]
[('P', None)]
[('P', None)]
[('P', None)]
[]
[('R', None)]
[('R', None)]
[]
[]
[('R', None)]
[('P', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[('R', None)]
[('R', None)]
[('R', None)]
[('R', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('R', None)]
[('R', None)]
[('R', None)]
[('R', None)]
[('P', None)]
[('R', ' B|out of state commercial comparable ')]
[]
[('R', None)]
[('R', None)]
[]
[('P', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[('R', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[]
[('P', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('R', None)]
[]
[('R', None)]
[('R', None)]
[('R', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[]
[('P', None)]
[('R', None)]
[('P', None)]
[('R', None)]
[]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('P', None)]
[('R', None)]
[('R', None)]
[]
[('P', None)]
[('R', None)]
[

In [8]:
'Driver' in 'driver right'

False

In [33]:
drivers_license_req(job=sa)

['P']

In [58]:
i = cleaned_jobs.index('AIR CONDITIONING MECHANIC 3774 041417.txt')
job_path = cleaned_path + cleaned_jobs[i]
sa  = open(job_path, 'rt').read()

In [31]:
temp = sa[sa.index('PROCESS NOTES'):]
temp

'PROCESS NOTES\n\n1. In addition to the regular City application form, each applicant is required to complete the Marine Environmental Manager Qualifications Questionnaire at the time of filing. The Marine Environmental Manager Qualifications Questionnaire is located within the Supplemental Question Section of the City application. Applicants who fail to complete the Qualifications Questionnaire will not be considered further in this examination, and their application will not be processed. \n2. Applicants who lack six months or less of the required experience may file for this examination. However, they cannot be appointed until the full experience requirement is met.\n3. For qualifying work experience gained outside of the City of Los Angeles, the term "professional experience" applies to positions that require possession of a degree from a recognized four-year college or university in order to obtain that position. Therefore, to be considered "professional", non-City qualifying expe

In [32]:
temp = temp.split('\n')
temp

['PROCESS NOTES',
 '',
 '1. In addition to the regular City application form, each applicant is required to complete the Marine Environmental Manager Qualifications Questionnaire at the time of filing. The Marine Environmental Manager Qualifications Questionnaire is located within the Supplemental Question Section of the City application. Applicants who fail to complete the Qualifications Questionnaire will not be considered further in this examination, and their application will not be processed. ',
 '2. Applicants who lack six months or less of the required experience may file for this examination. However, they cannot be appointed until the full experience requirement is met.',
 '3. For qualifying work experience gained outside of the City of Los Angeles, the term "professional experience" applies to positions that require possession of a degree from a recognized four-year college or university in order to obtain that position. Therefore, to be considered "professional", non-City qu

In [28]:
for i in temp:
    if 'driver' in i:
        print(i)

6. Some positions may require a valid California driver's license. Some positions require incumbents to perform field audits and to provide their own transportation. Employees in these positions must possess a valid California driver's license. Candidates for such positions may not be eligible for appointment to these positions if their record within the last 36 months reflects three or more moving violations and/or at-fault accidents, or a conviction of a major moving violation such as driving under the influence. In those situations where the employee's private vehicle is to be used, the vehicle shall be properly insured for use in City service upon appointment. Mileage will be paid in accordance with established policies.


In [62]:
# Define a function
def drivers_license_req(job):
    '''Returns DRIVERS_LICENSE_REQ(dlr)'''
    # Locate the information
    temp = job[job.find('PROCESS NOTES'):]

    # Determine whether driver license is required for the job
    # The approach taken here is to look at each sentence based on their index in temp.
    temp = temp.split('\n')
    dlr = []
    for idx in range(len(temp)):
        if 'driver' in temp[idx]:
            splitted_sentence = temp[idx].split()
            if 'required' in splitted_sentence:
                dlr.append('R')
            elif 'require' in splitted_sentence:
                dlr.append('P')
            break
    
    # Returns
    return dlr

# Test
drivers_license_req(job=sa)

['R']

In [63]:
dlr = []
for idx in range(len(temp)):
    if 'driver' in temp[idx]:
        splitted_sentence = temp[idx].split()
        print(splitted_sentence)

['1.', 'A', 'valid', 'California', 'Class', 'B', "driver's", 'license', 'and', 'valid', 'medical', 'certificate', 'approved', 'by', 'the', 'State', 'of', 'California', 'Department', 'of', 'Motor', 'Vehicles', 'are', 'required', 'for', 'all', 'positions', 'either', 'prior', 'to', 'the', 'appointment', 'or', 'prior', 'to', 'the', 'end', 'of', 'probation.', 'Some', 'positions', 'may', 'require', 'the', 'license', 'and', 'certificate', 'prior', 'to', 'the', 'appointment,', 'while', 'other', 'positions', 'may', 'require', 'the', 'license', 'and', 'certificate', 'to', 'be', 'obtained', 'prior', 'to', 'completion', 'of', 'the', 'six-month', 'probation', 'period.', 'Candidates', 'may', 'not', 'be', 'eligible', 'for', 'appointment', 'to', 'these', 'positions', 'if', 'their', 'record', 'within', 'the', 'last', '36', 'months', 'reflects', 'three', 'or', 'more', 'moving', 'violations', 'and/or', 'at-fault', 'accidents,', 'or', 'a', 'conviction', 'of', 'a', 'major', 'moving', 'violation', '(such', 

In [68]:
# Second, make sure the word 'REQUIREMENTS/MINIMUM QUALIFICATIONS' can be found in the job postings.
# Do an index() in the try clause, not find(), to catch the errors
k = []
for file_name in cleaned_jobs:
    job_path = cleaned_path + file_name        # define path to file_name
    cleaned_job  = open(job_path, 'rt').read() # read in job as a string
    try:
        if len(drivers_license_req(job=cleaned_job)) > 0:
            if (drivers_license_req(job=cleaned_job))[0]=='R':
                print(job_path)
    except:                                # do some pretty printings here to help our eyes from pain
        ## define some useful variables
        border_line = '##############################################################################################'
        how_many    = int((len(border_line) - len(job_path))/2)
        print(border_line)
        ## do pretty printings
        print('#'*how_many + job_path + '#'*how_many)
        print(border_line)

CityofLA/JobBulletins_cleaned/ACCOUNTANT 1513 062218.txt
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE ANALYST 1590 060118.txt
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE CLERK 1358 033018 (2).txt
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE HEARING EXAMINER 9135 100915.txt
CityofLA/JobBulletins_cleaned/AIRPORT GUIDE 0845 042018.txt
CityofLA/JobBulletins_cleaned/AIRPORT LABOR RELATIONS ADVOCATE 9210 020119.txt
CityofLA/JobBulletins_cleaned/AIRPORT MANAGER 7260 120216.txt
CityofLA/JobBulletins_cleaned/AIRPORTS MAINTENANCE SUPERINTENDENT 3331 021518.txt
CityofLA/JobBulletins_cleaned/AIRPORTS MAINTENANCE SUPERVISOR 3336 111618.txt
CityofLA/JobBulletins_cleaned/AIRPORTS PUBLIC AND COMMUNITY RELATIONS DIRECTOR 1788 120817.txt
CityofLA/JobBulletins_cleaned/ANIMAL KEEPER 4304 083118.txt
CityofLA/JobBulletins_cleaned/AQUARIUM EDUCATOR 2493 010816.txt
CityofLA/JobBulletins_cleaned/AQUATIC FACILITY MANAGER 2423 052915 REVISED 060915.txt
CityofLA/JobBulletins_cleaned/AQUEDUCT AND RESERVOIR KEEPER

In [36]:
'driver' in temp[2]

True

In [39]:
'require' in temp[2].split()

True

In [40]:
temp[2].split()

['1.',
 'A',
 'valid',
 'California',
 'Class',
 'B',
 "driver's",
 'license',
 'and',
 'valid',
 'medical',
 'certificate',
 'approved',
 'by',
 'the',
 'State',
 'of',
 'California',
 'Department',
 'of',
 'Motor',
 'Vehicles',
 'are',
 'required',
 'for',
 'all',
 'positions',
 'either',
 'prior',
 'to',
 'the',
 'appointment',
 'or',
 'prior',
 'to',
 'the',
 'end',
 'of',
 'probation.',
 'Some',
 'positions',
 'may',
 'require',
 'the',
 'license',
 'and',
 'certificate',
 'prior',
 'to',
 'the',
 'appointment,',
 'while',
 'other',
 'positions',
 'may',
 'require',
 'the',
 'license',
 'and',
 'certificate',
 'to',
 'be',
 'obtained',
 'prior',
 'to',
 'completion',
 'of',
 'the',
 'six-month',
 'probation',
 'period.',
 'Candidates',
 'may',
 'not',
 'be',
 'eligible',
 'for',
 'appointment',
 'to',
 'these',
 'positions',
 'if',
 'their',
 'record',
 'within',
 'the',
 'last',
 '36',
 'months',
 'reflects',
 'three',
 'or',
 'more',
 'moving',
 'violations',
 'and/or',
 'at-fau

In [43]:
'required' in temp[2]

True

In [44]:
# Define a function
def drivers_license_req(job):
    '''Returns DRIVERS_LICENSE_REQ(dlr)'''
    # Locate the information
    temp = sa[sa.find('PROCESS NOTES'):]

    # Determine whether driver license is required for the job
    # The approach taken here is to look at each sentence based on their index in temp.
    temp = temp.split('\n')
    dlr = []
    for idx in range(len(temp)):
        if 'driver' in temp[idx]:
            splitted_sentence = temp[idx].split()
            if 'required' in splitted_sentence:
                dlr.append('R')
            elif 'require' in splitted_sentence:
                dlr.append('P')
            break
    
    # Returns
    return dlr

# Test
drivers_license_req(job=sa)

['R']

In [46]:
# Second, make sure the word 'REQUIREMENTS/MINIMUM QUALIFICATIONS' can be found in the job postings.
# Do an index() in the try clause, not find(), to catch the errors
k = []
for file_name in cleaned_jobs:
    job_path = cleaned_path + file_name        # define path to file_name
    cleaned_job  = open(job_path, 'rt').read() # read in job as a string
    try:
        print(job_path, drivers_license_req(job=cleaned_job))
    except:                                # do some pretty printings here to help our eyes from pain
        ## define some useful variables
        border_line = '##############################################################################################'
        how_many    = int((len(border_line) - len(job_path))/2)
        print(border_line)
        ## do pretty printings
        print('#'*how_many + job_path + '#'*how_many)
        print(border_line)

CityofLA/JobBulletins_cleaned/311 DIRECTOR  9206 041814.txt ['R']
CityofLA/JobBulletins_cleaned/ACCOUNTANT 1513 062218.txt ['R']
CityofLA/JobBulletins_cleaned/ACCOUNTING CLERK 1223 071318.txt ['R']
CityofLA/JobBulletins_cleaned/ACCOUNTING RECORDS SUPERVISOR 1119 072718.txt ['R']
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE ANALYST 1590 060118.txt ['R']
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE CLERK 1358 033018 (2).txt ['R']
CityofLA/JobBulletins_cleaned/ADMINISTRATIVE HEARING EXAMINER 9135 100915.txt ['R']
CityofLA/JobBulletins_cleaned/ADVANCE PRACTICE PROVIDER CORRECTIONAL CARE 2325 020808 REV 111214.txt ['R']
CityofLA/JobBulletins_cleaned/AIR CONDITIONING MECHANIC 3774 041417.txt ['R']
CityofLA/JobBulletins_cleaned/AIR CONDITIONING MECHANIC SUPERVISOR 3781 111618 2.txt ['R']
CityofLA/JobBulletins_cleaned/AIRPORT AIDE 1540 081018.txt ['R']
CityofLA/JobBulletins_cleaned/AIRPORT CHIEF INFORMATION SECURITY OFFICER 1404 120415_Modified.txt ['R']
CityofLA/JobBulletins_cleaned/AIRPORT E

CityofLA/JobBulletins_cleaned/PORT POLICE CAPTAIN 3224 110416.txt ['R']
CityofLA/JobBulletins_cleaned/PORT POLICE LIEUTENANT 3223 120916.txt ['R']
CityofLA/JobBulletins_cleaned/PORT POLICE OFFICER 3221 110906 Rev 060115.txt ['R']
CityofLA/JobBulletins_cleaned/PORT POLICE SERGEANT 3222 121616.txt ['R']
CityofLA/JobBulletins_cleaned/PORTFOLIO MANAGER 9143 082517.txt ['R']
CityofLA/JobBulletins_cleaned/POWER ENGINEERING MANAGER 9453 042817 (1).txt ['R']
CityofLA/JobBulletins_cleaned/POWER SHOVEL OPERATOR 3558 062416.txt ['R']
CityofLA/JobBulletins_cleaned/PRE-PRESS OPERATOR 1481 072817 (4).txt ['R']
CityofLA/JobBulletins_cleaned/PRINCIPAL ACCOUNTANT 1525 121517.txt ['R']
CityofLA/JobBulletins_cleaned/PRINCIPAL ANIMAL KEEPER 4312 070618.txt ['R']
CityofLA/JobBulletins_cleaned/PRINCIPAL CITY PLANNER 7946 030218.txt ['R']
CityofLA/JobBulletins_cleaned/PRINCIPAL CIVIL ENGINEER 9489 022318.txt ['R']
CityofLA/JobBulletins_cleaned/PRINCIPAL CIVIL ENGINEERING DRAFTING TECHNICIAN 7219 110218.txt [

CityofLA/JobBulletins_cleaned/WATER UTILITY WORKER 3912 120817.txt ['R']
CityofLA/JobBulletins_cleaned/WATERSHED RESOURCES SPECIALIST  7862 080516 (1).txt ['R']
CityofLA/JobBulletins_cleaned/WATERWORKS ENGINEER 7248 071516 (1).txt ['R']
CityofLA/JobBulletins_cleaned/WATERWORKS MECHANIC SUPERVISOR 3987 051614 (1).txt ['R']
CityofLA/JobBulletins_cleaned/WELDER 3796 102816.txt ['R']
CityofLA/JobBulletins_cleaned/WELDER SUPERVISOR 3798 120817.txt ['R']
CityofLA/JobBulletins_cleaned/WHARFINGER 1190 092118.txt ['R']
CityofLA/JobBulletins_cleaned/WINDOW CLEANER 3173 032417.txt ['R']
CityofLA/JobBulletins_cleaned/WORKERS_ COMPENSATION ANALYST 1774 032417R.txt ['R']
CityofLA/JobBulletins_cleaned/WORKERS_ COMPENSATION CLAIMS ASSISTANT 1775 041114.txt ['R']
CityofLA/JobBulletins_cleaned/X-RAY AND LABORATORY TECHNICIAN 2358 012916.txt ['R']
CityofLA/JobBulletins_cleaned/ZOO CURATOR 4297 040816.txt ['R']
CityofLA/JobBulletins_cleaned/ZOO CURATOR OF EDUCATION 4300 091418.txt ['R']
CityofLA/JobBullet

In [8]:
# Define a function
def job_duties(job):
    '''Returns JOB_DUTIES (jd)'''
    # Locate the relevant information: from DUTIES to REQUIREMENTS/MINIMUM QUALIFICATIONS
    start = 'DUTIES'; end = 'REQUIREMENTS/MINIMUM QUALIFICATIONS'
    temp  = job[job.index(start):job.index(end)]
    # Split at white space and ignore the 0th element (the word 'DUTIES')
    temp = temp.split()[1:]
    # Join with white space to get the required format
    jd = ' '.join(temp)
    
    # Returns
    return jd

In [9]:
job_duties(cleaned_job)

"A Zoo Registrar assists in the development and implementation of the Zoo's collection management policy and the resulting collection plans. The responsibilities include managing the Zoo's animal records and related policies, validating the quality of recorded data, and ensuring the legal compliance of all animal transactions. The Zoo Registrar will serve as a Zoo's liaison to relevant government agencies, other zoological institutions, and conservation organizations, including the Association of Zoos and Aquariums (AZA)."

In [4]:
# k = []
# for file_name in cleaned_jobs:
#     job_path = cleaned_path + file_name        # define path to file_name
#     cleaned_job  = open(job_path, 'rt').read() # read in job as a string
#     try:
#         print(job_duties(cleaned_job))
#         #print()
#     except:
#         provenir.catch_bad_jobs(job_path)

In [3]:
mtk.jd_print_results(job_path=cleaned_path, job_type=cleaned_jobs)

A 311 Director is responsible for the successful operation and expansion of the 311 Call Center in the Information Technology Agency (ITA) which answers call from constituents regarding Citywide services provided by City departments; works to ensure the efficient and effective resolution of any issues that may arise; plans, directs, hires, coaches, and coordinates a large staff of professional, technical and clerical employees engaged in the implementation, administration, and operations of the City's 311 Call Center; applies sound supervisor principles and techniques in building and maintaining and effective work force; fulfills equal opportunity responsibilities; and does related work.

An Accountant does professional accounting work in the analysis, preparation, maintenance, control, and reconciliation of financial records and reports in accordance with the principles of governmental and public accounting.

An Accounting Clerk performs difficult and responsible clerical accounting w

In [3]:
def jd_get_one(job):
    '''Returns the field JOB_DUTIES (jd) as a string'''
    # Locate the relevant information: from the word DUTIES to the phrase REQUIREMENTS/MINIMUM QUALIFICATIONS
    start = 'DUTIES'; end = 'REQUIREMENTS/MINIMUM QUALIFICATIONS'
    temp  = job[job.index(start):job.index(end)]
    # Split at white space and ignore the 0th element (the word 'DUTIES')
    temp = temp.split()[1:]
    # Join with white space to get the required format
    jd = ' '.join(temp)
    
    # Returns
    return jd

In [4]:
def jd_print_results(job_path, job_type):
    '''
    Prints out results when applying itk.jd_get_one function to job postings.
    job_path can only be either raw_path or cleaned_path.
    job_type can only be either raw_jobs or cleaned_jobs.
    '''
    provenir.print_results(path=job_path, files=job_type, fn=itk.jd_get_one)

In [5]:
def _jd_checkpoint1(print_option=False):
    '''Check if the word DUTIES is in each job'''
    # The idea is that if length of nopass is more than 1,
    # then this checkpoint has not been passed.
    nopass = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        try:
            if 'DUTIES' not in cleaned_job.split():
                nopass.append(job_path)
        except:
            ## If this part is executed, there is problem with itk.jct_get_one
            print('Jobs surrounded by # fail at unit level.' + 
                  'Inspect each of them with itk.jd_get_one()')
            provenir.catch_bad_jobs(job_path)
            return
    
    # Print out which job fails at this checkpoint
    if print_option:
        for e in nopass:
            print(e)
    
    # Returns
    return len(nopass)

In [6]:
def _jd_checkpoint1(print_option=False):
    '''Check if the word DUTIES is in each job'''
    # The idea is that if length of nopass is more than 1,
    # then this checkpoint has not been passed.
    nopass = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        try:
            if 'DUTIES' not in cleaned_job.split():
                nopass.append(job_path)
        except:
            ## If this part is executed, there is problem with itk.jct_get_one
            print('Jobs surrounded by # fail at unit level.' + 
                  'Inspect each of them with itk.jd_get_one()')
            provenir.catch_bad_jobs(job_path)
            return
    
    # Print out which job fails at this checkpoint
    if print_option:
        for e in nopass:
            print(e)
    
    # Returns
    return len(nopass)

In [7]:
def _jd_checkpoint2(print_option=False):
    '''Check if the phrase REQUIREMENTS/MINIMUM QUALIFICATIONS is in each job'''
    # The idea is that if length of nopass is more than 1,
    # then this checkpoint has not been passed.
    nopass = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        try:
            if 'REQUIREMENTS/MINIMUM QUALIFICATIONS' not in cleaned_job.split('\n'):
                nopass.append(job_path)
        except:
            ## If this part is executed, there is problem with itk.jct_get_one
            print('Jobs surrounded by # fail at unit level.' + 
                  'Inspect each of them with itk.jd_get_one()')
            provenir.catch_bad_jobs(job_path)
            return
    
    # Print out which job fails at this checkpoint
    if print_option:
        for e in nopass:
            print(e)
    
    # Returns
    return len(nopass)

In [11]:
def _jd_checkpoint3(print_option=False):
    '''Checks if no other sections exist in jd, e.g. NOTES:, NOTE, SPECIAL, VACANCY'''
    # The idea is that if length of nopass is more than 1,
    # then this checkpoint has not been passed.
    nopass = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        try:
            jd = itk.jd_get_one(cleaned_job)
            if ('NOTE' in jd) or ('SPECIAL' in jd) or ('VACANCY' in jd):
                nopass.append(job_path)
        except:
            ## If this part is executed, there is problem with itk.jct_get_one
            print('Jobs surrounded by # fail at unit level.' + 
                  'Inspect each of them with itk.jd_get_one()')
            provenir.catch_bad_jobs(job_path)
            return
    
    # Print out which job fails at this checkpoint
    if print_option:
        for e in nopass:
            print(e)
    
    # Returns
    return len(nopass)

In [8]:
mtk._jd_checkpoint1()

0

In [9]:
mtk._jd_checkpoint2()

0

In [12]:
_jd_checkpoint3()

0

In [17]:
j = provenir.spotlight(job_name='311 DIRECTOR  9206 041814.txt', job_path=mtk.cleaned_path, job_type=mtk.cleaned_jobs)
j

'311 DIRECTOR\nClass Code:       9206\nOpen Date:  04-18-14\n(\nANNUAL SALARY \n\n$125,175 to $155,514\nThe salary in the Department of Water and Power is\n\nNOTES:\n\nNOTES\n\nDUTIES\n\nA 311 Director is responsible for the successful operation and expansion of the 311 Call Center in the Information Technology Agency (ITA) which answers call from constituents regarding Citywide services provided by City departments; works to ensure the efficient and effective resolution of any issues that may arise; plans, directs, hires, coaches, and coordinates a large staff of professional, technical and clerical employees engaged in the implementation, administration, and operations of the City\'s 311 Call Center; applies sound supervisor principles and techniques in building and maintaining and effective work force; fulfills equal opportunity responsibilities; and does related work.\n\n\nREQUIREMENTS/MINIMUM QUALIFICATIONS\n\n1. One year of full-time paid experience as a Senior Management Analyst

In [18]:
itk.jd_get_one(j)

"A 311 Director is responsible for the successful operation and expansion of the 311 Call Center in the Information Technology Agency (ITA) which answers call from constituents regarding Citywide services provided by City departments; works to ensure the efficient and effective resolution of any issues that may arise; plans, directs, hires, coaches, and coordinates a large staff of professional, technical and clerical employees engaged in the implementation, administration, and operations of the City's 311 Call Center; applies sound supervisor principles and techniques in building and maintaining and effective work force; fulfills equal opportunity responsibilities; and does related work."

In [22]:
def jd_get_many():
    '''Returns JOB_DUTIES field for all jobs as a dataframe'''
    # Make sure checkpoints have been passed
    assert _jd_checkpoint1()==1 # hey
    assert _jd_checkpoint2()==0
    assert _jd_checkpoint3()==0
    
    # Return an n-by-1 dataframe for job title
    r = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        jd          = itk.jd_get_one(cleaned_job)
        r.append(jd)
        
    df = pd.DataFrame(data=r, columns=['JOB_DUTIES'])
    return df

In [1]:
import provenir
import individual_toolkit as itk
import multiple_toolkit as mtk
import pandas as pd

In [2]:
# Get path and list of jobs in Job Bulletins.
# These are raw data
(raw_path, raw_jobs) = provenir.get_raw_jobs()

# Get path and list of jobs in JobBulletins_cleaned
# These are cleaned data
(cleaned_path, cleaned_jobs) = provenir.get_cleaned_jobs()

In [3]:
u1 = mtk.jct_get_many()
u2 = mtk.jcn_get_many()
u3 = mtk.jd_get_many()
u4 = mtk.od_get_many()

In [8]:
# k = pd.concat([u1,u2,u3, u4], axis=1)
# k

In [20]:
def dlr_print_results(job_path, job_type):
    '''
    Prints out results when applying itk.dlr_get_one function to job postings.
    job_path can only be either raw_path or cleaned_path.
    job_type can only be either raw_jobs or cleaned_jobs.
    '''
    provenir.print_results(path=job_path, files=job_type, fn=itk.dlr_get_one)

In [21]:
# dlr_print_results(job_path=raw_path, job_type=raw_jobs)

In [22]:
def _dlr_checkpoint1():
    ''''''
    txt = ('Manually check DRIVERS_LICENSE_REQ field to ensure accuracy. ' + 
           'Enter Yes to confirm: ')
    user_input = input(txt)
    
    # Returns
    if user_input.lower()[0]=='y':
        return 0
    else:
        return 1

In [26]:
def dlr_get_many():
    ''''''
    assert _dlr_checkpoint1()==0
    
    

In [27]:
dlr_get_many()

Manually check DRIVERS_LICENSE_REQ field to ensure accuracy. Enter Yes to confirm: y


In [31]:
def dlr_get_many():
    '''Returns DRIVERS_LICENSE_REQ field for all jobs as a dataframe'''
    # Make sure checkpoints have been passed
    assert _dlr_checkpoint1()==0
    
    # Return an n-by-1 dataframe for job title
    r = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        dlr         = itk.dlr_get_one(cleaned_job)
        r.append(dlr)
        
    df = pd.DataFrame(data=r, columns=['DRIVERS_LICENSE_REQ'])
    return df

In [44]:
r = []
for file_name in cleaned_jobs:
    try:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        dlr         = itk.dlr_get_one(cleaned_job)
        r.append(dlr)
    except:
        print(job_path)
        break

CityofLA/JobBulletins_cleaned/ACCOUNTANT 1513 062218.txt


In [46]:
dlr = itk.dlr_get_one(j)

AssertionError: 2 columns passed, passed data had 1 columns

In [47]:
itk._get_dl_info(j)

AssertionError: 2 columns passed, passed data had 1 columns

In [32]:
dlr_get_many()

Manually check DRIVERS_LICENSE_REQ field to ensure accuracy. Enter Yes to confirm: y


AssertionError: 2 columns passed, passed data had 1 columns

In [None]:
def _dlr_checkpoint2(print_option=False):
    ''''''
    # The idea is that if length of nopass is more than 1,
    # then this checkpoint has not been passed.
    nopass = []
    for file_name in cleaned_jobs:
        job_path    = cleaned_path + file_name
        cleaned_job = open(job_path, 'rt').read()
        try:
            info = itk.dlr_get_one(cleaned_job)
            if len(info) > 0:
                if info[0]=='R':
                    if '^^^' not 
        except:
            ## If this part is executed, there is problem with itk.jct_get_one
            print('Jobs surrounded by # fail at unit level.' + 
                  'Inspect each of them with itk.jd_get_one()')
            provenir.catch_bad_jobs(job_path)
            return
    
    # Print out which job fails at this checkpoint
    if print_option:
        for e in nopass:
            print(e)
    
    # Returns
    return len(nopass)

In [None]:
# Second, make sure the word 'REQUIREMENTS/MINIMUM QUALIFICATIONS' can be found in the job postings.
# Do an index() in the try clause, not find(), to catch the errors
for file_name in cleaned_jobs:
    job_path = cleaned_path + file_name        # define path to file_name
    cleaned_job  = open(job_path, 'rt').read() # read in job as a string
    try:
        if len(drivers_license_req(job=cleaned_job)) > 0:
            if (drivers_license_req(job=cleaned_job))[0]=='R':
                print(job_path)
    except:                                # do some pretty printings here to help our eyes from pain
        ## define some useful variables
        border_line = '##############################################################################################'
        how_many    = int((len(border_line) - len(job_path))/2)
        print(border_line)
        ## do pretty printings
        print('#'*how_many + job_path + '#'*how_many)
        print(border_line)

In [51]:
x = input('enter something:')
if x.lower()[0]=='y':
    print('I agree')

enter something:yefsda 
I agree


In [93]:
def _get_dl_info(job):
    '''
    Helper function for DRIVERS_LICENSE_REQ and DRIV_LIC_TYPE fields. Not intended for single use.
    '''
    # Locate the information
    start = job.index('PROCESS NOTES') 
    temp  = job[start:]

    # Find driver license and its type
    temp  = temp.split('\n')                                               # Ex: ['PROCESS NOTES', '', etc.]
    dl    = []                                                              # dl=information about driver's license
    for possibly_contains_dl in temp:
        if 'driver' in possibly_contains_dl:
            DL_info = ([e for e in 
                        possibly_contains_dl.split('.')                    # Ex: 4., Some positions..., etc.
                        if len(e) > 3])                                    # split at period and kill itemizers
            for sentence in DL_info:
                if 'may' in sentence:                                      # 'may' is a clear indicator for not required
                    dl.append(('P',''))
                    break
                else:                                                      # else, DL is required
                    dl.append(('R', sentence))
                    break
            break
    
    # Returns
    dl = dict(zip(('DRIVERS_LICENSE_REQ','DRIV_LIC_TYPE'), dl[0]))         # zip tuple and tuple, while dl is a list
    return dl

In [57]:
j = provenir.spotlight(job_name='ACCOUNTANT 1513 062218.txt', job_path=mtk.cleaned_path, job_type=mtk.cleaned_jobs)
j

'ACCOUNTANT\n\nClass Code:       1513\nOpen Date:  06-22-18\n(Exam Open to All, including Current City Employees)\n\nANNUAL SALARY\n\n$49,903 to $72,996 and $55,019 to $80,472\nThe salary in the Department of Water and Power is\n\nNOTES:\n\n1. Candidates from the eligible list are normally appointed to vacancies in the lower pay grade positions.\n2. For information regarding reciprocity between the City of Los Angeles departments and LADWP, go to http://per.lacity.org/Reciprocity_CityDepts_and_DWP.pdf.\n3. Annual salary is at the start of the pay range. The current salary range is subject to change. Please confirm the starting salary with the hiring department before accepting a job offer.\n\nDUTIES\n\nAn Accountant does professional accounting work in the analysis, preparation, maintenance, control, and reconciliation of financial records and reports in accordance with the principles of governmental and public accounting.\n\nREQUIREMENTS/MINIMUM QUALIFICATIONS\n\n1. Graduation from an

In [54]:
pd.DataFrame([('P',)])

Unnamed: 0,0
0,P


In [55]:
'^^^' in ' '

False

In [60]:
' '.join([])

''

In [95]:
k = _get_dl_info(j)
k

{'DRIVERS_LICENSE_REQ': 'P', 'DRIV_LIC_TYPE': ''}

In [81]:
k.shape

(1, 2)

In [70]:
dl

{'DRIVERS_LICENSE_REQ': 1, 'DRIV_LIC_TYPE': 2}

In [7]:
_get_dl_info(j)

Unnamed: 0,DRIVERS_LICENSE_REQ,DRIV_LIC_TYPE
0,R,A valid California driver's license is requir...


In [61]:
u = dict(zip(('DRIVERS_LICENSE_REQ','DRIV_LIC_TYPE'), (['what', ''])))
u

{'DRIVERS_LICENSE_REQ': 'what', 'DRIV_LIC_TYPE': ''}

ValueError: If using all scalar values, you must pass an index

In [None]:
def _get_dl_info(job):
    '''
    Helper function for DRIVERS_LICENSE_REQ and DRIV_LIC_TYPE fields. Not intended for single use.
    '''
    # Locate the information
    start = job.index('PROCESS NOTES') 
    temp  = job[start:]

    # Find driver license and its type
    temp  = temp.split('\n')                                               # Ex: ['PROCESS NOTES', '', etc.]
    dl   = []                                                              # dl=information about driver's license
    for possibly_contains_dl in temp:
        if 'driver' in possibly_contains_dl:
            DL_info = ([e for e in 
                        possibly_contains_dl.split('.')                    # Ex: 4., Some positions..., etc.
                        if len(e) > 3])                                    # split at period and kill itemizers
            for sentence in DL_info:
                if 'may' in sentence:                                      # 'may' is a clear indicator for not required
                    dl.append(('P',))
                    break
                else:                                                      # else, DL is required
                    start = possibly_contains_dl.index('^^^') + len('^^^')
                    end   = possibly_contains_dl.index('@@@')
                    info  = possibly_contains_dl[start:end]
                    dl.append(('R', info))
                    break
            break
    
    # Returns
    dl = pd.DataFrame(data=dl, columns=['DRIVERS_LICENSE_REQ', 
                                        'DRIV_LIC_TYPE'])                  # returns as a dataframe
    return dl