# Refine other functions that use `_base`

In [1]:
# Import relevant modules
import os           # interface with the underlying OS
import numpy as np  # linear algebra
import pandas as pd # dataframe
import re           # regular expression

In [2]:
# `os` gives access to the file system of the machine Python is running on
import os

# Folder that holds the job bulletins
path = 'CityofLA/Job Bulletins/'

# Collect every entry of that folder. os.listdir returns the names in arbitrary order,
# so sort them alphabetically; sorted() builds a new list rather than sorting in place.
all_txt_files = sorted(os.listdir(path))

# Note2self: Some people use os.walk which goes through every root, and their directories, to list all files.
# We probably don't need it for now

In [3]:
# Sanity checks on the directory listing
print(len(all_txt_files))      # number of entries; should be 683 as manually verified
print(len(set(all_txt_files))) # number of distinct names; equal to the line above when every name is unique
print('SYSTEMS ANALYST 1596 102717.txt' in all_txt_files) # should be True

683
683
True


In [4]:
# Find the position of `SYSTEMS ANALYST 1596 102717.txt` in the sorted listing.
# list.index returns the first match, which is unambiguous because the check above shows every name is unique.
print(all_txt_files.index('SYSTEMS ANALYST 1596 102717.txt'))

609


## Import this text file as a string.
`sa` stands for systems analyst

In [5]:
# Build the path from the file name rather than from its position in the listing (609),
# so the cell keeps reading the right bulletin if files are added to or removed from the folder.
sa_path = path + 'SYSTEMS ANALYST 1596 102717.txt'
# The context manager closes the file handle as soon as the text has been read
with open(sa_path, 'rt') as bulletin:
    sa = bulletin.read()
sa # display

'SYSTEMS ANALYST\n\nClass Code: 1596\nOpen Date: 10-27-17\n(Exam Open to All, including Current City Employees)\n\nANNUAL SALARY\n\n$68,611 to $100,307\nThe salary in the Department of Water and Power is $70,908 to $88,092 and $83,770 to $104,065.\n\nNOTES:\n\n1. Candidates from the eligible list are normally appointed to vacancies in the lower pay grade positions.  \n2. Annual salary is at the start of the pay range. The current salary range is subject to change. Please confirm the starting salary with the hiring department before accepting a job offer.\n3. For information regarding reciprocity between the City of Los Angeles departments and LADWP, go to http://per.lacity.org/Reciprocity_CityDepts_and_DWP.pdf.\n\n\nDUTIES\n\nA Systems Analyst analyzes procedures, methods and operations of computer-based information systems; designs, implements, and recommends information systems to improve the efficiency and economy of City operations; performs cost benefit and feasibility analyses re

In [6]:
# Define a helper function
def _base(job, use_df=True):
    '''
    _BASE attempts to parse the structure in REQUIREMENTS/MINIMUM QUALIFICATIONS. 
    It takes in a job and returns a dataframe of information, which supports other functions.
    
    It is intended to use inside other functions (hence started with an underscore), not for stand-alone use.
    '''
    # Each hash # below is the code in one of the cell above. For output/logic, see above cells.
    
    # From REQUIREMENTS/MINIMUM QUALIFICATIONS to PROCESS NOTES is where the information located
    temp = job[job.find('REQUIREMENTS/MINIMUM QUALIFICATIONS'):job.find('PROCESS NOTES')]
    
    # Get the digits that mark one part from the other
    digits_as_str = [sentence[-1] for sentence in temp.split('.') if sentence[-1].isdigit()]
    
    # Separate requirements one from the other.
    line_item = [] # borrow from accounting term. means items in a list
    ## Loop through idx of digits_as_str to get the text from one digit to the other
    for idx in range(len(digits_as_str)-1):
        start = digits_as_str[idx]  ; pos1 = temp.index(start)
        end   = digits_as_str[idx+1]; pos2 = temp.index(end)
        item  = temp[pos1:pos2]     ; line_item.append(item)
    ## Put the last part into the list of all possible requirements
    final_dig    = digits_as_str[-1]; final_pos = temp.index(final_dig)
    final_item = temp[final_pos:] ; line_item.append(final_item)

    # Split further by newline characters
    line_item = [[line for line in item.split('\n') if len(line) > 1] for item in line_item]
    
    # Simplify line_item by throwing away anything after the punctuation 
    # of the main sentence, if the main sentence endswith a punctuation
    for aList in line_item:
        for item in list(aList):
            if len(item) > 5:
                if item.split()[0][1] != '.':
                    aList.remove(item)
    
    # Implementation of the algorithm
    requirement_list = []
    for aList in line_item:
        # First item in the list aList may be a requirement
        # requirement_id is the first element in the string
        # requirement_txt is anything after, for example, 1., 2., 3., etc. (2 characters, that's why 2:)
        main_information = aList[0]
        requirement_id   = main_information[0]; requirement_txt = main_information[2:main_information.find(';')]
        # Step 1
        if len(aList) == 1:
            ## Step 1a: Handle case "1. and 2.", "1. or 2."
            if main_information.endswith('and'):
                ### Find next item's requirement id and the text associated with it
                next_list_idx   = line_item.index(aList)+1; next_list     = line_item[next_list_idx]
                next_info       = next_list[0]            ; next_info_txt = next_info[2:next_info.find(';')]
                ### Form a nested requirement and put into requirement list
                nested_reqmnt   = (requirement_id, requirement_txt, next_digit, next_req_txt)
                requirement_list.append(nested_reqmnt)
            ## Step 1b: Handle case "1. or 2."
            else:
                requirement = (requirement_id, requirement_txt, 'A', None)
                requirement_list.append(requirement)
        # Step 2
        elif len(aList) > 1:
            # Step 2a: Handle case "1. a. or b."
            if all([sub_item.endswith('or') for sub_item in aList[1:-1]]):
                all_sub_requirements = aList[1:]
                for sub_requirement in all_sub_requirements:
                    requirement_subset_id  = sub_requirement[0].upper()
                    requirement_subset_txt = sub_requirement[2:sub_requirement.find(';')]
                    nested_reqmnt = (requirement_id, requirement_txt, requirement_subset_id, requirement_subset_txt)
                    requirement_list.append(nested_reqmnt)
            # Step 2b: Handle case "3. a. and b."
            else:
                nested_reqmnt = (requirement_id, requirement_txt, 'A', None, aList[1:])
                requirement_list.append(nested_reqmnt)

    # Normalize each element in requirement list to length 5
    requirement_list = [tup+(None, )*(5-len(tup)) for tup in requirement_list]

    # Returns
    df = pd.DataFrame(data=requirement_list, columns=['REQUIREMENT_SET_ID',
                                                      'requirement_txt',
                                                      'REQUIREMENT_SUBSET_ID',
                                                      'requirement_subset_txt',
                                                      'mis'])
    return df

# Test: parse the Systems Analyst bulletin. The result is stored in the module-level
# name `baseline`, which the functions defined in the following cells read.
baseline = _base(job=sa)
print(baseline.shape)
baseline

(5, 5)


Unnamed: 0,REQUIREMENT_SET_ID,requirement_txt,REQUIREMENT_SUBSET_ID,requirement_subset_txt,mis
0,1,Graduation from an accredited four-year colle...,A,,
1,2,Graduation from an accredited four-year colle...,A,"the development, analysis, implementation or ...",
2,2,Graduation from an accredited four-year colle...,B,"performing cost benefit, feasibility and requ...",
3,2,Graduation from an accredited four-year colle...,C,performing system implementation and support ...,
4,3,Two years of full-time paid experience as a S...,A,,"[a. Satisfactory completion of four courses, o..."


### Get REQUIREMENT_SET_ID

In [7]:
# Define a function
def requirement_set_id(base=None):
    '''
    Returns REQUIREMENT_SET_ID (rsid) as a list of numbers, one per row.

    base: dataframe with a 'REQUIREMENT_SET_ID' column. Defaults to the module-level
          `baseline`, so existing calls without arguments behave as before.
    '''
    if base is None:
        base = baseline
    # Need to be int type per requirement, so convert the id strings to numbers
    rsid = list(pd.to_numeric(base['REQUIREMENT_SET_ID']))

    return rsid

# Test: for the Systems Analyst bulletin this displays [1, 2, 2, 2, 3]
requirement_set_id()

[1, 2, 2, 2, 3]

### Get REQUIREMENT_SUBSET_ID

In [8]:
# Define a function
def requirement_subset_id(base=None):
    '''
    Returns REQUIREMENT_SUBSET_ID (rsubsetid) as a list, one entry per row.

    base: dataframe with a 'REQUIREMENT_SUBSET_ID' column. Defaults to the module-level
          `baseline`, so existing calls without arguments behave as before.
    '''
    if base is None:
        base = baseline
    rsubsetid = list(base['REQUIREMENT_SUBSET_ID'])

    return rsubsetid

# Test: for the Systems Analyst bulletin this displays ['A', 'A', 'B', 'C', 'A']
requirement_subset_id()

['A', 'A', 'B', 'C', 'A']

### Get EDUCATION_YEARS

In [9]:
# Build a (word, number) dictionary mapping lower-case English number words to their value.
# Only the words listed here are recognised by the functions that look numbers up
# (one through ten, plus eighteen).
number_dict = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 
               'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
               'eighteen': 18}

In [55]:
# Define a function
def education_years(base=None, numbers=None):
    '''
    Returns EDUCATION_YEARS (ey): exactly one entry per row of the requirement dataframe.

    Each requirement text is scanned word by word and the first word that states a length of
    education decides the entry:
      * "associate's" -> 2 and "bachelor's" -> 4
      * "<number>-year"  (Ex: 'four-year') -> the number
      * "<number>-month" (Ex: 'six-month') -> the number divided by 12
    The entry is None when no such word is found, which keeps the list aligned with the rows.

    base:    dataframe with a 'requirement_txt' column; defaults to the module-level `baseline`.
    numbers: (word, number) dictionary; defaults to the module-level `number_dict`.
    '''
    if base is None:
        base = baseline
    if numbers is None:
        numbers = number_dict
    degree_years = {"associate's": 2, "bachelor's": 4}

    ey = []                                               # ey = EDUCATION_YEARS
    for possibly_contains_ey in base['requirement_txt']:
        years = None
        if isinstance(possibly_contains_ey, str):         # missing text simply yields None
            for raw_word in possibly_contains_ey.lower().split():
                word = raw_word.strip('.,;:()')           # Ex: 'four-year,' -> 'four-year'
                if word in degree_years:
                    years = degree_years[word]
                elif '-' in word:                         # Ex: 'four-year', 'full-time'
                    count, unit = word.split('-')[:2]     # Ex: ('four', 'year'), ('full', 'time')
                    if count in numbers:
                        if unit == 'year':
                            years = numbers[count]
                        elif unit == 'month':
                            years = numbers[count] / 12   # 6 months = 0.5 year
                if years is not None:
                    break                                 # first match wins: one value per row
        ey.append(years)
    # Returns
    return ey

# Test: for the Systems Analyst bulletin this displays [4, 4, 4, 4, None]
education_years()

[4, 4, 4, 4, None]

### Get SCHOOL_TYPE

In [57]:
# Build a list of school types. Entries are single lower-case words because they are
# compared against the individual lower-cased words of a requirement.
types_of_school = ['college', 'university', 'high', 'trade', 'apprenticeship', 'certificate', 'law']

In [75]:
# Define a function
def school_type(job, base=None, school_types=None):
    '''
    Returns SCHOOL_TYPE (st): exactly one entry per row of the requirement dataframe.

    For a requirement that contains the word 'graduation', the entry lists the school types
    found in it, title-cased, in order of first appearance and joined with ' OR '
    (Ex: 'College OR University'). The entry is None when the requirement does not mention
    graduation or names no known school type.

    job:          kept for interface compatibility; it is not used.
    base:         dataframe with a 'requirement_txt' column; defaults to the module-level `baseline`.
    school_types: recognised lower-case school words; defaults to the module-level `types_of_school`.
    '''
    if base is None:
        base = baseline
    if school_types is None:
        school_types = types_of_school

    st = []                                                  # st = school type
    for possibly_contains_st in base['requirement_txt']:
        if not isinstance(possibly_contains_st, str):        # missing text simply yields None
            st.append(None)
            continue
        # normalize the string; strip punctuation so that 'university,' still matches
        words = [word.strip('.,;:()') for word in possibly_contains_st.lower().split()]
        if 'graduation' not in words:
            st.append(None)
            continue
        found = []                                           # Ex: ['College', 'University']
        for word in words:
            label = word.title()
            if word in school_types and label not in found:  # report each school type once
                found.append(label)
        st.append(' OR '.join(found) if found else None)     # Ex: 'College OR University'
    # Returns
    return st

# Test: for the Systems Analyst bulletin this displays 'College OR University' for the
# first four rows and None for the last one
school_type(job=sa)

['College OR University',
 'College OR University',
 'College OR University',
 'College OR University',
 None]

### Get EDUCATION_MAJOR
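The idea is to look up every major from a hand-built list in each requirement sentence and keep the ones that appear.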

In [None]:
# Build a list of majors. To be expanded later.
# Entries are lower-case; the list still contains duplicates, so consumers should
# de-duplicate their matches.
majors_byhand = ['computer science', 'information systems', 'geographical information systems','fine arts', 
                 'performing arts', 'art history', 'architectural history', 'art education', 'journalism', 'english', 
                 'public relations', 'communications', 'natural science', 'education', 'architecture', 
                 'architectural engineering', 'environmental design', 'building science', 'accounting', 
                 'media management', 'television/film production', 'business administration', 'information systems',
                 'civil, structural', 'traffic engineering', 'environmental studies', 'planning', 'transportation',
                 'urban planning', 'urban studies', 'urban design', 'landscape architecture', 'geography', 
                 'engineering', 'crime and intelligence analysis', 'physical or natural science', 'geology',
                 'emergency management', 'homeland security', 'public health', 'public administration', 
                 'engineering geology', 'environmental', 'ecological', 'biological', 'chemical', 'atmospheric', 
                 'environmental health', 'earth science', 'geography', 'ecological geography', 
                 'geology', 'oceanography', 'environmental policy', 'sustainability', 'environmental planning', 
                 'environmental engineering', 'urban planning and design', 'landscape architecture', 'finance', 
                 'economics', 'business administration', 'finance', 'accounting', 'economics', 'mathematics', 
                 'business administration', 'business',
                 # The entries below were pasted unquoted, which made the cell a syntax error
                 'industrial hygiene', 'environmental & occupational health', 'occupational safety & health',
                 'environmental health & safety', 'safety engineering',
                ]

In [76]:
# Scratch check: str.find matches substrings, so the shorter major 'information systems'
# is also found inside the longer 'geographical information systems' (at index 13)
'geographical information systems'.find('information systems')

13

In [81]:
# Scratch check: a multi-word phrase is found wherever it starts inside the text (index 7 here)
'design computer engineering'.find('computer engineering')

7

In [82]:
# Scratch check: the slice [2:-1] drops the first two characters and the last one ('ll')
'hello'[2:-1]

'll'

In [None]:
# Get major by looking it up.
# NOTE(review): `r` and `majors` are not defined in the visible part of this notebook;
# they must exist before this cell is run.

## Extract a list of related sentences
temp = [row[1] for row in r]
## Main code for education majors: exactly one entry per sentence
em = []                                                  # initialization: em = education majors
for possibly_contains_em in temp:                        # loop through each sentence in temp
    if possibly_contains_em is None:                     # we have None in our data: no text, no major
        em.append(None)                                  # keep em aligned with temp
        continue
    # every major of the list that occurs in the sentence. Ex: 'Computer Science'
    all_type_of_majors = [major.upper() for major in majors if major in possibly_contains_em]
    # join as required, or None when no major was found
    em.append('|'.join(all_type_of_majors) if all_type_of_majors else None)

em

In [None]:
# Define a function
def education_majors(job, majors=None):
    '''
    Returns EDUCATION_MAJORS (em): exactly one entry per row returned by _base(job).

    Each entry is the upper-cased majors found in the requirement text, joined with '|',
    or None when no major is found. Matching ignores case and each major is reported once.

    job:    full text of one job bulletin.
    majors: iterable of major names to look for; defaults to the module-level `majors_byhand`.
    '''
    if majors is None:
        majors = majors_byhand
    # Invoke _base function to obtain raw information.
    # The related sentences are in the 'requirement_txt' column.
    temp = list(_base(job)['requirement_txt'])

    em = []                                                  # initialization: em = education majors
    for possibly_contains_em in temp:                        # loop through each sentence in temp
        if not isinstance(possibly_contains_em, str):        # no text, so no major
            em.append(None)
            continue
        sentence = possibly_contains_em.lower()
        all_type_of_majors = []                              # a container that holds everything found
        for major in majors:                                 # Ex: 'computer science'
            label = major.upper()
            if major.lower() in sentence and label not in all_type_of_majors:
                all_type_of_majors.append(label)
        # join as required, or None when no major was found
        em.append('|'.join(all_type_of_majors) if all_type_of_majors else None)

    # Returns
    return em

# Test on the Systems Analyst bulletin (`sa`)
education_majors(job=sa)

### Get EXPERIENCE_LENGTH
Idea: Build a dictionary such as
`{'one': 1, 'two': 2, ...}` and look up each word of a sentence in it, so this shouldn't be too hard.

In [None]:
# Get length of having experience by looking it up. The idea is that they have the format two years (space between)
# NOTE(review): `r` is not defined in the visible part of this notebook; it must exist before this cell is run.

## Extract a list of related sentences
temp = [row[1] for row in r]
## Main code for experience length: exactly one entry per sentence
el = []                                                       # initialization: el=experience_length
for possibly_contains_el in temp:                             # for each sentence in temp
    possibly_contains_el_lower = possibly_contains_el.lower() # lower case a word. So 'Two' becomes 'two'
    # value of the first number word of the sentence (Ex: 'one', 'two'), or None when there is none
    count = next((number_dict[word] for word in possibly_contains_el_lower.split()
                  if word in number_dict), None)
    if count is None:
        el.append(None)                                       # no key in number_dict can be found
    elif 'year' in possibly_contains_el_lower:                # if 'year' is found,...
        el.append(count)                                      # put in el
    elif 'month' in possibly_contains_el_lower:
        el.append(count/12)                                   # else, convert to fractions of year
    else:
        el.append(None)                                       # a number without a time unit is not a length

el

In [None]:
# Define a function
def experience_length(job):
    '''
    Returns EXPERIENCE_LENGTH (el) in years: exactly one entry per row returned by _base(job).

    For each requirement text, the first word found in `number_dict` gives the count. The
    entry is that count when the text contains 'year', the count divided by 12 when it
    contains 'month' instead, and None when there is no number word or no time unit.
    '''
    # Invoke _base function to obtain raw information.
    # The related sentences are in the 'requirement_txt' column.
    temp = list(_base(job)['requirement_txt'])

    el = []                                                       # initialization: el=experience_length
    for possibly_contains_el in temp:                             # for each sentence in temp
        length = None                                             # stays None unless a length is found
        if isinstance(possibly_contains_el, str):
            sentence = possibly_contains_el.lower()               # So 'Two' becomes 'two'
            # value of the first number word (Ex: 'one', 'two'), or None when there is none
            count = next((number_dict[word] for word in sentence.split() if word in number_dict), None)
            if count is not None:
                if 'year' in sentence:                            # if 'year' is found,...
                    length = count
                elif 'month' in sentence:
                    length = count / 12                           # convert to fractions of year
        el.append(length)                                         # one entry per row keeps el aligned

    # Returns
    return el

# Test on the Systems Analyst bulletin (`sa`)
experience_length(job=sa)

### Get FULL_TIME_PART_TIME
Doesn't "annual" mean full time?

<font color=red> NOOOOO!!! It's the required experience.</font>

In [None]:
# NOTE(review): `r` is not defined in the visible part of this notebook; it must exist before this cell is run.
# Extract a list of related sentences
temp = [row[1] for row in r]
# Get full_time/part_time: exactly one entry per sentence
ftpt = []                                                         # initialization: ftpt=FULL_TIME_PART_TIME
for possibly_contains_ftpt in temp:                               # for each sentence in temp
    possibly_contains_ftpt_lower = possibly_contains_ftpt.lower() # lower case to make sure things are right
    if 'experience' not in possibly_contains_ftpt_lower:          # only look for ones that have the word 'experience'
        ftpt.append(None)                                         # No experience required
    elif 'full-time' in possibly_contains_ftpt_lower:             # if 'full-time' found...
        ftpt.append('full-time'.upper())                          # append to ftpt
    elif 'part-time' in possibly_contains_ftpt_lower:
        ftpt.append('part-time'.upper())                          # elif 'part-time', append to ftpt
    else:
        ftpt.append(None)                                         # experience required, but neither term is stated
    
ftpt

In [None]:
# Define a function
def full_time_part_time(job):
    '''
    Returns FULL_TIME_PART_TIME (ftpt): exactly one entry per row returned by _base(job).

    The entry is 'FULL-TIME' or 'PART-TIME' when the requirement text mentions 'experience'
    together with that term ('full-time' is checked first). It is None when the text does not
    mention experience, or mentions it without either term.
    '''
    # Invoke _base function to obtain raw information.
    # The related sentences are in the 'requirement_txt' column.
    temp = list(_base(job)['requirement_txt'])

    ftpt = []                                                         # initialization: ftpt=FULL_TIME_PART_TIME
    for possibly_contains_ftpt in temp:                               # for each sentence in temp
        kind = None                                                   # stays None unless a term is found
        if isinstance(possibly_contains_ftpt, str):
            sentence = possibly_contains_ftpt.lower()                 # lower case to make sure things are right
            if 'experience' in sentence:                              # only search sentences about experience
                if 'full-time' in sentence:
                    kind = 'FULL-TIME'
                elif 'part-time' in sentence:
                    kind = 'PART-TIME'
        ftpt.append(kind)                                             # one entry per row keeps ftpt aligned

    # Returns
    return ftpt

# Test on the Systems Analyst bulletin (`sa`)
full_time_part_time(job=sa)

### Get EXP_JOB_CLASS_TITLE
See Notebook `Objective1_a`

In [None]:
# Capitalised words that may be part of a job class title. A word of a requirement sentence
# is considered for the title only when it appears in this list (exact, case-sensitive match).
# The hand-built list contains repeated words; they are removed at the end of the cell.
exp_job_class_title_keywords = ['Body', 'Stenographer', 'Advisor', 'Specialist', 'Electrician', 'I', 
                                'Director', 'Occupational', 'Motor', 'Representative', 'Garage', 'Signal', 
                                'Manager', 'Painter', 'Marking', 'Vehicle', 'Auditor', 'Superintendent', 
                                'Management', 'Mate', 'Helicopter', 'Vessel', 'Personnel', 'IT', 'Processing', 
                                'Procurement', 'Control', 'Sales', 'Zoo', 'Coordinator', 'Geologist', 'Attendant', 
                                'Information', 'Examiner', 'Polygraph', 'Refuse', 'Laboratory', 'Stores', 'Steam', 
                                'Fleet', 'Fiscal', 'Claims', 'Quality', 'Community', 'Carpenter', 'Duty', 
                                'Applications', 'Press', 'Executive', 'Surveying', 'Sweeper', 'Technician', 'Port', 
                                'Instructor', 'Internal', 'Principal', 'Security', 'Machine', 'Recreation', 
                                'Compensation', 'Operating', 'Deck', 'Zoning', 'Distribution', 'Street', 
                                'Lieutenant', 'Operator', 'Commander', 'Water', 'Officer', 'Special', 'Detention', 
                                'Relations', 'Land', 'Maintenance', 'Auto', 'Trainee', 'Examiner', 'Typist', 
                                'Housing', 'Dispatcher', 'Aid', 'Craft', 'Treatment', 'Aide', 'Utility', 'General', 
                                'Controller', 'Fire', 'Assistant', 'IV', 'Heating', 'Collection', 'Wharfinger', 
                                'Health', 'Automotive', 'Supply', 'Messenger', 'Industrial', 'Departmental', 
                                'Laborer', 'Print', 'Window', 'Instrument', 'Construction', 'Plumbing', 'Commercial', 
                                'Cabinet', 'Accountant', 'Resources', 'Communications', 'Worker', 'Administrator', 
                                'Designer', 'Secretary', 'Keeper', 'Metal', 'V', 'Conditioning', 'Chief', 'Events', 
                                'Cartographer', 'Architectural', 'Waterworks', 'III', 'Field', 'Repairer', 'Project', 
                                'Polygraph', 'Repair', 'Drafting', 'Hand', 'Computer', 'Environmental', 'Electric', 
                                'Inspector', 'Estate', 'Programs', 'Custodian', 'Supervising', 'Technical', 
                                'Pumping', 'Starter', 'Material', 'Services', 'Bindery', 'Surveys', 'Education', 
                                'Research', 'Engineering', 'Technician', 'Environmental', 'Driver', 'Apparatus', 
                                'Real', 'Grounds', 'Truck', 'Accounting', 'Solid', 'Firefighter', 'Labor', 
                                'Geographic', 'Bus', 'Biologist', 'Machinist', 'Caretaker', 'Shovel', 'Pipefitter', 
                                'Division', 'Clerk', 'Fireboat', 'Gardener', 'Equipment', 'Development', 
                                'Compliance', 'Helper', 'Lot', 'Apprentice', 'II', 'Wastewater', 'Payroll', 
                                'Traffic', 'Poster', 'Load', 'Nurse', 'Pressure', 'Records', 'Service', 'Locksmith', 
                                'Building', 'Sheet', 'Boat', 'Captain', 'Party', 'Planner', 'Workers', 'Park', 
                                'Station', 'Compressor', 'Event', 'Structural', 'Reptiles', 'Sergeant', 'Airport', 
                                'Refrigeration', 'Delivery', 'Facility', 'Storekeeper', 'Public', 'Deputy', 
                                'Engineer', 'Administrative', 'Head', 'Lighting', 'Senior', 'Testing', 'Toolroom', 
                                'Civil', 'Worker', 'Airports', 'Rehabilitation', 'Power', 'Harbor', 'Cleaning', 
                                'Sign', 'Data', 'Meter', 'Transmission', 'System', 'Battalion', 'Librarian', 
                                'Warehouse', 'Bureau', 'Forensic', 'Builder', 'Elevators', 'Tax', 'Property', 
                                'Airports', 'Veterinary', 'Photographer', 'Curator', 'Elevator', 'Financial', 
                                'Light', 'Mechanic', 'Heavy', 'Irrigation', 'Telecommunications', 'Buyer', 
                                'Supervisor', 'Graphics', 'Cement', 'Materials', 'Plant', 'Reader', 
                                'Planning', 'District', 'Associate', 'Air', 'Parking', 'Building', 'Custodial', 
                                'Printing', 'Marketing', 'Transportation', 'Plumber', 'Detective', 'Police', 
                                'Ranger', 'Systems', 'Mechanical', 'Vessels', 'Birds', 'Managing', 'Duplicating', 
                                'Finisher', 'Analyst', 'Survey', 'Waste', 'Protection', 'Shop', 
                                'Commission', 'Cleaner', 'Electrical', 'Safety', 'Maker', 'Office', 'Golf', 
                                'Operations', 'Auditor', 'Senior', 'Operator', 'Senior', 'Sanitation', 'Line', 
                                'Crew', 'Surgeon', 'Animal', 'Title', 'Disposal', 'Tree', 'Care', 'Masonry', 
                                'Pilot', 'Programmer', 'Utilization']
print(len(exp_job_class_title_keywords))                               # size including repeated words
# Drop duplicates. Going through a set does not preserve the original order of the words.
exp_job_class_title_keywords = list(set(exp_job_class_title_keywords))
print(len(exp_job_class_title_keywords))                               # number of distinct words

In [None]:
# Words that can end a job class title (Ex: 'Analyst' in 'Systems Analyst'). The title search
# scans the keywords of a sentence from the end and stops at a word from this list.
occupations = ['Stenographer', 'Advisor', 'Specialist', 'Electrician', 'Director', 'Representative', 'Manager', 
               'Painter', 'Auditor', 'Superintendent', 'Hand', 'Coordinator', 'Geologist', 'Lieutenant', 'Reptiles',
               'Attendant', 'Examiner', 'Carpenter', 'Personnel', 'Technician', 'Instructor', 'Vessels', 'Vessel',
               'Security', 'Operator', 'Commander', 'Officer', 'Trainee', 'Examiner', 'Typist', 'Dispatcher', 'Aide', 
               'Controller', 'Assistant', 'Wharfinger', 'Messenger', 'Laborer', 'Accountant', 'Worker', 'Operations',
               'Administrator', 'Designer', 'Secretary', 'Keeper', 'Cartographer', 'Architect', 'Repairer', 'Aid',
               'Inspector', 'Starter', 'Technician', 'Driver', 'Firefighter', 'Biologist', 'Machinist', 'Caretaker', 
               'Pipefitter', 'Clerk', 'Gardener', 'Helper', 'Apprentice', 'Poster', 'Nurse', 'Locksmith', 'Captain', 
               'Planner', 'Sergeant', 'Storekeeper', 'Deputy', 'Engineer', 'Worker', 'Elevator', 'Elevators', 'Birds',
               'Librarian', 'Builder', 'Photographer', 'Curator', 'Mechanic', 'Buyer', 'Supervisor', 
               'Reader', 'Associate', 'Custodian', 'Plumber', 'Detective', 'Police', 'Ranger', 'Finisher', 'Analyst',  
               'Cleaner', 'Safety', 'Maker', 'Auditor', 'Operator', 'Senior', 'Surgeon', 'Pilot', 'Programmer'
              ]
print(len(occupations))            # size including repeated words
# Drop duplicates. Going through a set does not preserve the original order of the words.
occupations = list(set(occupations))
print(len(occupations))            # number of distinct words

In [None]:
# Roman numerals that can follow a job class title as its level (Ex: 'Management Analyst II')
seniority_levels = ['I', 'II', 'III', 'IV', 'V']

In [None]:
# Get title of the job class that requires experience associated with
# NOTE(review): `r` is not defined in the visible part of this notebook; it must exist before this cell is run.

## Extract a list of related sentences
temp = [row[1] for row in r]
## Main code for getting title: exactly one entry per sentence
strip_symbols = str.maketrans('', '', '.,;:')                    # removes all unwanted symbols in one pass
ejct = []                                                        # initialization: ejct=experience_job_class_title
for possibly_contains_ejct in temp:                              # for each sentence in temp
    possibly_contains_ejct = possibly_contains_ejct.translate(strip_symbols)
    # Fill all relevant words in a container l. Ex: ['Systems', 'Analyst']
    l = [word for word in possibly_contains_ejct.split() if word in exp_job_class_title_keywords]
    # Analyze list l and get title: scan from the end for the first word that is an occupation
    full_title = None                                            # stays None if never found an occupation
    for possibly_an_occupation in reversed(l):                   # Ex: 'Assistant', 'Management'
        if possibly_an_occupation in occupations:                # see if that word is in the list of occupations
            title = l[:l.index(possibly_an_occupation)+1]        # get title. Ex: ['Management', 'Assistant']. Need +1.
            seniority  = [s for s in l if s in seniority_levels] # get seniority. Ex: 'I', 'II'
            full_title = ' '.join(title + seniority)             # get full title: title and seniority
            break                                                # stop at the first hit so each sentence adds one entry
    ejct.append(full_title)

ejct

In [None]:
# Define a function
def exp_job_class_title(job):
    '''Returns EXP_JOB_CLASS_TITLE (ejct): exactly one entry per 'MINE_1' sentence.

    job: text of one job bulletin, passed straight to _base.
    Each entry is the title words followed by any seniority levels, joined by
    spaces, or None when the sentence is missing or names no known occupation.
    Relies on the notebook-level names _base, exp_job_class_title_keywords,
    occupations and seniority_levels.
    '''
    # Invoke _base function to know how many rows are needed
    temp = _base(job)
    # Extract a list of related sentences. This information is in the 'MINE_1' column
    sentences = list(temp['MINE_1'])

    ejct = []                                          # ejct = experience_job_class_title
    for sentence in sentences:
        if sentence is None:                           # nothing to mine for this row
            ejct.append(None)
            continue
        # Strip off unwanted symbols so that, e.g., 'Analyst,' matches 'Analyst'
        for symbol in '.,;:':
            sentence = sentence.replace(symbol, '')
        # Keep only the words that may belong to a title, in sentence order
        l = [word for word in sentence.split() if word in exp_job_class_title_keywords]
        # Walk backwards through l to find the last occupation word
        full_title = None                              # stays None if no occupation is found
        for possibly_an_occupation in reversed(l):
            if possibly_an_occupation in occupations:
                # Title = everything up to and including the occupation word. Need +1.
                title = l[:l.index(possibly_an_occupation) + 1]
                seniority = [s for s in l if s in seniority_levels]   # Ex: 'I', 'II'
                full_title = ' '.join(title + seniority)
                # Stop at the first hit: without this, every further occupation
                # word would add another entry and rows would go out of step.
                break
        ejct.append(full_title)

    # Returns
    return ejct

# Test
exp_job_class_title(job=sa)

### Get EXP_JOB_CLASS_ALT_RESP
I have no idea what this would look like, since Systems Analyst doesn't have it (see annotations), so just return `None` for now.

In [None]:
# Define a function
def exp_job_class_alt_resp(job):
    '''Returns EXP_JOB_CLASS_ALT_RESP (ejcar): a placeholder None for every 'MINE_1' row.'''
    # _base tells us how many rows this job needs; one None per 'MINE_1' entry
    rows = list(_base(job)['MINE_1'])
    return [None for _ in rows]

# Test
exp_job_class_alt_resp(job=sa)

### Get EXP_JOB_CLASS_FUNCTION

In [None]:
# Define a function
def exp_job_class_function(job):
    '''Returns EXP_JOB_CLASS_FUNCTION (ejcf) as a plain list, one entry per row of _base(job).'''
    # The values are already prepared by _base; just pull the column out as a list
    return list(_base(job)['EXP_JOB_CLASS_FUNCTION'])

# Test
exp_job_class_function(job=sa)

### Get COURSE_COUNT

In [None]:
#Define a function
def course_count(job):
    '''Returns COURSE_COUNT (cc): exactly one entry per 'MINE_2' sentence.

    job: text of one job bulletin, passed straight to _base.
    An entry is number_dict[word] for the first word of the sentence found in
    number_dict, provided the sentence contains the word 'courses'. It is None
    when the sentence is None, has no 'courses', or has no known number word.
    Relies on the notebook-level names _base and number_dict.
    '''
    # Invoke _base function to know how many rows are needed
    temp = _base(job)
    # Extract a list of related sentences. This information is in the 'MINE_2' column
    sentences = list(temp['MINE_2'])

    cc = []                                            # cc = course_count
    for sentence in sentences:                         # Ex: None, 'Satisfactory...'
        count = None                                   # default when nothing is found
        if sentence is not None:
            for symbol in '.,;:':                      # strip off unwanted symbols
                sentence = sentence.replace(symbol, '')
            words = sentence.split()
            if 'courses' in words:
                # Take only the first number word; later ones belong to
                # something else, e.g. 'three semester'. Falls back to None so
                # the row is never dropped when no number word is present.
                count = next((number_dict[word] for word in words if word in number_dict), None)
        cc.append(count)

    # Returns
    return cc

# Test
course_count(job=sa)

### Get COURSE_LENGTH

In [None]:
temp = [r[4][4]]
cl = []                                                # cl = course_length
for possibly_contains_cl in temp:
    length = None                                      # default when nothing is found
    if possibly_contains_cl is not None:
        for symbol in '.,;:':                          # strip off unwanted symbols
            possibly_contains_cl = possibly_contains_cl.replace(symbol, '')
        words = possibly_contains_cl.split()
        parts = []
        # Handle each unit on its own so a sentence with only 'semester' or
        # only 'quarter' no longer raises ValueError.
        for unit, suffix in (('semester', 'S'), ('quarter', 'Q')):
            if unit in words:
                unit_index = words.index(unit)
                # Look at the two words before the unit; max() avoids a negative slice start
                nearby = words[max(unit_index - 2, 0):unit_index]
                numbers = [number_dict[word] for word in nearby if word in number_dict]
                if numbers:
                    parts.append(str(numbers[-1]) + suffix)   # number closest to the unit
        if parts:
            length = '|'.join(parts)                   # Ex: '3S|4Q'
    cl.append(length)

print(cl)

In [None]:
# Define a function
def course_length(job):
    '''Returns COURSE_LENGTH (cl): exactly one entry per 'MINE_2' sentence.

    job: text of one job bulletin, passed straight to _base.
    An entry looks like '3S|4Q' (semester part, then quarter part). A part is
    produced when the word 'semester' / 'quarter' is present and one of the
    two words before it is a key of number_dict. The entry is None when the
    sentence is None or no part could be built.
    Relies on the notebook-level names _base and number_dict.
    '''
    # Invoke _base function to know how many rows are needed
    temp = _base(job)
    # Extract a list of related sentences. This information is in the 'MINE_2' column
    sentences = list(temp['MINE_2'])

    cl = []
    for sentence in sentences:
        length = None                                  # default when nothing is found
        if sentence is not None:
            for symbol in '.,;:':                      # strip off unwanted symbols
                sentence = sentence.replace(symbol, '')
            words = sentence.split()
            parts = []
            # Handle each unit on its own so a sentence with only 'semester' or
            # only 'quarter' no longer raises ValueError.
            for unit, suffix in (('semester', 'S'), ('quarter', 'Q')):
                if unit in words:
                    unit_index = words.index(unit)
                    # Two words before the unit; max() avoids a negative slice start
                    nearby = words[max(unit_index - 2, 0):unit_index]
                    numbers = [number_dict[word] for word in nearby if word in number_dict]
                    if numbers:
                        parts.append(str(numbers[-1]) + suffix)   # number closest to the unit
            if parts:
                length = '|'.join(parts)
        cl.append(length)

    # Returns
    return cl
# Test (pass the job text, not the cl list built in the cell above)
course_length(job=sa)

### Get COURSE_SUBJECT
The approach is very similar to setting up EDUCATION_MAJORS

In [None]:
# Course subjects to look for. They are written in lower case because each sentence
# is lower-cased before these names are searched in it (there are MANY jobs with
# lower case courses); a match is reported in upper case.
courses = ['information systems', 'systems analysis']

In [None]:
temp = [r[4][4]]
cs = []                                                # cs = course_subject
for possibly_contains_cs in temp:
    all_types_of_courses = []
    if possibly_contains_cs is not None:
        for symbol in '.,;:':                          # strip off unwanted symbols
            possibly_contains_cs = possibly_contains_cs.replace(symbol, '')
        possibly_contains_cs = possibly_contains_cs.lower()
        words = possibly_contains_cs.split()
        if 'courses' in words:
            # Known subjects are matched as substrings of the lower-cased sentence
            all_types_of_courses = [course_name.upper() for course_name in courses
                                    if course_name in possibly_contains_cs]
            if 'closely' in words:
                # Keep the free-text tail, Ex: 'closely related field'
                all_types_of_courses.append(' '.join(words[words.index('closely'):]))
    # Always append exactly one entry, so a sentence that mentions 'courses'
    # without any recognised subject still yields a (None) row.
    cs.append('|'.join(all_types_of_courses) if all_types_of_courses else None)

cs

In [None]:
# Define a function
def course_subject(job):
    '''Returns COURSE_SUBJECT (cs): exactly one entry per 'MINE_2' sentence.

    job: text of one job bulletin, passed straight to _base.
    For a sentence containing the word 'courses', the entry is a '|'-joined
    string of every name in `courses` found in the lower-cased sentence
    (reported in upper case), plus the tail of the sentence starting at the
    word 'closely' if present. The entry is None when the sentence is None,
    has no 'courses', or no subject was recognised.
    Relies on the notebook-level names _base and courses.
    '''
    # Invoke _base function to know how many rows are needed
    temp = _base(job)
    # Extract a list of related sentences. This information is in the 'MINE_2' column
    sentences = list(temp['MINE_2'])

    cs = []
    for sentence in sentences:
        subjects = []
        if sentence is not None:
            for symbol in '.,;:':                      # strip off unwanted symbols
                sentence = sentence.replace(symbol, '')
            sentence = sentence.lower()
            words = sentence.split()
            if 'courses' in words:
                # Known subjects are matched as substrings of the lower-cased sentence
                subjects = [course_name.upper() for course_name in courses
                            if course_name in sentence]
                if 'closely' in words:
                    # Keep the free-text tail, Ex: 'closely related field'
                    subjects.append(' '.join(words[words.index('closely'):]))
        # Always append exactly one entry so the result stays aligned with the rows
        cs.append('|'.join(subjects) if subjects else None)

    # Returns
    return cs

# Test
course_subject(job=sa)

### Get MISC_COURSE_DETAILS

In [None]:
# Define a function
def misc_course_details(job):
    '''Returns MISC_COURSE_DETAILS (mcd) as a plain list, one entry per row of _base(job).'''
    # The details are already prepared by _base in its 'MISC' column
    return list(_base(job)['MISC'])

# Test
misc_course_details(job=sa)

### Get DRIVERS_LICENSE_REQ
This is quite tricky because sometimes, this information is given at the end of the job description.

In [None]:
# Everything from 'PROCESS NOTES' onwards, cut into lines
temp = sa[sa.find('PROCESS NOTES'):].split('\n')

# Decide whether a driver license is required for the job.
# Only the first line that mentions 'driver' is examined.
dlr = []
for line in temp:
    if 'driver' not in line:
        continue
    print(line)
    splitted_sentence = line.split()
    if 'require' in splitted_sentence:       # the exact word 'require'
        dlr.append('P')
    elif 'required' in splitted_sentence:    # the exact word 'required'
        dlr.append('R')
    break

dlr

In [None]:
# Define a function
def drivers_license_req(job):
    '''Returns DRIVERS_LICENSE_REQ (dlr) for the given job bulletin text.

    Only the part of `job` from 'PROCESS NOTES' onwards is searched, and only
    the first line containing 'driver' is examined. The result is:
      ['P'] if that line contains the exact word 'require',
      ['R'] if it contains the exact word 'required',
      []    if it contains neither, if no line mentions 'driver',
            or if there is no 'PROCESS NOTES' section.
    '''
    # Locate the information. Read it from the `job` argument, not the global `sa`.
    start = job.find('PROCESS NOTES')
    if start == -1:
        return []

    # Determine whether driver license is required for the job
    dlr = []
    for line in job[start:].split('\n'):
        if 'driver' in line:
            words = line.split()
            if 'require' in words:
                dlr.append('P')
            elif 'required' in words:
                dlr.append('R')
            break                                      # only the first mention counts

    # Returns
    return dlr

# Test
drivers_license_req(job=sa)

### Get DRIV_LIC_TYPE
* This depends on the information mined from DRIVERS_LICENSE_REQ, so first we have to invoke that function.
* For now, I just want to take it easy. Thus, the function below will need to be modified to accommodate, for example, `TRUCK OPERATOR 3583 012618.txt`. Hopefully, this will not be too difficult.

In [None]:
# Define a function
def driv_lic_type(job):
    '''Returns DRIV_LIC_TYPE (dlt) for the given job bulletin text.

    The result is [None] when drivers_license_req(job) reports 'P' or reports
    nothing at all. It is an empty list when it reports 'R', because that
    case is not implemented yet.
    '''
    # Invoke drivers_license_req to obtain relevant information
    temp = drivers_license_req(job)
    # Get type of driver license based on the information
    dlt = []
    # `not temp` guards the temp[0] lookup: an empty result used to raise IndexError
    if not temp or temp[0] == 'P':
        dlt.append(None)
    # The 'R' case still has to be written (see TRUCK OPERATOR), so dlt stays empty

    # Returns
    return dlt

# Test
driv_lic_type(job=sa)

### Get ADDTL_LIC
For now, just give Nones.

In [None]:
# Define a function
def addtl_lic(job):
    '''Returns ADDTL_LIC (al): for now always a one-element list holding None.'''
    # `job` is not inspected yet; this is a placeholder
    return [None]

# Test
addtl_lic(job=sa)

### Get EXAM_TYPE
We have two options:
* Copy the code on Kaggle. I remember there was a guy who did a very good job at this, but used regular expression. The downside of this option is that I have to learn what he wrote. I got stuck in writing my own function, so here's the link [danielbecker](https://www.kaggle.com/danielbecker/l-a-jobs-data-exctraction-eda).
* Write my own function. The upside of this option is that it's fairly easy to understand.

Let's use the Kaggle code because I'm too tired now. Actually, it's quite similar to what I would have written myself: fundamentally, it uses a dictionary to look up the keywords. The difference is that it relies on regular-expression matching rather than plain string manipulation.

In [None]:
def exam_type(job):
    '''
    Returns EXAM_TYPE (et): a one-element list with the first matching label,
    or an empty list when nothing matches.
    This code is borrowed from Daniel Becker on Kaggle, with some variables' renaming
    to fit into my code style for this project.
    '''
    # Checked top to bottom; the first pattern found anywhere in the text wins.
    # re.DOTALL lets '.*' run across line breaks.
    patterns = (
        ('OPEN_INT_PROM', r'BOTH.*INTERDEPARTMENTAL.*PROMOTIONAL'),
        ('INT_DEPT_PROM', r'INTERDEPARTMENTAL.*PROMOTIONAL'),
        ('DEPT_PROM', r'DEPARTMENTAL.*PROMOTIONAL'),
        ('OPEN', r'OPEN.*COMPETITIVE.*BASIS'),
    )
    for label, pattern in patterns:
        if re.search(pattern, job, re.DOTALL):
            return [label]
    return []

# Test
exam_type(job=sa)

### Get ENTRY_SALARY_GEN

In [None]:
# Locate the info: it sits between 'ANNUAL SALARY' and 'NOTES'
temp = sa[sa.find('ANNUAL SALARY'):sa.find('NOTES')]
# Next, we'll replace '.' and '$' with white spaces, before splitting this string at white space.
# Luckily, we can chain these operations for readability.
temp = temp.replace('.', ' ').replace('$', ' ').split()
# Finally, let's get the salary range using the same approach that isolated
# job_titles. Per Bob's suggestion, let's have a catch list manual_check.
salary_range = []
manual_check = []
for word in temp:
    ## Some jobs are like this: $##,### to $##,###, $##,### to $##,###.
    ## Also, watch out for ROOFER!
    if ',' in word:
        try:  # This is like an extra layer of checking: avoiding, e.g., "t,o"
            int(word.replace(',', ''))
        except ValueError:  # only a failed number conversion is expected here
            manual_check.append(word)
        else:
            salary_range.append(word)
print(salary_range)
print(manual_check)

In [None]:
# Define functions
# Need to modify to account for flat-rated salary (ROOFER)
def get_all_salaries(job):
    '''Returns all salaries that can be found, as a list of strings like '68,611'.

    job: text of one job bulletin.
    The search covers the text from 'ANNUAL SALARY' up to the next 'NOTES', or
    to the end of the text when there is no 'NOTES'. The result is empty when
    there is no 'ANNUAL SALARY'. A word counts as a salary when it contains a
    comma and is an integer once the commas are removed; other comma words
    are skipped.
    '''
    # From ANNUAL SALARY to NOTES is where the information is located
    start = job.find('ANNUAL SALARY')
    if start == -1:
        return []
    end = job.find('NOTES', start)
    section = job[start:end] if end != -1 else job[start:]
    # Replace '.' and '$' with white spaces, before splitting this string at white space.
    words = section.replace('.', ' ').replace('$', ' ').split()
    # Get the salary range by trying to convert each word into an integer
    salary_range = []
    for word in words:
        ## Some jobs are like this: $##,### to $##,###, $##,### to $##,###.
        ## Also, watch out for ROOFER!
        if ',' in word:
            try:  # extra layer of checking: avoiding, e.g., "t,o"
                int(word.replace(',', ''))
            except ValueError:
                # Not a number. Skip it rather than touch a manual_check list
                # that this function never defined.
                continue
            salary_range.append(word)

    return salary_range

# Test
print(get_all_salaries(job=sa))

Note that this approach doesn't really work because we have flat-rated salary, general entry level salary ranges, and DWP (Department of Water and Power) salary ranges, while the dataframe requirement is to select only the first listed salary range. Thus, we need to define functions and write very simple but error-traceable code, even though this may mean a lot of repetitions.

In [None]:
# Define a helper function
def _get_salary(salary_text):
    '''Returns the first listed salary as '#####-#####', or '#####' if flat-rated.

    The result is built from the first two all-digit words of the cleaned text
    and is an empty string when there is none.
    '''
    # isdigit() is what recognises a number, so remove whatever would make it fail:
    #   '.' -> ' '  resolves '#####.'  (dot at the end)
    #   '$' -> ' '  resolves '$#####'  (dollar sign in the beginning)
    #   ',' -> ''   resolves '##,###'  (comma in the middle of the number)
    cleaned = salary_text
    for symbol, replacement in (('.', ' '), ('$', ' '), (',', '')):
        cleaned = cleaned.replace(symbol, replacement)

    # Split at white space and keep the numbers only
    amounts = [token for token in cleaned.split() if token.isdigit()]

    # Only the first listed salary range (at most two numbers) is wanted
    return '-'.join(amounts[:2])

In [None]:
# Now use helper function to get ENTRY_SALARY_GEN
def entry_salary_gen(job):
    '''Returns ENTRY_SALARY_GEN (esg)'''
    # The general salary sits between 'ANNUAL SALARY' and 'Department of Water and Power'.
    # index (not find) is used on purpose: it raises ValueError when a marker is
    # missing, which a try/except can take advantage of later.
    start = job.index('ANNUAL SALARY')
    stop = job.index('Department of Water and Power')

    # Returns
    return _get_salary(salary_text=job[start:stop])

# Test
entry_salary_gen(job=sa)

### Get ENTRY_SALARY_DWP

In [None]:
# Now use helper function to get ENTRY_SALARY_DWP
def entry_salary_dwp(job):
    '''Returns ENTRY_SALARY_DWP (esd)'''
    # The DWP salary sits between 'Department of Water and Power' and 'NOTES'.
    # index (not find) is used on purpose: it raises ValueError when a marker is
    # missing, which a try/except can take advantage of later.
    start = job.index('Department of Water and Power')
    stop = job.index('NOTES')

    # Returns
    return _get_salary(salary_text=job[start:stop])

# Test
entry_salary_dwp(job=sa)

Note: Later on, when filling out the dataframe, we'll use `try/except/finally` (try the specific function; except, use `get_all_salaries()`; finally, return a null value).

### Get OPEN_DATE

In [None]:
# The date is the last word between 'Open Date' and the first '('
od_start, od_stop = sa.find('Open Date'), sa.find('(')
temp = sa[od_start:od_stop]
od   = temp.split().pop()
od

In [None]:
# Define a function
def open_date(job):
    '''Returns OPEN_DATE (od) as written in the bulletin, Ex: '10-27-17'.

    job: text of one job bulletin.
    The date is the last word between 'Open Date' and the first '(' that
    follows it. If no '(' follows, only the 'Open Date' line is used.
    Returns None when there is no 'Open Date' or nothing follows it.
    '''
    # Open Date: 10-27-17\n(Exam Open to All
    marker = 'Open Date'
    start = job.find(marker)
    if start == -1:
        return None
    start += len(marker)
    # Search for '(' only after the marker, so a '(' earlier in the text
    # (e.g. in the job title) cannot produce an empty slice.
    end = job.find('(', start)
    if end == -1:
        segment = job[start:].split('\n', 1)[0]
    else:
        segment = job[start:end]
    # Drop the ':' after the marker, then get the last element
    words = segment.replace(':', ' ').split()
    od = words[-1] if words else None

    return od

# Test
open_date(job=sa)

# <font color='red'> ~~TO BE CONTINUED~~</font>. <font color='green'>DONE ON Sunday, 6/2/2019.</font>