# EDA for `SYSTEMS ANALYST 1596 102717.txt`

In [1]:
# First, import relevant modules
import os
import numpy as np
import pandas as pd

In [2]:
# The os module lets Python talk to the operating system it is running on
import os

# Folder whose contents we want to inspect
path = 'CityofLA/Job Bulletins/'

# os.listdir returns the folder's entries in no particular order (not the order shown in a file
# browser), so build an alphabetically sorted list from it in a single step.
# Reminder: sorted(x) returns a new list, whereas x.sort() reorders x in place and returns None.
# NOTE(review): os.listdir returns every entry in the folder, not only *.txt files -- the name
# `all_txt_files` assumes the folder holds nothing else; confirm.
all_txt_files = sorted(os.listdir(path))

# Note2self: Some people use os.walk, which goes through every root and its directories, to list all files.
# We probably don't need it for now

In [3]:
# Sanity checks on the directory listing
file_count   = len(all_txt_files)       # should be 683, as manually verified
unique_count = len(set(all_txt_files))  # equals file_count only if every file name is unique
print(file_count)
print(unique_count)
print('SYSTEMS ANALYST 1596 102717.txt' in all_txt_files) # should be True

683
683
True


In [4]:
# Locate `SYSTEMS ANALYST 1596 102717.txt` (which exists) in the sorted listing. list.index returns the
# position of the first match; that is unambiguous here because we showed above that each file name is unique.
sa_position = all_txt_files.index('SYSTEMS ANALYST 1596 102717.txt')
print(sa_position)

609


## Import this text file as a string.
`sa` stands for systems analyst

In [5]:
# Read the whole bulletin into a single string, `sa`.
# Look the file up by name instead of by a hard-coded position (it was 609): the position depends on
# what else is in the folder, so a fixed index silently points at a different bulletin as soon as a
# file is added or removed. list.index raises ValueError if the bulletin is missing.
sa_filename = 'SYSTEMS ANALYST 1596 102717.txt'
sa_path = path + all_txt_files[all_txt_files.index(sa_filename)]
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open(sa_path, 'rt') as sa_file:
    sa = sa_file.read()
sa # display

'SYSTEMS ANALYST\n\nClass Code: 1596\nOpen Date: 10-27-17\n(Exam Open to All, including Current City Employees)\n\nANNUAL SALARY\n\n$68,611 to $100,307\nThe salary in the Department of Water and Power is $70,908 to $88,092 and $83,770 to $104,065.\n\nNOTES:\n\n1. Candidates from the eligible list are normally appointed to vacancies in the lower pay grade positions.  \n2. Annual salary is at the start of the pay range. The current salary range is subject to change. Please confirm the starting salary with the hiring department before accepting a job offer.\n3. For information regarding reciprocity between the City of Los Angeles departments and LADWP, go to http://per.lacity.org/Reciprocity_CityDepts_and_DWP.pdf.\n\n\nDUTIES\n\nA Systems Analyst analyzes procedures, methods and operations of computer-based information systems; designs, implements, and recommends information systems to improve the efficiency and economy of City operations; performs cost benefit and feasibility analyses re

## Get required columns

In [6]:
# Markers that delimit one part of the bulletin from the next.
# The job title comes first, followed by the fixed headings that appear in the text.
job_title = 'SYSTEMS ANALYST'
section_markers = (
    'Class Code',
    'Open Date',
    '(',
    'ANNUAL SALARY',
    'NOTES',
    'DUTIES',
    'REQUIREMENTS/MINIMUM QUALIFICATIONS',
    'PROCESS NOTES',
)
mask = [job_title, *section_markers]

### Get the job_title

In [7]:
# The job title is the first line of the text that runs from the title marker up to 'Class Code'.
# NOTE(review): str.find returns -1 when a marker is missing, which would silently produce a
# wrong slice rather than an error.
title_start = sa.find(job_title)
title_end   = sa.find('Class Code')
temp = sa[title_start:title_end]
jt   = temp.partition('\n')[0]  # everything before the first newline
jt

'SYSTEMS ANALYST'

### Get the JOB_CLASS_NO

In [8]:
# str.split() with no argument splits on any run of whitespace, newlines included,
# so the class code comes out as a clean token.
section_start = sa.find('Class Code')
section_end   = sa.find('Open Date')
temp = sa[section_start:section_end]
# Convert every all-digit token to int and keep the first one as the job class number
numeric_tokens = list(map(int, filter(str.isdigit, temp.split())))
jcn  = numeric_tokens[0]
jcn

1596

### Get Open Date - od

In [9]:
# The open date is the last whitespace-separated token between 'Open Date' and the next '('.
# The search for '(' is anchored *after* 'Open Date': '(' is such a generic marker that its first
# occurrence in the file may come earlier, which would make the slice empty and the [-1] lookup
# below fail with IndexError.
od_start = sa.find('Open Date')
od_end   = sa.find('(', od_start)
temp = sa[od_start:od_end]
od   = temp.split()[-1]
od

'10-27-17'

### Get salary

In [10]:
# Locate the salary section: everything from 'ANNUAL SALARY' up to 'NOTES'
temp = sa[sa.find('ANNUAL SALARY'):sa.find('NOTES')]
# Replace '.' and '$' with white space, then split the string at white space.
# The operations are chained for readability.
temp = temp.replace('.', ' ').replace('$', ' ').split()
# Collect the salary figures. Per Bob's suggestion, anything that looks like a figure
# (contains a comma) but does not parse as a number goes into the catch list manual_check.
salary_range = []
manual_check = []
for word in temp:
    ## Some jobs are like this: $##,### to $##,###, $##,### to $##,###.
    ## Also, watch out for ROOFER!
    if ',' not in word:
        continue
    try: # Extra layer of checking: avoids keeping, e.g., "t,o"
        int(word.replace(',', ''))
    except ValueError:
        # Only a failed number conversion means "not a salary". A bare `except:` here would
        # also swallow unrelated errors (even KeyboardInterrupt) and hide real bugs.
        manual_check.append(word)
    else:
        salary_range.append(word)
print(salary_range)
print(manual_check)

['68,611', '100,307', '70,908', '88,092', '83,770', '104,065']
[]


### Get duties

In [11]:
# The duties text runs from the 'DUTIES' heading up to the requirements heading
duties_start = sa.find('DUTIES')
duties_end   = sa.find('REQUIREMENTS/MINIMUM QUALIFICATIONS')
duties_words = sa[duties_start:duties_end].split()
temp = duties_words[1:]   # drop the first token, i.e. the 'DUTIES' heading itself
duties = ' '.join(temp)   # re-joining on single spaces also collapses newlines and repeated blanks
duties

'A Systems Analyst analyzes procedures, methods and operations of computer-based information systems; designs, implements, and recommends information systems to improve the efficiency and economy of City operations; performs cost benefit and feasibility analyses related to the modification of existing computer-based information systems, the maintenance and support of information systems, or the implementation of new computer-based systems; accesses and analyzes information from automated files using high-level retrieval languages.'

### Get requirements
<font color='red'>For now, assume that every txt file contains the heading PROCESS NOTES</font>

In [12]:
# Cut out the requirements section: from its heading up to (not including) 'PROCESS NOTES'
requirements_start = sa.find('REQUIREMENTS/MINIMUM QUALIFICATIONS')
requirements_end   = sa.find('PROCESS NOTES')
temp = sa[requirements_start:requirements_end]
temp

'REQUIREMENTS/MINIMUM QUALIFICATIONS\n\n1. Graduation from an accredited four-year college or university with a major in Computer Science, Information Systems, or Geographical Information Systems; or\n2. Graduation from an accredited four-year college or university and two years of full-time paid experience in a class at the level of Management Assistant which provides experience in:\na. the development, analysis, implementation or major modification of new or existing computer-based information systems or relational databases; or\nb. performing cost benefit, feasibility and requirements analysis for a large-scale computer-based information system; or\nc. performing system implementation and support activities including software and hardware acquisition, installation, modifications to system configuration, system and application upgrade installation; or\n3. Two years of full-time paid experience as a Systems Aide with the City of Los Angeles; and\na. Satisfactory completion of four cou

In [13]:
# Remove the dots in 1., 2., a., etc. by deleting every '.' character from the text.
# NOTE(review): this deletes *all* dots, so sentence-ending periods and any decimal points
# in the section disappear too -- confirm that is acceptable for later steps.
temp = temp.translate({ord('.'): None})
temp

'REQUIREMENTS/MINIMUM QUALIFICATIONS\n\n1 Graduation from an accredited four-year college or university with a major in Computer Science, Information Systems, or Geographical Information Systems; or\n2 Graduation from an accredited four-year college or university and two years of full-time paid experience in a class at the level of Management Assistant which provides experience in:\na the development, analysis, implementation or major modification of new or existing computer-based information systems or relational databases; or\nb performing cost benefit, feasibility and requirements analysis for a large-scale computer-based information system; or\nc performing system implementation and support activities including software and hardware acquisition, installation, modifications to system configuration, system and application upgrade installation; or\n3 Two years of full-time paid experience as a Systems Aide with the City of Los Angeles; and\na Satisfactory completion of four courses, o

In [14]:
# Collect the numbers that mark where one requirement ends and the next begins:
# every whitespace-separated token made up only of digits, normalised through int() and back to str.
# NOTE(review): a standalone number in the middle of a sentence is collected as well.
digits = []
for token in temp.split():
    if token.isdigit():
        digits.append(str(int(token)))
digits

['1', '2', '3']

In [15]:
# Separate the requirements from one another.
#
# For every marker in `digits`, find where that requirement starts in `temp`; each requirement
# then runs from its own start up to the start of the next one (the last one runs to the end).
#
# Each search starts just after the previous marker and prefers a marker that opens a line
# ("\n2 "). Searching with a plain temp.find('2') from the beginning would stop at the first '2'
# anywhere in the text (e.g. inside an earlier requirement) and cut the text at the wrong place.
haystack = '\n' + temp   # extra newline so a marker on the very first line also counts as line-leading
requirement_starts = []
search_from = 0
for digit in digits:
    # haystack is temp shifted right by one, so the index of the newline in haystack
    # is exactly the index of the digit in temp.
    start = haystack.find('\n' + digit + ' ', search_from)
    if start == -1:
        # No line-leading marker: fall back to the next plain occurrence of the digit
        start = temp.find(digit, search_from)
    if start == -1:
        continue             # marker not found after the previous one; nothing to cut here
    requirement_starts.append(start)
    search_from = start + 1

# Pair every start with the following start; the final requirement ends at the end of the text.
# This replaces the former try/bare-except on the last index, which hid any other error as well
# and overwrote the `mask` list with a string.
requirement_ends = requirement_starts[1:] + [len(temp)]
requirement_set = [temp[start:end] for start, end in zip(requirement_starts, requirement_ends)]

requirement_set # display results

['1 Graduation from an accredited four-year college or university with a major in Computer Science, Information Systems, or Geographical Information Systems; or\n',
 '2 Graduation from an accredited four-year college or university and two years of full-time paid experience in a class at the level of Management Assistant which provides experience in:\na the development, analysis, implementation or major modification of new or existing computer-based information systems or relational databases; or\nb performing cost benefit, feasibility and requirements analysis for a large-scale computer-based information system; or\nc performing system implementation and support activities including software and hardware acquisition, installation, modifications to system configuration, system and application upgrade installation; or\n',
 '3 Two years of full-time paid experience as a Systems Aide with the City of Los Angeles; and\na Satisfactory completion of four courses, of at least three semester or

For now, put things into the following format: `[('1'), ('2a', '2b', '2c'), ('3ab')]`

In [16]:
# Simplify requirement_set. The newline characters (\n) mark where one sentence ends and the next
# begins, so split each requirement on them; the pieces left over from trailing newlines
# (anything of length 0 or 1) are dropped to make the analysis easier.
simplified_requirement_set = []
for requirement in requirement_set:
    sentences = [sentence for sentence in requirement.split('\n') if len(sentence) > 1]
    simplified_requirement_set.append(sentences)
simplified_requirement_set

## Note: a nested list is exactly what I need because each inner list starts with the sentence
## carrying the digit, which naturally separates one requirement from the next

[['1 Graduation from an accredited four-year college or university with a major in Computer Science, Information Systems, or Geographical Information Systems; or'],
 ['2 Graduation from an accredited four-year college or university and two years of full-time paid experience in a class at the level of Management Assistant which provides experience in:',
  'a the development, analysis, implementation or major modification of new or existing computer-based information systems or relational databases; or',
  'b performing cost benefit, feasibility and requirements analysis for a large-scale computer-based information system; or',
  'c performing system implementation and support activities including software and hardware acquisition, installation, modifications to system configuration, system and application upgrade installation; or'],
 ['3 Two years of full-time paid experience as a Systems Aide with the City of Los Angeles; and',
  'a Satisfactory completion of four courses, of at least 

Algorithm:

for `aList` in the list of `simplified_requirement_set`:
* Step 1: if length is 1, then it *may be* a requirement_id. This is to handle situations: "1. and 2." or "1. or 2."
    * Step 1a: if it ends with "and", then this line is related to the next line
    * Step 1b: else, then it is indeed a requirement_id
* Step 2: if length is more than 1, then it defines a sub-requirement. This is to handle situations: "2a or 2b" or "3 and a or b"
    * Step 2a: if all the sub-requirements end with 'or', then each defines a sub_requirement_id
    * Step 2b: else, then it defines a *nested* sub_requirement_id, such as 3ab.

In [17]:
def _leading_id(sentence):
    """Return the first whitespace-separated token of `sentence` ('' if it has none).

    Using the whole token instead of the first character keeps multi-character
    ids such as '10' intact.
    """
    tokens = sentence.split()
    return tokens[0] if tokens else ''


# Build one tuple of ids per requirement (or per alternative sub-requirement).
r = []
# enumerate gives the position directly; looking it up with list.index(aList) returns the first
# *equal* entry, which is the wrong position when two requirements have identical text.
for position, aList in enumerate(simplified_requirement_set):
    if not aList:
        continue  # nothing to identify in an empty entry
    main_sentence      = aList[0]
    requirement_id     = _leading_id(main_sentence)
    requirement_subset = aList[1:]

    if not requirement_subset:
        # Step 1: a single sentence *may be* a stand-alone requirement ("1. or 2.") ...
        following = simplified_requirement_set[position + 1:position + 2]
        if main_sentence.endswith('and') and following and following[0]:
            # Step 1a: ... unless it ends with "and", which ties it to the next requirement.
            # Only the id of the next requirement is wanted, not its whole first sentence.
            # (The next requirement is still processed on its own in the following iteration.)
            next_id = _leading_id(following[0][0])
            r.append((requirement_id, next_id))
        else:
            # Step 1b: it is indeed a requirement on its own. This also covers a trailing "and"
            # on the very last requirement, where there is no next entry to link to.
            r.append((requirement_id,))
    elif all(sub.endswith('or') for sub in requirement_subset):
        # Step 2a: every sub-requirement ends with "or", so each one is a separate alternative
        for sub_requirement in requirement_subset:
            r.append((requirement_id, _leading_id(sub_requirement)))
    else:
        # Step 2b: otherwise the sub-requirements belong together: one nested id, such as 3ab
        sub_ids = tuple(_leading_id(sub_requirement) for sub_requirement in requirement_subset)
        r.append((requirement_id,) + sub_ids)

print(r)

[('1',), ('2', 'a'), ('2', 'b'), ('2', 'c'), ('3', 'a', 'b')]


### Get education years
I haven't done it yet, but I think this task is fairly easy.

### Get school type
Not too hard

### Get education major
Idea: build a list of majors and just look things up

### Get experience length
Idea: Build a dictionary such as
`{'one': 1, 'two': 2, ...}` and look up each word of a sentence in it. So it shouldn't be too hard

### Get full time part time
Doesn't "annual" (as in ANNUAL SALARY) imply full time?

### Get experience job class title
See Notebook `Objective1_a`