# Idea for building a dataframe

In [2]:
# Import relevant modules
import numpy as np
import pandas as pd

## Approach 1

In [3]:
# Requirement sets together with their requirement subsets, one tuple per entry
r = [
    ('1',),
    ('2', 'a'),
    ('2', 'b'),
    ('2', 'c'),
    ('3', 'a', 'b'),
]

In [4]:
# Feed the list of tuples straight to the DataFrame constructor and inspect
# the result; tuples shorter than the longest one are padded with missing values
pd.DataFrame(data=r)

Unnamed: 0,0,1,2
0,1,,
1,2,a,
2,2,b,
3,2,c,
4,3,a,b


Note 1: At first glance, this seems to be what we want. However, after careful consideration, we really want columns such as `FILE_NAME`, `JOB_CLASS_TITLE`, etc. to come first. In Python, there does not seem to be an easy way to rearrange the columns the way *SELECT [column 1, column 2] FROM* does in SQL, so we have to abandon this approach.

## Approach 2

In [5]:
# Let's try creating only one column.
# A bare integer is not a valid index, so pandas raises a TypeError here.
# The error is the point of this cell: catch it and show the message so the
# notebook still survives "Restart Kernel -> Run All".
try:
    pd.DataFrame(index=5)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index(...) must be called with a collection of some kind, 5 was passed

In [6]:
# The previous attempt failed with:
#   TypeError: Index(...) must be called with a collection of some kind, 5 was passed
# so try again, this time passing a list of index labels
pd.DataFrame(data=None, index=[5, 7])

5
7


In [9]:
# Looks good. Now add a named column on top of a five-row index
pd.DataFrame(columns=['FILE_NAME'], index=range(5))

Unnamed: 0,FILE_NAME
0,
1,
2,
3,
4,


In [11]:
# Much better! Can it automatically populate a shorter column?
# It cannot: columns of different lengths make pandas raise a ValueError.
# The error is the point of this cell: catch it and show the message so the
# notebook still survives "Restart Kernel -> Run All".
try:
    pd.DataFrame({'col1': [2, 3], 'col2': ['x']})
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: arrays must all be same length

In [12]:
# The previous attempt failed with:
#   ValueError: arrays must all be same length
# Fix: repeat the single value with list multiplication so both columns match
pd.DataFrame(dict(col1=[3, 5], col2=['x'] * 2))

Unnamed: 0,col1,col2
0,3,x
1,5,x


## Approach 3

In [13]:
# Start from a dictionary keyed by field name; every value is None for now
k = dict.fromkeys(['FILE_NAME', 'JOB_CLASS_TITLE'])
k

{'FILE_NAME': None, 'JOB_CLASS_TITLE': None}

In [16]:
# Populate one field of the dictionary using the list-multiplication trick
# from Approach 2
k.update(FILE_NAME=['x'] * 5)
k

{'FILE_NAME': ['x', 'x', 'x', 'x', 'x'], 'JOB_CLASS_TITLE': None}

In [17]:
# Last step: hand the dictionary to the DataFrame constructor;
# each key becomes a column
pd.DataFrame(data=k)

Unnamed: 0,FILE_NAME,JOB_CLASS_TITLE
0,x,
1,x,
2,x,
3,x,
4,x,


In [21]:
# Show each field name next to the value currently stored for it
for key in k:
    values = k[key]
    print(f"{key} {values}")

FILE_NAME ['x', 'x', 'x', 'x', 'x']
JOB_CLASS_TITLE None


## Conclusion
Build one large dictionary keyed by field name, fill in its values, and convert it to a dataframe; hopefully, things will work.

In [28]:
# Field names that become the keys of the dictionary (and, once converted,
# the dataframe columns), in the order they should appear.
FIELD_NAMES = [
    'FILE_NAME',
    'JOB_CLASS_TITLE',
    'JOB_CLASS_NO',
    'REQUIREMENT_SET_ID',
    'REQUIREMENT_SUBSET_ID',
    'JOB_DUTIES',
    'EDUCATION_YEARS',
    'SCHOOL_TYPE',
    'EDUCATION_MAJOR',
    'EXPERIENCE_LENGTH',
    'FULL_TIME_PART_TIME',
    'EXP_JOB_CLASS_TITLE',
    'EXP_JOB_CLASS_ALT_RESP',
    'EXP_JOB_CLASS_FUNCTION',
    'COURSE_COUNT',
    'COURSE_LENGTH',
    'COURSE_SUBJECT',
    'MISC_COURSE_DETAILS',
    'DRIVERS_LICENSE_REQ',
    'DRIV_LIC_TYPE',
    'ADDTL_LIC',
    'EXAM_TYPE',
    'ENTRY_SALARY_GEN',
    'ENTRY_SALARY_DWP',
    'OPEN_DATE',
]

# Every field starts out with None as a placeholder value.
field_name_dict = dict.fromkeys(FIELD_NAMES)

# Sanity check: enforce the expected field count instead of only printing it.
# A duplicated name would collapse into a single key and trip this assertion.
assert len(field_name_dict) == 25, f"expected 25 fields, got {len(field_name_dict)}"
print(len(field_name_dict)) # should be 25

25
