In [1]:
import pandas as pd
import numpy as np

## Import Taxonomies

In [2]:
def read_taxonomies(year):
    """Load one year's occupation taxonomy as a (code, title) table.

    Reads ``csv_files/Taxonomies/<year>_Occupations.csv``, relabels its three
    columns by position and discards the description column.

    Parameters
    ----------
    year : int or str
        Year that prefixes the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``onetsoccode`` and ``onet_title``.
    """
    filename = str(year) + '_Occupations.csv'
    taxonomy = pd.read_csv('csv_files/Taxonomies/' + filename)
    # Names are assigned positionally, so the file must have exactly three columns.
    taxonomy.columns = ['onetsoccode', 'onet_title', 'description']
    return taxonomy.drop(columns='description')

In [3]:
# Load the occupation taxonomy for each of the three years used below.
tax_2006, tax_2009, tax_2010 = (
    read_taxonomies(year) for year in (2006, 2009, 2010)
)

In [4]:
def read_crosswalks(year1, year2):
    """Load the ``year1``-to-``year2`` crosswalk, keeping only its last two columns.

    Reads ``csv_files/Crosswalks/<year1>_to_<year2>_Crosswalk.csv``, relabels its
    four columns by position and discards the first two.

    Parameters
    ----------
    year1, year2 : int or str
        The two years that appear in the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``onetsoccode`` and ``onet_title`` (presumably the ``year2`` side
        of the crosswalk; confirm against the CSV layout).
    """
    filename = str(year1) + '_to_' + str(year2) + '_Crosswalk.csv'
    crosswalk = pd.read_csv('csv_files/Crosswalks/' + filename)
    # Names are assigned positionally, so the file must have exactly four columns.
    crosswalk.columns = ['temp_code', 'temp_title', 'onetsoccode', 'onet_title']
    return crosswalk.drop(columns=['temp_code', 'temp_title'])

In [5]:
# Load the three consecutive crosswalks: 2000->2006, 2006->2009 and 2009->2010.
cross_2006, cross_2009, cross_2010 = (
    read_crosswalks(from_year, to_year)
    for from_year, to_year in ((2000, 2006), (2006, 2009), (2009, 2010))
)

## Identify New Work

In [6]:
def get_new_work(df1, df2):
    """Return the rows of ``df1`` whose ``onetsoccode`` has no match in ``df2``.

    A left merge with ``indicator=True`` marks every row as matched or not, so
    "new" means "the code is absent from ``df2``". A matched row whose title is
    missing in ``df2`` is therefore not reported as new, and missing values in
    ``df1`` are left as they are.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Table with ``onetsoccode`` and ``onet_title`` columns.
    df2 : pandas.DataFrame
        Table with ``onetsoccode`` and ``onet_title`` columns.

    Returns
    -------
    pandas.DataFrame
        Unmatched rows with ``df1``'s ``onetsoccode`` and ``onet_title``. The
        index is the row position in the merged frame, so it can differ from
        ``df1``'s index when ``df2`` repeats a code.
    """
    merged = df1.merge(df2, how='left', on='onetsoccode', indicator=True)
    # 'left_only' marks rows whose code was found in df1 but not in df2.
    unmatched = merged.loc[merged['_merge'] == 'left_only']
    return (
        unmatched
        .drop(columns=['onet_title_y', '_merge'])
        .rename(columns={'onet_title_x': 'onet_title'})
    )

In [7]:
# Preview: 2006 taxonomy codes that have no row in the 2000-to-2006 crosswalk.
get_new_work(tax_2006, cross_2006)

Unnamed: 0,onetsoccode,onet_title
97,15-1099.01,Software Quality Assurance Engineers and Testers
98,15-1099.02,Computer Systems Engineers/Architects
99,15-1099.03,Network Designers
100,15-1099.04,Web Developers
101,15-1099.05,Web Administrators
479,33-9099.01,Transportation Security Screeners


In [8]:
# Compare each taxonomy with the crosswalk that leads into it.
new_work1, new_work2, new_work3 = (
    get_new_work(taxonomy, crosswalk)
    for taxonomy, crosswalk in (
        (tax_2006, cross_2006),
        (tax_2009, cross_2009),
        (tax_2010, cross_2010),
    )
)

In [9]:
# Stack the three per-period results into one table. DataFrame.append was
# removed in pandas 2.0, so pd.concat is used instead; ignore_index=True
# renumbers the rows 0..n-1, which is what reset_index(drop=True) did before.
df_new_work = pd.concat([new_work1, new_work2, new_work3], ignore_index=True)
df_new_work.head()

Unnamed: 0,onetsoccode,onet_title
0,15-1099.01,Software Quality Assurance Engineers and Testers
1,15-1099.02,Computer Systems Engineers/Architects
2,15-1099.03,Network Designers
3,15-1099.04,Web Developers
4,15-1099.05,Web Administrators


## Update SOC Codes

In [10]:
# Build a title -> code lookup from the 2009-to-2010 crosswalk. Judging by the
# names assigned here, the kept columns are the 2009 title and the 2010 code.
df = pd.read_csv('csv_files/Crosswalks/2009_to_2010_Crosswalk.csv')
df.columns = ['onetsoccode2009', 'onet_title', 'onetsoccode', 'onetsoctitle2010']
df = (
    df
    .drop(columns=['onetsoccode2009', 'onetsoctitle2010'])
    # One row per title, so the later merge on onet_title cannot multiply rows.
    .drop_duplicates('onet_title')
)
df.head()

Unnamed: 0,onet_title,onetsoccode
0,Chief Executives,11-1011.00
1,Chief Sustainability Officers,11-1011.03
2,General and Operations Managers,11-1021.00
3,Legislators,11-1031.00
4,Advertising and Promotions Managers,11-2011.00


In [11]:
# Left join keeps every new-work row; titles without a lookup entry get the
# ' ' sentinel in place of NaN.
df_final = df_new_work.merge(df, on='onet_title', how='left').fillna(' ')

In [12]:
# onetsoccode_x is the code carried by the new-work row; onetsoccode_y is the
# code found by the title merge, or ' ' when the merge found nothing.
x_values = df_final.onetsoccode_x.to_numpy()
y_values = df_final.onetsoccode_y.to_numpy()
# Take the merged code wherever one exists. np.where builds a new array rather
# than writing into the one backing df_final, which is read-only under pandas
# Copy-on-Write.
x_values = np.where(y_values != ' ', y_values, x_values)

In [13]:
## First update: write the crosswalk-matched codes back into df_final

In [14]:
# Store the updated codes, discard the helper column from the merge and
# restore the plain column name.
df_final = (
    df_final
    .assign(onetsoccode_x=x_values)
    .drop(columns='onetsoccode_y')
    .rename(columns={'onetsoccode_x': 'onetsoccode'})
)
df_final.head()

Unnamed: 0,onetsoccode,onet_title
0,15-1199.01,Software Quality Assurance Engineers and Testers
1,15-1199.02,Computer Systems Engineers/Architects
2,15-1143.00,Network Designers
3,15-1134.00,Web Developers
4,15-1199.03,Web Administrators


In [15]:
## Second update: match titles against the 2010 taxonomy and take its codes

In [16]:
# Look up each title in the 2010 taxonomy. After the merge, onetsoccode_x is the
# current code and onetsoccode_y is the taxonomy's code (NaN when the title is absent).
df_final = df_final.merge(tax_2010, how='left', on='onet_title')
# Replace those NaNs with the ' ' sentinel. The closing parenthesis was missing
# here, which made the cell a SyntaxError.
df_final.fillna(' ', inplace=True)

In [17]:
# onetsoccode_x is the code currently held by the row; onetsoccode_y is the
# code found by the title merge, or ' ' when the merge found nothing.
x_values = df_final.onetsoccode_x.to_numpy()
y_values = df_final.onetsoccode_y.to_numpy()
# Take the merged code wherever one exists. np.where builds a new array rather
# than writing into the one backing df_final, which is read-only under pandas
# Copy-on-Write.
x_values = np.where(y_values != ' ', y_values, x_values)

In [18]:
# Store the updated codes, discard the helper column from the merge and
# restore the plain column name.
df_final = (
    df_final
    .assign(onetsoccode_x=x_values)
    .drop(columns='onetsoccode_y')
    .rename(columns={'onetsoccode_x': 'onetsoccode'})
)
df_final.head()

Unnamed: 0,onetsoccode,onet_title
0,15-1199.01,Software Quality Assurance Engineers and Testers
1,15-1199.02,Computer Systems Engineers/Architects
2,15-1143.00,Network Designers
3,15-1134.00,Web Developers
4,15-1199.03,Web Administrators


In [19]:
# Show the finished table; long frames are abbreviated in the rendered output.
df_final

Unnamed: 0,onetsoccode,onet_title
0,15-1199.01,Software Quality Assurance Engineers and Testers
1,15-1199.02,Computer Systems Engineers/Architects
2,15-1143.00,Network Designers
3,15-1134.00,Web Developers
4,15-1199.03,Web Administrators
...,...,...
168,29-2099.07,Surgical Assistants
169,31-9097.00,Phlebotomists
170,43-3099.00,"Financial Clerks, All Other"
171,49-2021.00,"Radio, Cellular, and Tower Equipment Installer..."


In [20]:
# Save the result in the working directory. to_csv is called with its defaults,
# so the row index is written too, presumably as an unnamed leading column.
# NOTE(review): pass index=False if downstream readers do not expect it (confirm).
df_final.to_csv('new_work.csv')