## Which jobs grew in pay after adjusting for inflation. 

In [1]:
import sqlite3
import os
import pandas as pd

In [2]:
occupation_file_name = os.path.join('Occupation_Data.csv')

In [3]:
occupation = pd.read_csv(occupation_file_name, index_col=None)

In [4]:
occupation.head()

Unnamed: 0,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,YEAR
0,00-0000,All Occupations,total,652100.0,1.3,21.99,45740.0,1.5,9.18,11.9,17.47,26.74,38.71,19100.0,24760.0,36350.0,55610.0,80510.0,2018
1,11-0000,Management Occupations,major,31180.0,2.6,49.67,103320.0,1.5,19.9,28.6,42.37,61.46,87.13,41400.0,59490.0,88120.0,127850.0,181230.0,2018
2,11-1011,Chief Executives,detailed,1210.0,6.1,73.53,152940.0,2.8,28.36,41.72,61.92,95.4,,58980.0,86770.0,128800.0,198420.0,,2018
3,11-1021,General and Operations Managers,detailed,10360.0,4.4,48.99,101900.0,2.1,17.27,24.89,38.13,61.21,96.22,35920.0,51770.0,79310.0,127320.0,200130.0,2018
4,11-2011,Advertising and Promotions Managers,detailed,60.0,16.8,53.04,110320.0,10.4,18.64,26.74,44.61,78.86,98.1,38770.0,55630.0,92780.0,164030.0,204050.0,2018


In economics the term real dollars is the value of currency after being adjusted for inflation. I will turn everything into 2018 dollars and to compute the difference I used an inflation calculator which can be found in the following website 
https://www.usinflationcalculator.com/

Adjusted for inflation, 1.00 in 2003 is equal to 1.36 in 2018, with a 36.5% cumulative rate of inflation.

Adjusted for inflation, 1.00 in 2008 is equal to 1.17 in 2018, with a 16.6% cumulative rate of inflation.

Adjusted for inflation, 1.00 in 2013 is equal to 1.08 in 2018, with a 7.8% cumulative rate of inflation.

The following sections will need to be adjusted based on their year. 
H_PCT10, H_PCT25, H_MEDIAN, H_CT75, H_PCT90, A_PCT10, A_PCT25, A_MEDIAN, A_PCT75, A_PCT90

In [5]:
year2018 = occupation['YEAR'] == 2018
year2013 = occupation['YEAR'] == 2013
year2008 = occupation['YEAR'] == 2008
year2003 = occupation['YEAR'] == 2003

## Converting to 2018 dollars

In [6]:
occupation.columns[5:18]

for x in occupation.columns[5:18]:
    occupation.loc[(occupation.YEAR == 2013), x] *=1.08
    occupation.loc[(occupation.YEAR == 2008), x] *=1.17
    occupation.loc[(occupation.YEAR == 2003), x] *=1.36


## Checking to see how many occupations existed across all four sample years

In [7]:
occ_count=occupation.groupby('OCC_CODE')
occ_count_all = occ_count.size()
occ_count_all.value_counts()

4    384
2    139
3    128
1     80
5      5
dtype: int64

## Found an error in the data I need to find out where it is and remove it. (Skip to next section if you wish to skip this)
It is odd that there is a 5 in the options I need to figure out which year(s) have duplicate occupations.

In [8]:
dupe_year=occupation.groupby(['OCC_CODE', 'YEAR']).size() > 1
print(dupe_year.value_counts())
dupe_year_list=dupe_year[dupe_year == True]
dupe_year_list

False    2293
True        5
dtype: int64


OCC_CODE  YEAR
53-7061   2003    True
53-7062   2003    True
53-7063   2003    True
53-7064   2003    True
53-7081   2003    True
dtype: bool

2003 Seems to have produced all of the duplicates and here is the list of duplicate job codes 

I am going to have to remove the duplicate data from the dataset

In [9]:
dupes_year = occupation[occupation.duplicated(('OCC_CODE', 'YEAR'), keep=False)]

In [10]:
dupes_year.sort_values('OCC_CODE')

Unnamed: 0,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,YEAR
2292,53-7061,Cleaners of vehicles and equipment,,1260.0,15.7,13.1784,27404.0,6.664,8.1192,9.3432,11.5736,15.3408,19.6656,16891.2,19434.4,24085.6,31905.6,40908.8,2003
2298,53-7061,Cleaners of vehicles and equipment,,1320.0,12.4,13.9264,28954.4,10.336,8.0784,9.2344,11.5464,16.116,24.1672,16796.0,19203.2,24017.6,33524.0,50252.0,2003
2293,53-7062,"Laborers and freight, stock, and material move...",,12700.0,5.6,15.3,31824.0,6.12,9.5608,11.22,13.4912,16.4016,26.5336,19883.2,23337.6,28070.4,34122.4,55188.8,2003
2299,53-7062,"Laborers and freight, stock, and material move...",,12600.0,9.3,15.6672,32572.0,5.304,9.5336,11.4512,14.008,17.952,26.18,19842.4,23813.6,29131.2,37345.6,54454.4,2003
2294,53-7063,Machine feeders and offbearers,,1650.0,29.6,17.0952,35550.4,5.848,11.0432,13.0016,14.9736,20.4136,24.3304,22956.8,27036.8,31130.4,42445.6,50605.6,2003
2300,53-7063,Machine feeders and offbearers,,1410.0,29.9,17.068,35509.6,6.936,10.7848,12.6888,14.8512,20.4272,24.2624,22426.4,26397.6,30885.6,42486.4,50456.0,2003
2295,53-7064,"Packers and packagers, hand",,4970.0,11.3,11.9952,24956.0,2.312,8.2008,9.4248,11.3832,13.9264,17.5168,17054.4,19611.2,23691.2,28968.0,36420.8,2003
2301,53-7064,"Packers and packagers, hand",,4650.0,9.0,12.1448,25255.2,3.128,8.228,9.3432,11.3016,14.1984,18.2784,17095.2,19434.4,23514.4,29525.6,38025.6,2003
2296,53-7081,Refuse and recyclable material collectors,,400.0,29.9,16.1976,33687.2,6.936,10.4992,12.852,15.912,18.8088,22.6032,21855.2,26724.0,33088.8,39127.2,47001.6,2003
2302,53-7081,Refuse and recyclable material collectors,,420.0,29.0,15.7488,32748.8,8.024,10.7032,12.2672,14.5384,18.4824,22.1816,22249.6,25513.6,30232.8,38460.8,46131.2,2003


Much to my suprise the data is different. I suppose data gathering methods were not refined in 2003. To correct this I will merge the data and take the average. Once I get the averages I will add this to a new data base and remove all other duplicates. 

In [11]:
#Had to change the as_index to False so it would create number indexs rather than use the OOC_code
combine = dupes_year.groupby(['OCC_CODE', 'OCC_TITLE', 'YEAR'], as_index=False).mean()
combine = combine.copy()

In [12]:
combine['OCC_GROUP']=pd.np.NaN

In [13]:
new_occupation= occupation.append(combine, ignore_index=True, sort=False)

In [14]:
new_occupation.drop_duplicates(('OCC_CODE', 'YEAR'), keep='last', inplace=True)

In [15]:
occ_count=new_occupation.groupby('OCC_CODE')
occ_count_all = occ_count.size()
occ_count_4=occ_count.size() == 4
occ_count_3=occ_count.size() == 3
occ_count_2=occ_count.size() == 2
occ_count_1=occ_count.size() == 1

In [16]:
print(occ_count_all.value_counts())

4    389
2    139
3    128
1     80
dtype: int64


The problem has been solved. I know this becasue the 5 group no longer exists and the 4 group increased in size by five (the same number as the old 5 group). 
There are 736 unique occupations on this list of those occupations 389 occupations existed all four observed years, 128 existed for three observed years, 139 exitsed for two observed years, and 80 existed for only one observed year. 

In [17]:
total_instance=new_occupation.groupby(['OCC_CODE'])
total_instance=total_instance.size()
dict4={}
dict3={}
dict2={}
dict1={}
for y, x in total_instance.items():
    if x == 4:
        dict4.update({y:x})
    elif x == 3:
        dict3.update({y:x})
    elif x == 2:
        dict2.update({y:x})
    else:
        dict1.update({y:x})   

In [18]:
dict4

{'00-0000': 4,
 '11-0000': 4,
 '11-1011': 4,
 '11-1021': 4,
 '11-2011': 4,
 '11-2021': 4,
 '11-2022': 4,
 '11-2031': 4,
 '11-3011': 4,
 '11-3021': 4,
 '11-3051': 4,
 '11-3061': 4,
 '11-3071': 4,
 '11-9021': 4,
 '11-9031': 4,
 '11-9032': 4,
 '11-9041': 4,
 '11-9051': 4,
 '11-9061': 4,
 '11-9081': 4,
 '11-9111': 4,
 '11-9121': 4,
 '11-9131': 4,
 '11-9141': 4,
 '11-9151': 4,
 '13-0000': 4,
 '13-1031': 4,
 '13-1041': 4,
 '13-1051': 4,
 '13-1071': 4,
 '13-1111': 4,
 '13-1121': 4,
 '13-2011': 4,
 '13-2021': 4,
 '13-2031': 4,
 '13-2041': 4,
 '13-2051': 4,
 '13-2052': 4,
 '13-2053': 4,
 '13-2061': 4,
 '13-2071': 4,
 '13-2072': 4,
 '13-2081': 4,
 '13-2082': 4,
 '15-0000': 4,
 '15-2031': 4,
 '15-2041': 4,
 '17-0000': 4,
 '17-1011': 4,
 '17-1012': 4,
 '17-1021': 4,
 '17-1022': 4,
 '17-2041': 4,
 '17-2051': 4,
 '17-2061': 4,
 '17-2071': 4,
 '17-2072': 4,
 '17-2081': 4,
 '17-2111': 4,
 '17-2141': 4,
 '17-3011': 4,
 '17-3012': 4,
 '17-3013': 4,
 '17-3023': 4,
 '17-3025': 4,
 '17-3026': 4,
 '17-3027'

In [19]:
check=(new_occupation.groupby(['OCC_CODE','YEAR'])).size()

In [20]:
occ_yeardict={}
for y,x in check.items():
    (a,b)=y
    if a in occ_yeardict:
        t=occ_yeardict[a]
        t = t + (b,)
        occ_yeardict.update({a:t})
    else:
        occ_yeardict.update({a:(b,)})
    

In [21]:
occ_yeardict

{'00-0000': (2003, 2008, 2013, 2018),
 '11-0000': (2003, 2008, 2013, 2018),
 '11-1011': (2003, 2008, 2013, 2018),
 '11-1021': (2003, 2008, 2013, 2018),
 '11-1031': (2003, 2008, 2013),
 '11-2011': (2003, 2008, 2013, 2018),
 '11-2021': (2003, 2008, 2013, 2018),
 '11-2022': (2003, 2008, 2013, 2018),
 '11-2031': (2003, 2008, 2013, 2018),
 '11-3011': (2003, 2008, 2013, 2018),
 '11-3021': (2003, 2008, 2013, 2018),
 '11-3031': (2008, 2013, 2018),
 '11-3040': (2003,),
 '11-3041': (2008,),
 '11-3042': (2008,),
 '11-3049': (2008,),
 '11-3051': (2003, 2008, 2013, 2018),
 '11-3061': (2003, 2008, 2013, 2018),
 '11-3071': (2003, 2008, 2013, 2018),
 '11-3111': (2013, 2018),
 '11-3121': (2013, 2018),
 '11-3131': (2013, 2018),
 '11-9021': (2003, 2008, 2013, 2018),
 '11-9031': (2003, 2008, 2013, 2018),
 '11-9032': (2003, 2008, 2013, 2018),
 '11-9033': (2008, 2013, 2018),
 '11-9039': (2008, 2013, 2018),
 '11-9041': (2003, 2008, 2013, 2018),
 '11-9051': (2003, 2008, 2013, 2018),
 '11-9061': (2003, 2008, 2

In [22]:
#for each year look at the number of times mentioned
gapdict={}
for y,x in occ_yeardict.items():
    a=pd.np.diff(x)
    if (5 not in a) and (len(a) != 0):
        a = a[0]
        gapdict.update({y:(a,x)})
gapdict

{'13-1011': (15, (2003, 2018)),
 '19-1012': (10, (2008, 2018)),
 '19-1029': (10, (2008, 2018)),
 '19-2021': (10, (2008, 2018)),
 '21-1029': (10, (2008, 2018)),
 '25-1054': (10, (2003, 2013)),
 '25-2032': (10, (2008, 2018)),
 '25-9021': (10, (2008, 2018)),
 '27-1013': (10, (2008, 2018)),
 '27-1014': (15, (2003, 2018)),
 '27-4014': (10, (2003, 2013)),
 '29-2053': (15, (2003, 2018)),
 '39-5011': (15, (2003, 2018)),
 '41-9021': (15, (2003, 2018)),
 '41-9091': (10, (2003, 2013)),
 '45-2011': (10, (2008, 2018)),
 '45-2041': (10, (2008, 2018)),
 '47-2011': (10, (2008, 2018)),
 '49-2021': (10, (2003, 2013)),
 '51-4012': (10, (2008, 2018)),
 '51-4051': (10, (2008, 2018)),
 '51-6051': (10, (2003, 2013)),
 '51-6062': (15, (2003, 2018)),
 '51-8012': (10, (2003, 2013)),
 '51-8013': (15, (2003, 2018)),
 '51-9071': (10, (2003, 2013)),
 '51-9082': (10, (2008, 2018)),
 '53-1011': (10, (2008, 2018)),
 '53-7121': (10, (2003, 2013))}

In [23]:
len(gapdict)

29

In [31]:
check= new_occupation["OCC_CODE"] == '13-1011'
new_occupation[check]

Unnamed: 0,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,YEAR
33,13-1011,"Agents and Business Managers of Artists, Perfo...",detailed,30.0,46.2,26.21,54520.0,23.5,11.3,14.18,19.64,38.79,45.94,23510.0,29490.0,40860.0,80670.0,95560.0,2018
1825,13-1011,"Agents and business managers of artists, perfo...",,40.0,41.8,18.768,39032.0,20.808,12.5528,13.4232,14.8784,22.8072,28.9544,26112.0,27920.8,30940.0,47436.8,60234.4,2003


In [25]:
#new_occupation['YEAR'] = pd.to_datetime(new_occupation['YEAR'], format = "%Y")

In [26]:
new_occupation.to_sql("occ_table",sqlite3.connect('occ.db'), if_exists ="replace")

In [27]:
con = sqlite3.connect("occ.db")

In [28]:
occupation2 = pd.read_sql_query("SELECT * FROM occ_table WHERE YEAR=2003 AND OCC_CODE IN ('53-7061', '53-7062', '53-7063', '53-7064', '53-7081') ORDER BY OCC_CODE ASC", con)
occupation2

Unnamed: 0,index,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,YEAR
0,2303,53-7061,Cleaners of vehicles and equipment,,1290.0,14.05,13.5524,28179.2,8.5,8.0988,9.2888,11.56,15.7284,21.9164,16843.6,19318.8,24051.6,32714.8,45580.4,2003
1,2304,53-7062,"Laborers and freight, stock, and material move...",,12650.0,7.45,15.4836,32198.0,5.712,9.5472,11.3356,13.7496,17.1768,26.3568,19862.8,23575.6,28600.8,35734.0,54821.6,2003
2,2305,53-7063,Machine feeders and offbearers,,1530.0,29.75,17.0816,35530.0,6.392,10.914,12.8452,14.9124,20.4204,24.2964,22691.6,26717.2,31008.0,42466.0,50530.8,2003
3,2306,53-7064,"Packers and packagers, hand",,4810.0,10.15,12.07,25105.6,2.72,8.2144,9.384,11.3424,14.0624,17.8976,17074.8,19522.8,23602.8,29246.8,37223.2,2003
4,2307,53-7081,Refuse and recyclable material collectors,,410.0,29.45,15.9732,33218.0,7.48,10.6012,12.5596,15.2252,18.6456,22.3924,22052.4,26118.8,31660.8,38794.0,46566.4,2003
