In [22]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn
import numpy as np
import scipy
import statsmodels.formula.api as smf 
import statsmodels.stats.multicomp as multi
%matplotlib inline

### In this assignment we examine the effect of education level on the relationship
### between ethnicity and income level for adults older than 20 years old.

In [39]:
# Load the NESARC extract, keeping only the six columns this analysis touches.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
data = pd.read_csv(
    '/home/data-sci/Desktop/analysis/course/nesarc_pds.csv',
    usecols=[
        'S1Q10A',  # renamed to 'income' in a later cell
        'S1Q6A',   # renamed to 'edu'
        'S1Q1D5',  # renamed to 'white'
        'S1Q1D3',  # renamed to 'black'
        'S1Q1D2',  # renamed to 'asian'
        'AGE',     # renamed to 'age'
    ],
    low_memory=False,
)

In [40]:
# Peek at the first five rows of the freshly loaded frame.
data.head(5)

Unnamed: 0,AGE,S1Q1D2,S1Q1D3,S1Q1D5,S1Q6A,S1Q10A
0,23,2,2,1,8,17500
1,28,2,2,1,8,11000
2,81,2,2,1,6,6000
3,18,2,2,1,8,27000
4,36,2,1,2,12,42000


In [41]:
# Give the coded survey columns readable names, then discard incomplete rows.
data = (
    data
    .rename(columns={
        'AGE': 'age',
        'S1Q10A': 'income',
        'S1Q6A': 'edu',
        'S1Q1D2': 'asian',
        'S1Q1D3': 'black',
        'S1Q1D5': 'white',
    })
    .dropna()  # removes every row that has a missing value in any column
)


In [42]:
def ETHNICITY(row):
    """Return the ethnicity label for a single respondent row.

    The 'asian', 'white' and 'black' flags are inspected in that order and
    the label of the first flag equal to 1 is returned. When none of the
    inspected flags equals 1 the result is None.
    """
    for label in ('asian', 'white', 'black'):
        if row[label] == 1:
            return label
    return None

# Label every respondent by applying ETHNICITY row-wise (axis=1).
data['ETHNICITY'] = data.apply(ETHNICITY, axis=1)

In [71]:
# Restrict the sample to respondents older than 20.
data = data.loc[data['age'] > 20]
# Analysis frame: rows lacking any of the three values (including rows that
# ETHNICITY left unlabeled, i.e. None) are dropped.
sub1 = data.loc[:, ['edu', 'income', 'ETHNICITY']].dropna()

In [72]:
# First five rows of the analysis frame (raw education codes at this point).
sub1.head(5)

Unnamed: 0,edu,income,ETHNICITY
0,8,17500,white
1,8,11000,white
2,6,6000,white
4,12,42000,black
5,14,500,black


# ANOVA: post hoc pairwise comparisons (Tukey HSD)

In [74]:

# Tukey HSD pairwise comparison of mean income between ethnicity groups,
# computed on the whole analysis frame (no split by education).
mc = multi.MultiComparison(data=sub1['income'], groups=sub1['ETHNICITY'])
res = mc.tukeyhsd(alpha=0.05)
print(res.summary())


   Multiple Comparison of Means - Tukey HSD,FWER=0.05  
group1 group2   meandiff     lower      upper    reject
-------------------------------------------------------
asian  black  -10677.0812 -13711.8024 -7642.3601  True 
asian  white   -3562.3477  -6437.6461 -687.0494   True 
black  white   7114.7335   5854.8246  8374.6423   True 
-------------------------------------------------------


In [75]:
# Mean income per ethnicity group.
sub2 = sub1.loc[:, ['income', 'ETHNICITY']]
sub2.groupby(by='ETHNICITY').mean()

Unnamed: 0_level_0,income
ETHNICITY,Unnamed: 1_level_1
asian,34197.045995
black,23519.964777
white,30634.698255


In [77]:
# Standard deviation of income per ethnicity group (sample std, ddof=1).
sub2.groupby(by='ETHNICITY').std(ddof=1)

Unnamed: 0_level_0,income
ETHNICITY,Unnamed: 1_level_1
asian,44083.513812
black,21186.63777
white,46577.784136


#### For simplicity, we remap the variable edu to just 4 levels:
#### below high school education == 0
#### high school == 1
#### college == 2
#### higher == 3



In [86]:
# Collapse the raw education codes into four ordered levels:
#   0 = below high school, 1 = high school, 2 = college, 3 = higher.
edu_remap = {
    **dict.fromkeys(range(1, 8), 0),    # codes 1-7
    **dict.fromkeys(range(8, 12), 1),   # codes 8-11
    **dict.fromkeys(range(12, 14), 2),  # codes 12-13
    14: 3,
}
# Always map from the untouched raw codes in `data` (aligned on sub1's index)
# rather than from sub1['edu'] itself. Mapping the column onto itself is not
# idempotent: running the cell a second time would turn level 0 into NaN and
# levels 1/2/3 into 0, silently corrupting every later subset.
# Any code that is not a key of edu_remap becomes NaN.
sub1 = sub1.assign(edu=data.loc[sub1.index, 'edu'].map(edu_remap))

In [52]:
# First five rows again, to check the remapped education levels.
sub1.head(5)

Unnamed: 0,edu,income,ETHNICITY
0,1,17500,white
1,1,11000,white
2,0,6000,white
4,2,42000,black
5,3,500,black


In [78]:
# Subset the data: one frame per remapped education level (0 through 3).
sub3 = sub1.loc[sub1['edu'].eq(0)]
sub4 = sub1.loc[sub1['edu'].eq(1)]
sub5 = sub1.loc[sub1['edu'].eq(2)]
sub6 = sub1.loc[sub1['edu'].eq(3)]

### The relationship between ethnicity and income for those who have less than a high school education

In [58]:
# Tukey HSD pairwise comparison of mean income between ethnicity groups,
# restricted to sub3 (remapped education level 0).
mc = multi.MultiComparison(data=sub3['income'], groups=sub3['ETHNICITY'])
res = mc.tukeyhsd(alpha=0.05)
print(res.summary())

  Multiple Comparison of Means - Tukey HSD,FWER=0.05 
group1 group2  meandiff    lower      upper    reject
-----------------------------------------------------
asian  black  -4113.6503 -6935.3738 -1291.9269  True 
asian  white  -1821.2653 -4559.337   916.8064  False 
black  white   2292.385  1346.4277  3238.3424   True 
-----------------------------------------------------


### The relationship between ethnicity and income for those who have a high school education

In [79]:
# Tukey HSD pairwise comparison of mean income between ethnicity groups,
# restricted to sub4 (remapped education level 1).
mc = multi.MultiComparison(data=sub4['income'], groups=sub4['ETHNICITY'])
res = mc.tukeyhsd(alpha=0.05)
print(res.summary())

 Multiple Comparison of Means - Tukey HSD,FWER=0.05 
group1 group2  meandiff    lower      upper   reject
----------------------------------------------------
asian  black  -652.5517  -6292.9681 4987.8646 False 
asian  white  -1731.9122 -6119.221  2655.3967 False 
black  white  -1079.3604 -5402.5923 3243.8714 False 
----------------------------------------------------


### The relationship between ethnicity and income for those who have a college education

In [80]:
# Tukey HSD pairwise comparison of mean income between ethnicity groups,
# restricted to sub5 (remapped education level 2).
mc = multi.MultiComparison(data=sub5['income'], groups=sub5['ETHNICITY'])
res = mc.tukeyhsd(alpha=0.05)
print(res.summary())

 Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2  meandiff   lower      upper   reject
---------------------------------------------------
asian  black  -212.6526 -9905.1788 9479.8736 False 
asian  white  -433.2505 -9264.0785 8397.5776 False 
black  white  -220.5979 -5035.3914 4594.1957 False 
---------------------------------------------------


### The relationship between ethnicity and income for those who have a master's degree or higher education

In [81]:
# Tukey HSD pairwise comparison of mean income between ethnicity groups,
# restricted to sub6 (remapped education level 3).
mc = multi.MultiComparison(data=sub6['income'], groups=sub6['ETHNICITY'])
res = mc.tukeyhsd(alpha=0.05)
print(res.summary())

 Multiple Comparison of Means - Tukey HSD,FWER=0.05 
group1 group2  meandiff   lower      upper    reject
----------------------------------------------------
asian  black  1743.4968 -5374.6715 8861.6652  False 
asian  white  3413.9708 -3408.2416 10236.1832 False 
black  white   1670.474 -1043.5719 4384.5199  False 
----------------------------------------------------
