In [1]:
import numpy as np
import pandas as pd

## Read the `Wage` File

In [2]:
wage_df = pd.read_csv("wage.csv")

In [3]:
wage_df.head(3)

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177


In [4]:
wage_df.dtypes #we can see that everything except for year age (log) wage are categorical

year            int64
age             int64
maritl         object
race           object
education      object
region         object
jobclass       object
health         object
health_ins     object
logwage       float64
wage          float64
dtype: object

### Check unique values in the `jobclass` column

In [5]:
wage_df.jobclass.unique() #only two -- easy to replace

array(['1. Industrial', '2. Information'], dtype=object)

In [6]:
wage_df["job_information"] =  (wage_df["jobclass"] == "2. Information").astype(int) #so that =1 means information

In [7]:
wage_df.drop(['jobclass', 'logwage', 'region', 'year'], axis=1, inplace=True) #lets delete jobclass and logwage now

### Check unique values in the `health` column

In [8]:
wage_df.health.unique() #only two -- easy to replace

array(['1. <=Good', '2. >=Very Good'], dtype=object)

In [9]:
wage_df["health"] =  (wage_df["health"] == "2. >=Very Good").astype(int) #so that =1 means very good health

### Apply the same for `health_ins`

In [10]:
wage_df.health_ins.unique() #only two -- easy to replace

array(['2. No', '1. Yes'], dtype=object)

In [11]:
wage_df["health_ins"] =  (wage_df["health_ins"] == "1. Yes").astype(int) #so that =1 means has a health insurance

### Check unique values in the `maritl` column

In [12]:
wage_df.maritl.unique() #we cannot make this 1-2-3-4-5 as this is nominal

array(['1. Never Married', '2. Married', '4. Divorced', '3. Widowed',
       '5. Separated'], dtype=object)

In [13]:
one_hot = pd.get_dummies(wage_df.maritl, prefix='marriage')

In [14]:
wage_df = wage_df.join(one_hot)

In [15]:
wage_df.drop(['maritl', 'marriage_1. Never Married'], axis=1, inplace=True)

In [16]:
wage_df.columns = [*wage_df.columns[:-4], 'marriage_yes',\
                   'marriage_widowed', 'marriage_divorced', 'marriage_separated'] #we drop one -- why?

In [17]:
wage_df.columns[:-4] #it looks like we can take these as ordinal categories

Index(['age', 'race', 'education', 'health', 'health_ins', 'wage',
       'job_information'],
      dtype='object')

### Decide what to do for `education`

In [18]:
wage_df.education.unique() #it looks like we can take these as ordinal categories

array(['1. < HS Grad', '4. College Grad', '3. Some College', '2. HS Grad',
       '5. Advanced Degree'], dtype=object)

In [19]:
wage_df.education = (wage_df.education.astype(str).str[0]).astype(int)

### Question to deliver on encoding categorical variables
- Encode the `race` variable by using numerical variable(s) and drop `race`.
- Now that all the variables in our dataframe is numerical, apply a linear regression model to explain the `wage` with the other variables. Report the coefficients.
- Interpret the coefficient of the `education` varaible.

In [20]:
wage_df.race.unique() #it looks like we can take these as nominal categories

array(['1. White', '3. Asian', '4. Other', '2. Black'], dtype=object)

In [21]:
one_hot = pd.get_dummies(wage_df.race, prefix='race')
one_hot

Unnamed: 0,race_1. White,race_2. Black,race_3. Asian,race_4. Other
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,False,False,True,False
4,True,False,False,False
...,...,...,...,...
2995,True,False,False,False
2996,True,False,False,False
2997,False,True,False,False
2998,True,False,False,False


In [22]:
wage_df = wage_df.join(one_hot)

In [23]:
wage_df = wage_df.drop('race',axis=1)

In [24]:
wage_df.columns = [*wage_df.columns[:-4], 'race_White',\
                   'race_Black', 'race_Asian', 'race_Other'] 

In [25]:
wage_df.head()

Unnamed: 0,age,education,health,health_ins,wage,job_information,marriage_yes,marriage_widowed,marriage_divorced,marriage_separated,race_White,race_Black,race_Asian,race_Other
0,18,1,0,0,75.043154,0,False,False,False,False,True,False,False,False
1,24,4,1,0,70.47602,1,False,False,False,False,True,False,False,False
2,45,3,0,1,130.982177,0,True,False,False,False,True,False,False,False
3,43,4,1,1,154.685293,1,True,False,False,False,False,False,True,False
4,50,2,0,1,75.043154,1,False,False,True,False,True,False,False,False


#### First encode the variable. Then fit a linear model.

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
Y = wage_df['wage']

In [28]:
X = wage_df.drop('wage',axis=1)

In [29]:
X.columns

Index(['age', 'education', 'health', 'health_ins', 'job_information',
       'marriage_yes', 'marriage_widowed', 'marriage_divorced',
       'marriage_separated', 'race_White', 'race_Black', 'race_Asian',
       'race_Other'],
      dtype='object')

In [30]:
reg = LinearRegression().fit(X,Y) #hint: to get coefficients of the fitted model use "reg.coef_"

In [33]:
reg.coef_

array([ 0.3037476 , 13.34408106,  6.60422963, 16.99604481,  3.78564598,
       17.17994727,  0.97270744,  3.49189903, 12.13254899,  2.84119404,
       -2.30111988,  1.3530328 , -1.89310696])

Coefficients :
array([ 0.3037476 , 13.34408106,  6.60422963, 16.99604481,  3.78564598,
       17.17994727,  0.97270744,  3.49189903, 12.13254899,  2.84119404,
       -2.30111988,  1.3530328 , -1.89310696])


In [34]:
col1 = X.columns
col2 = reg.coef_
# Create a DataFrame
df = pd.DataFrame({'col1': col1, 'col2': col2})

# Print the DataFrame
print(df)

                  col1       col2
0                  age   0.303748
1            education  13.344081
2               health   6.604230
3           health_ins  16.996045
4      job_information   3.785646
5         marriage_yes  17.179947
6     marriage_widowed   0.972707
7    marriage_divorced   3.491899
8   marriage_separated  12.132549
9           race_White   2.841194
10          race_Black  -2.301120
11          race_Asian   1.353033
12          race_Other  -1.893107


A positive coefficient means that as the corresponding feature increases, the predicted target variable also tends to increase. The larger the coefficient, the stronger the effect of that feature on the target variable.
Education with a coefficient of 13.34 has the highest positive effect on wage, meaning higher the level of education, higher the wage