## Imports Dataset ##

In [None]:
library(plyr)
library(dplyr)
library(zoo)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(tidyverse)
library(leaps)
library(car)

# Continuous columns: FAMSIZE, AGE, MARRNO, UHRSWORK, INCWAGE, OCCSCORE.

# Read both extracts as comma-separated files with a header row. The first
# file identifies the state by the value "8", the second by the label
# "Colorado".
census <- read.table("usa_2018.csv", header = TRUE, sep = ",")
census_clean <- read.table("cleaned_acs_2018.csv", header = TRUE, sep = ",")

# Restrict each extract to its Colorado rows.
CO <- census %>% filter(STATEFIP == "8")
CO_Cleaned <- census_clean %>% filter(STATEFIP == "Colorado")

# Preview the first rows of each subset.
head(CO)
head(CO_Cleaned)

## Cleaning Data Set ##

In [16]:
# Clean the Colorado subset:
#   1. take the labelled MARST/CLASSWKR/DIFFSENS/SEX columns from CO_Cleaned,
#   2. collapse the RACE and EDUC codes into a few readable categories,
#   3. treat YRSUSA1 == 0 as missing,
#   4. drop the columns that are not used downstream.

# The labelled columns are copied by position. Without this guard a
# CO_Cleaned whose row count merely divides nrow(CO) would be recycled
# silently instead of failing.
# NOTE(review): this also assumes both extracts list the rows in the same
# order -- confirm against how the cleaned file was produced.
stopifnot(nrow(CO) == nrow(CO_Cleaned))

for (col in c("MARST", "CLASSWKR", "DIFFSENS", "SEX")) {
  CO[[col]] <- CO_Cleaned[[col]]
}

# Map codes to labels through a named lookup vector. Codes without an entry
# keep their original value (as character) and NA stays NA.
recode_codes <- function(codes, labels) {
  key <- as.character(codes)
  known <- key %in% names(labels)
  key[known] <- unname(labels[key[known]])
  key
}

race_labels <- c(
  "1" = "White",
  "2" = "Black/African American",
  "3" = "American Indian or Alaska Native",
  "4" = "Asian American or Pacific Islander",
  "5" = "Asian American or Pacific Islander",
  "6" = "Asian American or Pacific Islander",
  "7" = "Other Race",
  "8" = "Other Race",
  "9" = "Other Race"
)
CO$RACE <- recode_codes(CO$RACE, race_labels)

# Codes 0-6 -> "High School or Less", codes 7-11 -> "College or More".
educ_labels <- setNames(
  rep(c("High School or Less", "College or More"), times = c(7, 5)),
  0:11
)
CO$EDUC <- recode_codes(CO$EDUC, educ_labels)

# %in% never yields NA, so rows that are already missing are left untouched.
CO$YRSUSA1[CO$YRSUSA1 %in% 0] <- NA

co <- select(CO, -c(YEAR, STATEFIP, RACED, CLASSWKRD, EDUCD, POVERTY))
head(co)



ERROR: Error in hist.default(co$AGE, main = "Age", xlab = "age"): 'x' must be numeric


## Exploratory Analysis ##

In [None]:
# Exploratory Data Analysis

# Frequency bar chart (one bar per distinct value) for a single column.
#
# data   : data frame holding the column
# column : column name as a string
# label  : x-axis label
#
# geom_bar() is the direct form of geom_histogram(stat = "count"): same
# drawing, without the "ignoring unknown parameters" warning. The colour is
# passed as a constant `fill`; a manual fill scale does nothing when no fill
# aesthetic is mapped. `.data[[column]]` replaces `co$...` inside aes(), which
# ggplot2 discourages because it bypasses the plot's data.
# NOTE(review): counts per distinct value are kept for every column,
# including wide-ranging ones such as INCWAGE; switch those to a binned
# geom_histogram() only after confirming the columns are numeric.
count_plot <- function(data, column, label) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_bar(color = "grey3", fill = "#69b3a2", alpha = 0.6) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5),
      legend.position = "none"
    ) +
    xlab(label) +
    ylab("Frequency") +
    ggtitle("")
}

age_hist <- count_plot(co, "AGE", "Age")
famsize_hist <- count_plot(co, "FAMSIZE", "Family Size")
hours_hist <- count_plot(co, "UHRSWORK", "Hours Worked per Week")
edu_hist <- count_plot(co, "EDUC", "Educational Attainment")
inc_hist <- count_plot(co, "INCWAGE", "Total Individual Income/Wage")
occ_hist <- count_plot(co, "OCCSCORE", "Occupational Score")

grid.arrange(
  inc_hist, occ_hist, age_hist, famsize_hist, hours_hist, edu_hist,
  nrow = 3
)

## AIC and BIC Graphs ##

In [None]:


# corr = cor(model.matrix(lmod_co)[,-1])

# Best-subset search for INCWAGE against every other column of `co`.
regCO <- regsubsets(INCWAGE ~ ., data = co, really.big = TRUE)
rs <- summary(regCO)
rs$which

# Sample size behind rs$rss. The formula uses every column of `co`, and under
# R's default na.action the model frame drops rows with any missing value, so
# count complete rows rather than all rows (identical when nothing is missing).
n <- sum(complete.cases(co))

# One entry per subset size actually returned, instead of assuming eight.
n_pred <- seq_along(rs$rss)

# AIC = 2 * (p + 1) + n * log(RSS / n), the +1 accounting for the intercept.
AIC <- 2 * (n_pred + 1) + n * log(rs$rss / n)
plot(n_pred, AIC,
  xlab = "number of predictors", ylab = "AIC", main = "AIC Graph"
)

# Best predictors in terms of AIC
# 8 predictors - SEX, AGE, MARRNO, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS
# 7 predictors - SEX, AGE, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS
# 6 predictors - AGE, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS

# BIC uses the same fit term with a log(n) penalty per parameter.
BIC <- log(n) * (n_pred + 1) + n * log(rs$rss / n)
plot(n_pred, BIC,
  xlab = "number of predictors", ylab = "BIC", main = "BIC"
)

# Best predictors in terms of BIC
# 8 predictors - SEX, AGE, MARRNO, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS
# 7 predictors - SEX, AGE, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS
# 6 predictors - AGE, EDUC, CLASSWKR, UHRSWORK, OCCSCORE, DIFFSENS



## Linear Model with Age only as predictor ##

In [None]:
# FAMSIZE, AGE, MARRNO, UHRSWORK, INCWAGE, OCCSCORE are continuous

# Simple regression of income on age.
lm_cont <- lm(INCWAGE ~ AGE, data = co)

# Income against age, with a smoothed trend.
# NOTE(review): geom_abline(intercept = 0, slope = 1) draws the line y = x,
# not the lm_cont fit -- confirm that this reference line is intended.
p1 <- ggplot(co, aes(x = AGE, y = INCWAGE)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, col = "#CFB87C") +
  geom_abline(intercept = 0, slope = 1) +
  xlab("AGE") +
  ylab("Income") +
  theme_bw()

# Income against usual hours worked. The x-axis label now names the variable
# that is actually plotted (it previously read "AGE").
p2 <- ggplot(co, aes(x = UHRSWORK, y = INCWAGE)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, col = "#CFB87C") +
  geom_abline(intercept = 0, slope = 1, col = "grey") +
  xlab("Hours Worked per Week") +
  ylab("Income") +
  theme_bw()

grid.arrange(p1, p2, nrow = 1)

## Linear models testing different number of predictors ##

In [None]:
# Three nested candidate models for INCWAGE, from largest to smallest.
lm_1 <- lm(
  INCWAGE ~ SEX + AGE + MARRNO + EDUC + CLASSWKR + UHRSWORK + OCCSCORE +
    DIFFSENS + RACE,
  data = co
)
lm_2 <- lm(
  INCWAGE ~ SEX + AGE + EDUC + CLASSWKR + UHRSWORK + OCCSCORE + DIFFSENS,
  data = co
)
lm_3 <- lm(
  INCWAGE ~ AGE + EDUC + CLASSWKR + UHRSWORK + OCCSCORE + DIFFSENS,
  data = co
)

# F-tests between successive nested models.
anova(lm_1, lm_2)
anova(lm_2, lm_3)

# Fitted values, residuals and observed response for one fitted model.
# The response comes from the model frame, not from co$INCWAGE, so the three
# columns stay aligned when lm() drops rows with missing predictors.
model_diagnostics <- function(fit) {
  data.frame(
    yhat = fitted(fit),
    r = resid(fit),
    y = model.response(model.frame(fit))
  )
}

lm_diag1 <- model_diagnostics(lm_1)
lm_diag2 <- model_diagnostics(lm_2)
lm_diag3 <- model_diagnostics(lm_3)

# Set both dimensions (the height was previously never set because the width
# option was given twice).
options(repr.plot.width = 6, repr.plot.height = 6)

# Observed (x) against fitted (y) with a smoothed trend.
plot_observed_fitted <- function(diag_df, title) {
  ggplot(diag_df, aes(x = y, y = yhat)) +
    geom_point(alpha = 0.5, col = "deepskyblue") +
    geom_smooth(se = FALSE, col = "#CFB87C") +
    xlab("Observed") +
    ylab("Fitted") +
    ggtitle(title) +
    theme_bw()
}

# Residuals against fitted values with a horizontal zero reference line.
# geom_abline(intercept = 0) falls back to slope = 1 and would draw y = x, so
# geom_hline() is used for the reference instead.
plot_residuals <- function(diag_df, title) {
  ggplot(diag_df, aes(x = yhat, y = r)) +
    geom_point(alpha = 0.5) +
    geom_smooth(se = FALSE, col = "#CFB87C") +
    geom_hline(yintercept = 0, col = "red") +
    xlab("Fitted Values") +
    ggtitle(title) +
    ylab("Residuals")
}

lm1 <- plot_observed_fitted(lm_diag1, "Linear Model 1")
lm2 <- plot_observed_fitted(lm_diag2, "Linear Model 2")
lm3 <- plot_observed_fitted(lm_diag3, "Linear Model 3")

res1 <- plot_residuals(lm_diag1, "Linear Model 1")
res2 <- plot_residuals(lm_diag2, "Linear Model 2")
res3 <- plot_residuals(lm_diag3, "Linear Model 3")

grid.arrange(lm1, lm2, lm3, nrow = 2)
grid.arrange(res1, res2, res3, nrow = 2)

## Collinearity Checks ##

In [None]:
# Collinearity diagnostics for the fitted model `lm_co`.
# NOTE(review): `lm_co` is not defined in the visible part of this notebook
# (the models fitted above are lm_cont, lm_1, lm_2 and lm_3) -- confirm which
# model is meant before running this cell.

# Variance inflation factors for the model terms.
vif(lm_co)
# Condition number estimate for the fitted model.
kappa(lm_co)
# Pairwise correlations between the design-matrix columns; the first column
# (presumably the intercept) is dropped.
cor(model.matrix(lm_co)[,-1])

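$$\begin{aligned}
\text{Classwkr} &\Rightarrow \text{Age} \\
\text{Age} &\Rightarrow \text{Married} \\
\text{Work wages} &\Rightarrow \text{Self Employed} \\
\text{Uhrswork} &\Rightarrow \text{Work wages} \\
\text{Occscore} &\Rightarrow \text{Work wages} \\
\text{Age} &\Rightarrow \text{Uhrswork} \\
\text{Occscore} &\Rightarrow \text{Uhrswork} \\
\text{Race} &\Rightarrow \text{Race}
\end{aligned}$$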