In [1]:
dataset = read.csv('50_Startups.csv')
head(dataset)

R.D.Spend,Administration,Marketing.Spend,State,Profit
165349.2,136897.8,471784.1,New York,192261.8
162597.7,151377.59,443898.5,California,191792.1
153441.5,101145.55,407934.5,Florida,191050.4
144372.4,118671.85,383199.6,New York,182902.0
142107.3,91391.77,366168.4,Florida,166187.9
131876.9,99814.71,362861.4,New York,156991.1


In [2]:
dataset$State = factor(dataset$State, 
                      levels = c('NewYork', 'California', 'Florida'),
                      labels = c(1,2,3))

In [3]:
library(caTools)
set.seed(123)
split = sample.split(dataset$Profit, SplitRatio =0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

In [4]:
lreg = lm(formula = Profit~.,
         data = training_set)

In [9]:
summary(lreg)


Call:
lm(formula = Profit ~ ., data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-29683  -3656   1916   4969  16362 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.428e+04  1.074e+04   4.124 0.000446 ***
R.D.Spend        8.097e-01  6.350e-02  12.752 1.23e-11 ***
Administration  -1.986e-02  7.826e-02  -0.254 0.802007    
Marketing.Spend  5.340e-02  2.818e-02   1.895 0.071324 .  
State3          -1.354e+03  4.247e+03  -0.319 0.752917    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9975 on 22 degrees of freedom
  (13 observations deleted due to missingness)
Multiple R-squared:  0.9518,	Adjusted R-squared:  0.943 
F-statistic: 108.6 on 4 and 22 DF,  p-value: 3.754e-14


#### We don't need to go further for step by step backward elimination as we can see the statistically significant columns here. Still we will give it a try. 

In [8]:
y_pred = predict(lreg, newdata= test_set)

## Building the optimal model using Backward Elimination

In [10]:
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
               data = dataset)
summary(regressor)


Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + 
    State, data = dataset)

Residuals:
     Min       1Q   Median       3Q      Max 
-30763.3  -3115.3    318.1   5506.5  16407.6 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.655e+04  9.364e+03   4.971    3e-05 ***
R.D.Spend        8.128e-01  5.361e-02  15.161    5e-15 ***
Administration  -2.629e-02  6.786e-02  -0.387   0.7014    
Marketing.Spend  4.372e-02  2.324e-02   1.881   0.0704 .  
State3          -9.952e+02  3.437e+03  -0.290   0.7743    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9345 on 28 degrees of freedom
  (17 observations deleted due to missingness)
Multiple R-squared:  0.9534,	Adjusted R-squared:  0.9467 
F-statistic: 143.1 on 4 and 28 DF,  p-value: < 2.2e-16


In [11]:
# Optional Step: Remove State3 only (as opposed to removing State directly)
# regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + factor(State, exclude = 3),
#                data = dataset)
# summary(regressor)
regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
               data = dataset)
summary(regressor)


Call:
lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend, 
    data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33534  -4795     63   6606  17275 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      5.012e+04  6.572e+03   7.626 1.06e-09 ***
R.D.Spend        8.057e-01  4.515e-02  17.846  < 2e-16 ***
Administration  -2.682e-02  5.103e-02  -0.526    0.602    
Marketing.Spend  2.723e-02  1.645e-02   1.655    0.105    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9232 on 46 degrees of freedom
Multiple R-squared:  0.9507,	Adjusted R-squared:  0.9475 
F-statistic:   296 on 3 and 46 DF,  p-value: < 2.2e-16


In [12]:
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
               data = dataset)
summary(regressor)


Call:
lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33645  -4632   -414   6484  17097 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared:  0.9505,	Adjusted R-squared:  0.9483 
F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16


In [13]:
regressor = lm(formula = Profit ~ R.D.Spend,
               data = dataset)
summary(regressor)


Call:
lm(formula = Profit ~ R.D.Spend, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-34351  -4626   -375   6249  17188 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.903e+04  2.538e+03   19.32   <2e-16 ***
R.D.Spend   8.543e-01  2.931e-02   29.15   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9416 on 48 degrees of freedom
Multiple R-squared:  0.9465,	Adjusted R-squared:  0.9454 
F-statistic: 849.8 on 1 and 48 DF,  p-value: < 2.2e-16


## We can see that R is quite impressive compared to python with respect to their Data Science applicatons.

In [15]:
# For automatic implementation of Backward Elimination
backwardElimination <- function(x, sl) {
    numVars = length(x)
    for (i in c(1:numVars)){
      regressor = lm(formula = Profit ~ ., data = x)
      maxVar = max(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"])
      if (maxVar > sl){
        j = which(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"] == maxVar)
        x = x[, -j]
      }
      numVars = numVars - 1
    }
    return(summary(regressor))
  }
  
  SL = 0.05
  dataset = dataset[, c(1,2,3,4,5)]
  backwardElimination(training_set, SL)


Call:
lm(formula = Profit ~ ., data = x)

Residuals:
   Min     1Q Median     3Q    Max 
-34334  -4894   -340   6752  17147 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.902e+04  2.748e+03   17.84   <2e-16 ***
R.D.Spend   8.563e-01  3.357e-02   25.51   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9836 on 38 degrees of freedom
Multiple R-squared:  0.9448,	Adjusted R-squared:  0.9434 
F-statistic: 650.8 on 1 and 38 DF,  p-value: < 2.2e-16
