# Trying Backward Elimination with StartUps Profit Dataset

In [17]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from sklearn import model_selection as cv
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from statsmodels.regression.linear_model import OLS

In [18]:
def score(Y_true,Y_predicted):
    """Return the coefficient of determination (R^2) of a prediction.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of
    squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``Y_true`` from its mean.

    Args:
        Y_true: Observed target values (any numeric array-like).
        Y_predicted: Predicted values, same shape as ``Y_true``.

    Returns:
        The R^2 score; 1.0 means a perfect fit. When ``Y_true`` is constant
        ``SS_tot`` is zero and the result is not finite.
    """
    # Coerce to float arrays so plain lists work and pandas objects are
    # compared position by position instead of being aligned on index labels.
    y_true = np.asarray(Y_true, dtype=float)
    y_pred = np.asarray(Y_predicted, dtype=float)
    ss_res = np.square(y_true - y_pred).sum()
    ss_tot = np.square(y_true - y_true.mean()).sum()
    return 1 - ss_res / ss_tot

## 1. Data Preprocessing

- Load the DataSet

In [8]:
# Read the startups table: every column except the last is a feature,
# the last column is the target.
data = pd.read_csv("../data/50_Startups.csv")
n_feature_cols = data.shape[1] - 1
X = data.iloc[:, :n_feature_cols].values
Y = data.iloc[:, n_feature_cols].values

- Label Encoding to change strings to integers

In [9]:
# Replace the string labels held in column 3 with integer codes.
labelEncoder_X = LabelEncoder()
category_codes = labelEncoder_X.fit_transform(X[:, 3])
X[:, 3] = category_codes

- One Hot Encoding to change categorical integers to dummy variables

In [10]:
# One-hot encode column 3 and pass every other column through unchanged.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer is the supported
# way to encode a single column. As before, the dummy columns come first in
# the result, followed by the remaining columns in their original order.
oneHotEncoder = OneHotEncoder()
columnTransformer = ColumnTransformer(
    transformers=[("categorical", oneHotEncoder, [3])],
    remainder="passthrough",
    sparse_threshold=0,  # always produce a dense array (replaces .toarray())
)
# The passthrough columns may arrive with object dtype (X mixes strings and
# numbers when loaded), so cast the combined result to float explicitly.
X = np.asarray(columnTransformer.fit_transform(X), dtype=float)

- Avoiding Dummy Variable Trap

In [11]:
# Dummy variable trap: discard the first column so the categorical set keeps
# one fewer dummy variable than it has categories.
X = np.delete(X, 0, axis=1)

## 2. Results without using Backward Elimination

In [20]:
# Baseline: hold out 20% of the rows and fit a linear model on the rest
# using every feature column.
X_train, X_test, Y_train, Y_test = cv.train_test_split(
    X, Y, test_size=0.2, random_state=0
)
regressor = LinearRegression().fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
score(Y_test, Y_pred)  # Score of the all-in model on the held-out rows

0.93470684732824461

## 3. Using Backward Elimination

#### Step 1 (Selecting Significance Level)

In [22]:
# Prepend a column of ones so the model fitted during elimination has a
# constant term, and choose the p-value threshold for keeping a feature.
intercept_column = np.ones((X.shape[0], 1), dtype=int)
X = np.hstack((intercept_column, X))
significance_level = 0.05

#### Step 2 (Include all features)

In [23]:
# Start with every column of X (constant included) as a candidate.
included_columns = list(range(X.shape[-1]))

#### Step 3 and Step 4 (Fit the model with the features in included_columns and compute their p-values; remove the feature with the highest p-value and repeat until all p-values are below the significance level)

In [24]:
# Backward elimination: refit OLS on the currently included columns and drop
# the column with the largest p-value, until every remaining p-value is at or
# below significance_level (or no columns are left to fit).
finished = False
while not finished and included_columns:
    X_opt = X[:, included_columns]
    regressor_OLS = OLS(endog=Y, exog=X_opt).fit()
    #print(regressor_OLS.summary())
    # Read the exact p-values from the fitted results; the summary table only
    # holds them as strings rounded for display.
    p_values = [float(p) for p in regressor_OLS.pvalues]
    highest_p = max(p_values)
    if highest_p > significance_level:
        # p_values is ordered like included_columns, so the position of the
        # largest p-value is also the position of the column to drop.
        column_to_remove = p_values.index(highest_p)
        #print("Removing column",included_columns[column_to_remove])
        del included_columns[column_to_remove]
    else:
        finished = True

#### Step 5 (Use available features to build the model)

In [25]:
# Keep only the columns that survived backward elimination, then repeat the
# same 80/20 split, fit and scoring on the reduced feature set.
X = X[:, included_columns]
X_train, X_test, Y_train, Y_test = cv.train_test_split(
    X, Y, test_size=0.2, random_state=0
)
regressor = LinearRegression().fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
score(Y_test, Y_pred)  # Score with the backward-elimination model

0.94645876077872204