In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.api import OLS
from itertools import combinations

In [2]:
# Load the startups dataset from the CSV file next to this notebook
df = pd.read_csv(filepath_or_buffer='50_startups.csv')

In [3]:
# Preview the first rows of the loaded data (head() shows five rows by default)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Data Info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
# Data Shape as (number of rows, number of columns)
df.shape

(50, 5)

In [6]:
# Features and Target
# X: every column except the last two (the numeric spend columns)
# y: the column at position 4 (Profit), the value being predicted
X, y = df.iloc[:, :-2], df.iloc[:, 4]

In [7]:
# Dummy Encoding
# One-hot encode the State column (position 3) and drop the last dummy column
# so the remaining dummies are not perfectly collinear with the constant term.
# dtype=int keeps the dummies as 0/1 integers: pandas >= 2.0 defaults to bool,
# and a frame mixing bool and float columns becomes an object array when it is
# converted for statsmodels OLS further down.
state_dummies = pd.get_dummies(df.iloc[:, 3], prefix='State_', dtype=int).iloc[:, :-1]
# Rebuild X from df rather than appending to the existing X, so that re-running
# this cell does not attach the dummy columns a second time.
X = pd.concat([df.iloc[:, :-2], state_dummies], axis=1)

In [8]:
# Splitting Dataset into Training Set and Testing Set
# 80% of the rows go to training and 20% to testing. A fixed random_state makes
# the shuffle deterministic, so the split (and every metric computed from it)
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### All-in Strategy

In [9]:
# Fit a linear regression on the training split using all features at once
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [10]:
# Predict the target for the held-out test rows
y_pred = regressor.predict(X=X_test)

In [11]:
# Score the predictions against the true test targets with R-squared
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.9629545859813348


###### The equation of multiple linear regression is:<br>Y = $\beta$<sub>0</sub> + $\beta$<sub>1</sub>X<sub>1</sub> + $\beta$<sub>2</sub>X<sub>2</sub> + ... + $\beta$<sub>n</sub>X<sub>n</sub><br><br>So that the intercept $\beta$<sub>0</sub> is treated like every other coefficient, the equation can equivalently be written as:<br>Y = $\beta$<sub>0</sub>X<sub>0</sub> + $\beta$<sub>1</sub>X<sub>1</sub> + $\beta$<sub>2</sub>X<sub>2</sub> + ... + $\beta$<sub>n</sub>X<sub>n</sub><br><br>where X<sub>0</sub> = 1 for every observation, i.e. X<sub>0</sub> is a column of ones. statsmodels' OLS does not add an intercept on its own, so we need to add this column of ones (a numpy array of ones) to the feature matrix ourselves.

In [12]:
# Adding Feature of Ones to satisfy equation of Multiple Linear Regression
# The column is sized and indexed from X itself instead of a hard-coded 50, so
# it stays aligned with X whatever the number of rows. The guard makes the cell
# idempotent: re-running it does not prepend a second constant column.
if 'Optimizer' not in X.columns:
    optimizer = pd.Series(np.ones(len(X)), index=X.index, name='Optimizer')
    X = pd.concat([optimizer, X], axis=1)

In [13]:
# Data After Addition: preview X with the constant 'Optimizer' column in front
X.head()

Unnamed: 0,Optimizer,R&D Spend,Administration,Marketing Spend,State__California,State__Florida
0,1.0,165349.2,136897.8,471784.1,0,0
1,1.0,162597.7,151377.59,443898.53,1,0
2,1.0,153441.51,101145.55,407934.54,0,1
3,1.0,144372.41,118671.85,383199.62,0,0
4,1.0,142107.34,91391.77,366168.42,0,1


### Forward Selection

In [14]:
def find_smallest(arr, s_level, idx_avlbl):
    '''
    Find the Feature Index belonging to the Smallest Element
    Args:
        arr::[list]
            Input List from which the Minimum Element's Index needs to be
            found. Each element is itself a list whose first item is the
            p-value of one candidate feature
        s_level::[float]
            Significance Level the smallest p-value must be below for its
            feature to be returned
        idx_avlbl::[list]
            List of Available Indices of Features, aligned position by
            position with arr
    Returns:
        smallest_idx::[int]
            Returns the entry of idx_avlbl at the position of the Smallest
            Element if its p-value is below s_level, otherwise None.
            None is also returned when arr is empty
    '''
    # Nothing to choose from: report "no feature" instead of failing on None[0].
    if len(arr) == 0:
        return None
    # min() keeps the first of equal elements, matching a strict '<' scan.
    smallest_idx = min(range(len(arr)), key=lambda i: arr[i])
    if arr[smallest_idx][0] < s_level:
        return idx_avlbl[smallest_idx]
    return None

In [15]:
# Forward Selection
# Column 0 of X is the constant column and is always kept in the model; every
# other column is a candidate. Each round fits one model per remaining
# candidate (current set + that candidate) and adds the candidate with the
# smallest p-value, provided it is below the 0.05 significance level.
var_idx = list(range(1, X.shape[1]))
curr_set = [0]
for _ in range(len(var_idx)):
    p_vals = []
    # Built in the same loop as p_vals so position k of both lists always
    # refers to the same feature (a set difference gives no ordering guarantee).
    idx_avlbl = []
    for cand in var_idx:
        if cand in curr_set:
            continue
        X_opt = X.iloc[:, curr_set + [cand]]
        regressor = OLS(endog=y, exog=X_opt).fit()
        # The candidate is the last column of X_opt, so its p-value is the
        # last one; it is kept as a one-element list as find_smallest expects.
        p_vals.append(list(regressor.pvalues)[-1:])
        idx_avlbl.append(cand)
    idx = find_smallest(arr=p_vals, s_level=0.05, idx_avlbl=idx_avlbl)
    if idx is None:
        # No remaining candidate is significant: stop adding features.
        break
    curr_set += [idx]
print(curr_set)

[0, 1]


### Backward Selection

In [16]:
def find_largest(arr, s_level, idx_avlbl):
    '''
    Find Largest Element
    Args:
        arr::[list]
            Input List from which the Largest Element needs to be found
        s_level::[float]
            Significance Level set to stay in the Model
        idx_avlbl::[list]
            List of Available Indices of Features (accepted for symmetry
            with find_smallest; not used by this function)
    Returns:
        largest_num::[float]
            Returns the Largest Element (the value itself, not its index)
            if it is greater than s_level, otherwise None.
            None is also returned when arr is empty
    '''
    # Nothing to compare: report "nothing to remove" instead of failing on
    # a None > float comparison.
    if len(arr) == 0:
        return None
    # max() does not confuse a legitimate 0.0 with "not set yet", which a
    # truthiness check on the running maximum would.
    largest_num = max(arr)
    if largest_num > s_level:
        return largest_num
    return None

In [17]:
# Backward Selection
# Start from the full model (constant column 0 plus every feature column) and,
# each round, drop the feature with the largest p-value as long as that
# p-value exceeds the 0.05 significance level. The constant is never dropped.
var_idx = list(range(1, X.shape[1]))
curr_set = [0] + var_idx
for _ in range(len(var_idx)):
    idx_avlbl = curr_set[1:]
    X_opt = X.iloc[:, curr_set]
    regressor = OLS(endog=y, exog=X_opt).fit()
    # Skip the first p-value: it belongs to the constant column.
    p_vals = list(regressor.pvalues)[1:]
    largest_num = find_largest(arr=p_vals, s_level=0.05, idx_avlbl=idx_avlbl)
    if largest_num is None:
        # Every remaining feature is significant: stop removing.
        break
    # p_vals omits the constant, so shift by one to get the position of the
    # same feature inside curr_set.
    curr_set.pop(p_vals.index(largest_num) + 1)
print(curr_set)

[0, 1]
