# Goal - To understand Feature Selection Technique (Feature Elimination)

In [1]:
import numpy as np
import pandas as pd

In [2]:
startupData = pd.read_csv('50_Startups.csv')

# Method 1 -- Using Shortcut (Correlation Analysis)

In [3]:
startupData.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [6]:
finalDataset = pd.concat([pd.get_dummies(startupData.State), startupData.iloc[:,[0,1,2,4]]] ,axis = 1)
finalDataset.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [7]:
#Get the corr
finalDataset.corr()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
California,1.0,-0.492366,-0.515152,-0.143165,-0.015478,-0.168875,-0.145837
Florida,-0.492366,1.0,-0.492366,0.105711,0.010493,0.205685,0.116244
New York,-0.515152,-0.492366,1.0,0.039068,0.005145,-0.03367,0.031368
R&D Spend,-0.143165,0.105711,0.039068,1.0,0.241955,0.724248,0.9729
Administration,-0.015478,0.010493,0.005145,0.241955,1.0,-0.032154,0.200717
Marketing Spend,-0.168875,0.205685,-0.03367,0.724248,-0.032154,1.0,0.747766
Profit,-0.145837,0.116244,0.031368,0.9729,0.200717,0.747766,1.0


In [None]:
#Threshold is ideally decided by Data Scientist
# Suggestion by Prashant Nair: Select those features who are greater than 50% corr !!! (Regression)
# Suggestion by Prashant Nair: For Classification, avoid using this technique. If you are forced, threshold must be 80% or more

In [8]:
#Conclusion: Based on Corr Analysis, R&D spend and Marketing Spend will be the best features to create the model. Thus eliminate the remaining

In [9]:
#Seperate your data as features and label
features = startupData.iloc[:,[0,2]].values
label = startupData.iloc[:,[4]].values

In [10]:
#Create Train Test Split

# The way how we sample the data directly impacts the model's performance. This is statistically proven hence using this as an optimization mechanis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for randomState in range(1,51):
    
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=randomState)
    model = LinearRegression()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score:
        print("Test Score: {} Training Score: {} Seed: {}".format(test_score,train_score,randomState))
    

Test Score: 0.9646437135748334 Training Score: 0.9423713608840597 Seed: 1
Test Score: 0.9814177491535382 Training Score: 0.9388772018080951 Seed: 2
Test Score: 0.9612876028942208 Training Score: 0.9459212470646745 Seed: 3
Test Score: 0.9674854200887459 Training Score: 0.9455331844858769 Seed: 4
Test Score: 0.9683604384024199 Training Score: 0.9436198878593198 Seed: 5
Test Score: 0.9909864896179557 Training Score: 0.9382176532996814 Seed: 10
Test Score: 0.9565036617363222 Training Score: 0.9472766838360558 Seed: 12
Test Score: 0.9499139926727364 Training Score: 0.9485793735881266 Seed: 13
Test Score: 0.97371375097723 Training Score: 0.9410506991241665 Seed: 14
Test Score: 0.9475480505951939 Training Score: 0.9471972623797911 Seed: 17
Test Score: 0.9658516680116018 Training Score: 0.9457936305980141 Seed: 21
Test Score: 0.9764404302143168 Training Score: 0.9421707561468369 Seed: 22
Test Score: 0.9692234650571673 Training Score: 0.94616489888384 Seed: 24
Test Score: 0.9613574909391511 Tra

In [None]:
#Conclusion: Using R&D and MArk gave a better model .THus this model can be shortlisted for deployment

# Method 2: Feature Selection using Recursive Feature Elimination (RFE)

In [11]:
# RFE can be applied only to the following algorithms
#
# 1. Algorithms that support coeff variables - Regression (LinearRegression, SVM, DecisionTreeRegression, RandomForestRegression)
# 2. Algorithms that support feature importance variable - Classification( DecisionTreeClassifier, RandomForestClassifier)

In [12]:
finalDataset.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [15]:
featuresForRFE = finalDataset.iloc[:,:-1].values
labelForRFE = finalDataset.iloc[:,-1].values

In [13]:
#Steps to apply RFE:
# 1. Initialize the algo
# 2. Apply RFE on model
# 3. Get Features with High Rank

In [17]:
# 1. Initialize the algo

from sklearn.linear_model import LinearRegression
modelForRFE = LinearRegression()

# 2. Apply RFE on model
from sklearn.feature_selection import RFE

#estimator contains the algorithm model object
# step defines how many features to be eliminated per iteration (Suggestion by PN: Always select 1 feature per iteration (Backward Elimination algo))
selectFeaturesFromRFE = RFE(estimator= modelForRFE,
                           step=1)

selectFeaturesFromRFE.fit(featuresForRFE,labelForRFE)

#3. Check Feature Ranks
print(finalDataset.columns)
print(selectFeaturesFromRFE.ranking_)
print(selectFeaturesFromRFE.support_)

Index(['California', 'Florida', 'New York', 'R&D Spend', 'Administration',
       'Marketing Spend', 'Profit'],
      dtype='object')
[1 1 1 2 3 4]
[ True  True  True False False False]


In [None]:
#Conclusion from RFE -- RFE says the profit can be determined based on the location of the company

In [18]:
#Seperate your data as features and label
features = finalDataset.iloc[:,[0,1,2]].values
label = finalDataset.iloc[:,[6]].values

In [19]:
#Create Train Test Split

# The way how we sample the data directly impacts the model's performance. This is statistically proven hence using this as an optimization mechanis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for randomState in range(1,51):
    
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=randomState)
    model = LinearRegression()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score:
        print("Test Score: {} Training Score: {} Seed: {}".format(test_score,train_score,randomState))
    

Test Score: 0.039661078925242066 Training Score: 0.004730602579624543 Seed: 6
Test Score: 0.015080875800281057 Training Score: 0.0017731552271403883 Seed: 32
Test Score: 0.02532863096371374 Training Score: 0.013479770737339636 Seed: 49


In [None]:
#Conclusion from PN(Data Scientist): The Observations derived from RFE doesnt satisfy the accuracy requirement. Thus rejected

# Method 3 - Univariate Analysis using ANOVA

In [20]:
finalDataset.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [21]:
featuresForANOVA = finalDataset.iloc[:,:-1].values
labelForANOVA = finalDataset.iloc[:,-1].values

In [22]:
from sklearn.feature_selection import  SelectPercentile
from sklearn.feature_selection import f_regression #This is for regression algo
#from sklearn.feature_selection import f_classif  #This is for classification algo

selectFeaturesFromANOVA = SelectPercentile(percentile=50, score_func=f_regression)

selectFeaturesFromANOVA.fit(featuresForANOVA,labelForANOVA)

print(finalDataset.columns)
print(selectFeaturesFromANOVA.get_support())

Index(['California', 'Florida', 'New York', 'R&D Spend', 'Administration',
       'Marketing Spend', 'Profit'],
      dtype='object')
[False False False  True  True  True]


In [23]:
#Conclusion from ANOVA -- Profit of the company completely depends on the spending pattern of your funds(R&D, Adm, Mark)

In [24]:
#Seperate your data as features and label
features = finalDataset.iloc[:,[3,4,5]].values
label = finalDataset.iloc[:,[6]].values

In [25]:
#Create Train Test Split

# The way how we sample the data directly impacts the model's performance. This is statistically proven hence using this as an optimization mechanis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for randomState in range(1,51):
    
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=randomState)
    model = LinearRegression()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score:
        print("Test Score: {} Training Score: {} Seed: {}".format(test_score,train_score,randomState))
    

Test Score: 0.9650940934427324 Training Score: 0.9424026074100899 Seed: 1
Test Score: 0.9794909902425315 Training Score: 0.9397277457511108 Seed: 2
Test Score: 0.9604474684992019 Training Score: 0.946940634516175 Seed: 4
Test Score: 0.9689042545732487 Training Score: 0.9436742073775848 Seed: 5
Test Score: 0.991745901723568 Training Score: 0.9383444814482027 Seed: 10
Test Score: 0.9497398332848894 Training Score: 0.9485205028772793 Seed: 12
Test Score: 0.9729719984514295 Training Score: 0.9410725215907201 Seed: 14
Test Score: 0.9658747497132993 Training Score: 0.9457936541281106 Seed: 21
Test Score: 0.9766179840086032 Training Score: 0.942474686754789 Seed: 22
Test Score: 0.9699853611184212 Training Score: 0.9463969619029229 Seed: 24
Test Score: 0.9615581146744921 Training Score: 0.9452671454762599 Seed: 26
Test Score: 0.9511342679296733 Training Score: 0.9481498306863594 Seed: 29
Test Score: 0.9570638759424795 Training Score: 0.9431922326920478 Seed: 31
Test Score: 0.9655432049381097 T

In [None]:
#Conclusion from PN(Data Scientist): Currently the optimized model for this usecase

# Method 4 : Select Features by Algo (SelectByModel) 

In [26]:
finalDataset.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [27]:
featuresForModel = finalDataset.iloc[:,:-1].values
labelForModel = finalDataset.iloc[:,-1].values

In [29]:
# 1. Initialize the algo

from sklearn.linear_model import LinearRegression
modelForModel = LinearRegression()

# 2. Apply Select By Model

from sklearn.feature_selection import SelectFromModel
selectFeaturesFromModel = SelectFromModel(modelForModel)

selectFeaturesFromModel.fit(featuresForModel,labelForModel)


#Get the support
print(finalDataset.columns)
print(selectFeaturesFromModel.get_support())

Index(['California', 'Florida', 'New York', 'R&D Spend', 'Administration',
       'Marketing Spend', 'Profit'],
      dtype='object')
[ True  True  True False False False]


In [30]:
#Conclusion by SelectByModel - says the profit can be determined based on the location of the company

In [None]:
# We already received the same reading in RFE . So REJECTED !!!

In [31]:
#Report Time :)

In [47]:
columns = ['AllIn', 'Correlation Analysis','RFE', 'ANOVA', 'SelectFroMModel']
accValues= [[0.9901105113397478,0.9909864896179557,0.039661078925242066,0.991745901723568,0.039661078925242066]]

df = pd.DataFrame(accValues,columns=columns)

df.style

Unnamed: 0,AllIn,Correlation Analysis,RFE,ANOVA,SelectFroMModel
0,0.990111,0.990986,0.0396611,0.991746,0.0396611


In [None]:
#Final Conclusion
#Use Anova Model !!!!