<a href="https://colab.research.google.com/github/agarwalpratik/aiml/blob/main/Cross_Validation_Technique_Iris_50_Startups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cross Validation
# Cross Validation is applied on the entire dataset (Pre-modelling Phase | EDA)
# The ideal goal of CV"
# 1. Get the score threshold / benchmark (this benchmark can act as a SL value)
# 2. Get the possible optimal score
# 3. Extract the best training sample that may provide the optimal score

In [10]:
import pandas as pd
import numpy as np

data= pd.read_csv('50_Startups.csv')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [12]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [13]:
#Cross validation technique is only applicable for Supervised Learning

features = data.iloc[:,0:4].values
finalLabels = data.iloc[:,4].values


from sklearn.preprocessing import OneHotEncoder
oheForstate = OneHotEncoder(sparse_output=False)
oheForstate.fit(features[:,[3]])
dummyVariables = oheForstate.transform(features[:,[3]]).astype(int)


finalFeatures = np.concatenate([dummyVariables,features[:,[0,1,2]]] , axis = 1)
finalFeatures

array([[0, 0, 1, 165349.2, 136897.8, 471784.1],
       [1, 0, 0, 162597.7, 151377.59, 443898.53],
       [0, 1, 0, 153441.51, 101145.55, 407934.54],
       [0, 0, 1, 144372.41, 118671.85, 383199.62],
       [0, 1, 0, 142107.34, 91391.77, 366168.42],
       [0, 0, 1, 131876.9, 99814.71, 362861.36],
       [1, 0, 0, 134615.46, 147198.87, 127716.82],
       [0, 1, 0, 130298.13, 145530.06, 323876.68],
       [0, 0, 1, 120542.52, 148718.95, 311613.29],
       [1, 0, 0, 123334.88, 108679.17, 304981.62],
       [0, 1, 0, 101913.08, 110594.11, 229160.95],
       [1, 0, 0, 100671.96, 91790.61, 249744.55],
       [0, 1, 0, 93863.75, 127320.38, 249839.44],
       [1, 0, 0, 91992.39, 135495.07, 252664.93],
       [0, 1, 0, 119943.24, 156547.42, 256512.92],
       [0, 0, 1, 114523.61, 122616.84, 261776.23],
       [1, 0, 0, 78013.11, 121597.55, 264346.06],
       [0, 0, 1, 94657.16, 145077.58, 282574.31],
       [0, 1, 0, 91749.16, 114175.79, 294919.57],
       [0, 0, 1, 86419.7, 153514.11, 0.0],
 

In [17]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
score = cross_val_score(model,finalFeatures,finalLabels,cv=5)


In [18]:
score

array([ 0.8890891 , -1.00022611, -0.78958092, -0.76036588,  0.38538399])

In [19]:
# 1. Get the score threshold / benchmark (this benchmark can act as a SL value)

print("Base Threshold score expected for LogisticRegression model is ",score.mean())
print("Suggested SL value for this project is ", 1-score.mean())

Base Threshold score expected for LogisticRegression model is  -0.2551399655258996
Suggested SL value for this project is  1.2551399655258997


In [20]:
#2. Optimal Score
print("Optimal Score possible for the given algo is ", score.max())
print("Optimal Score possible for the given algo is ", score.min())

Optimal Score possible for the given algo is  0.8890890994779715
Optimal Score possible for the given algo is  -1.0002261132038623


In [21]:
#3. To extract the best training sample that may provide the optimal score

# SEARCH : Tracker value with highest testScorew
#==========
import warnings
warnings.filterwarnings("ignore")

CL = score.mean()

#Step1: Initialize the algo
from sklearn.linear_model import LinearRegression
model = LinearRegression()

#Step2: Initialize K-Fold Cross Validation

from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, #This value MUST be equal to the cv value of cross_val_score
              shuffle=True,
              random_state=1) #For reproducing same o/p

#Step3: Initialize tracker to track the best sample
tracker = 0

for train,test in kfold.split(finalFeatures):
  tracker += 1

  X_train,X_test,y_train,y_test = finalFeatures[train],finalFeatures[test],finalLabels[train],finalLabels[test]

  model.fit(X_train,y_train)

  if model.score(X_test,y_test) >= CL:
    print(f"Test Score {model.score(X_test,y_test)} Train Score {model.score(X_train,y_train)} for Sample split {tracker}")


Test Score 0.9649618042060308 Train Score 0.942446542689397 for Sample split 1
Test Score 0.8185106842705208 Train Score 0.9655319679423091 for Sample split 2
Test Score 0.8820715931491742 Train Score 0.957307628387595 for Sample split 3
Test Score 0.9846245896509758 Train Score 0.939824209339928 for Sample split 4
Test Score 0.9156298183227459 Train Score 0.9499864582841221 for Sample split 5
