<a href="https://colab.research.google.com/github/agarwalpratik/aiml/blob/main/MelbBaggingRegressorShufflingSampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np

In [25]:
# Load the Melbourne housing CSV and keep only the four predictors plus the
# Price target. The column order matters: later cells pick features and label
# by column position.
data = pd.read_csv("melb_data.csv")[
    ['Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Price']
]
data.head()

Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
0,1.0,202.0,,,1480000.0
1,0.0,156.0,79.0,1900.0,1035000.0
2,0.0,134.0,150.0,1900.0,1465000.0
3,1.0,94.0,,,850000.0
4,2.0,120.0,142.0,2014.0,1600000.0


In [26]:
# Summarise the frame: per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car           13518 non-null  float64
 1   Landsize      13580 non-null  float64
 2   BuildingArea  7130 non-null   float64
 3   YearBuilt     8205 non-null   float64
 4   Price         13580 non-null  float64
dtypes: float64(5)
memory usage: 530.6 KB


In [28]:
# Number of missing values per column, reported as a tuple in the order
# (Landsize, Car, BuildingArea, YearBuilt).
tuple(
    data[column].isna().sum()
    for column in ('Landsize', 'Car', 'BuildingArea', 'YearBuilt')
)

(0, 62, 6450, 5375)

In [29]:
# Statistical approach to imputing the missing values:
#   * Car and YearBuilt are treated as discrete numeric columns -> median.
#   * BuildingArea is treated as a continuous numeric column    -> mean.
#
# The fill values are handed to DataFrame.fillna as a {column: value} mapping
# and the result is assigned back to `data`. The previous form,
# `data[col].fillna(value, inplace=True)`, mutates a temporary column
# selection: it raises a FutureWarning on pandas 2.x and leaves `data`
# unchanged once Copy-on-Write is active (the default from pandas 3.0).
# All three statistics are computed from the not-yet-imputed frame.
data = data.fillna({
    'Car': data['Car'].median(),
    'YearBuilt': data['YearBuilt'].median(),
    'BuildingArea': data['BuildingArea'].mean(),
})
data.describe()

# Alternative "domain" approach, kept for reference only (not executed).
# NOTE(review): `mb` is not defined in the visible cells -- confirm before enabling.
#Domain approach
#mb_domain = mb
#mb_domain['Car'].fillna(1, inplace=True)
#mb_domain['Car'].replace(0,1,inplace=True)
#mb_domain['BuildingArea'].fillna(80, inplace=True)
#mb_domain['BuildingArea'].replace(0,80, inplace=True)
#mb_domain['YearBuilt'].fillna(2000, inplace=True)
#mb_domain.describe()

Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,1.611856,558.416127,151.96765,1966.788218,1075684.0
std,0.960793,3990.669241,392.002962,29.088642,639310.7
min,0.0,0.0,0.0,1196.0,85000.0
25%,1.0,177.0,122.0,1960.0,650000.0
50%,2.0,440.0,151.96765,1970.0,903000.0
75%,2.0,651.0,151.96765,1975.0,1330000.0
max,10.0,433014.0,44515.0,2018.0,9000000.0


In [30]:
# Re-check non-null counts after imputation; every column should now be fully populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car           13580 non-null  float64
 1   Landsize      13580 non-null  float64
 2   BuildingArea  13580 non-null  float64
 3   YearBuilt     13580 non-null  float64
 4   Price         13580 non-null  float64
dtypes: float64(5)
memory usage: 530.6 KB


In [33]:
# Positional split of the frame into model inputs and target:
# columns 0-3 are the predictors, column 4 is the value to predict.
features = data.iloc[:, list(range(4))].to_numpy()
label = data.iloc[:, 4].to_numpy()

# **Shuffling**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# One result row per base algorithm, in the order
# (LinearRegression, DecisionTreeRegressor, KNeighborsRegressor). Each row holds
# [testScore, trainScore, split random_state, bagging random_state, estimator]
# and starts out as all zeros.
rows, cols = (3, 5)
arr = [[0] * cols for _ in range(rows)]

for rs in range(1, 10):
    # A fresh 80/20 shuffle of the data for every outer seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=rs
    )

    algoLR = LinearRegression()
    algoDT = DecisionTreeRegressor()
    algoKN = KNeighborsRegressor()

    # The position of an estimator in this tuple is its row index in `arr`.
    for slot, algo in enumerate((algoLR, algoDT, algoKN)):
        best = arr[slot]
        for rst in range(1, 50):
            ensembleModel = BaggingRegressor(
                n_estimators=15, estimator=algo, random_state=rst
            )
            ensembleModel.fit(X_train, y_train)
            testScore = ensembleModel.score(X_test, y_test)
            trainScore = ensembleModel.score(X_train, y_train)
            # Record a run only when the test score exceeds the train score
            # and also exceeds the best test score stored for this algorithm.
            if testScore > trainScore and best[0] < testScore:
                best[:] = [testScore, trainScore, rs, rst, algo]

print(f"testScore, trainScore, outerRandomState, innerRandomState, algorithm")
for result_row in arr:
    print(result_row)

# Sampling with Replacement



In [42]:
import warnings
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
# Imported here so this cell does not rely on another cell having been run first.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# One result row per base algorithm, in the order
# (LinearRegression, DecisionTreeRegressor, KNeighborsRegressor). Each row holds
# [testScore, trainScore, split random_state, bagging random_state, estimator]
# and starts out as all zeros.
rows, cols = (3, 5)
arr = [[0] * cols for _ in range(rows)]

for rs in range(1, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=rs
    )

    # Each bagged estimator is trained on sqrt(n_train) rows. This is computed
    # only after the split so it never reads an X_train left over from
    # another cell.
    sampleSize = int(round(np.sqrt(len(X_train))))

    algoLR = LinearRegression()
    algoDT = DecisionTreeRegressor()
    algoKN = KNeighborsRegressor()

    # The position of an estimator in this tuple is its row index in `arr`.
    for slot, algo in enumerate((algoLR, algoDT, algoKN)):
        best = arr[slot]
        for rst in range(1, 50):
            # bootstrap=True: rows are drawn WITH replacement.
            ensembleModel = BaggingRegressor(
                n_estimators=15,
                estimator=algo,
                max_samples=sampleSize,
                bootstrap=True,
                random_state=rst,
            )
            ensembleModel.fit(X_train, y_train)
            testScore = ensembleModel.score(X_test, y_test)
            trainScore = ensembleModel.score(X_train, y_train)
            # Record a run only when the test score exceeds the train score
            # and also exceeds the best test score stored for this algorithm.
            if testScore > trainScore and best[0] < testScore:
                best[:] = [testScore, trainScore, rs, rst, algo]

print("testScore, trainScore, outerRandomState, innerRandomState, algorithm")
for row in arr:
    print(row)

[0.26066378317000327, -6.965667822749635, 5, 49, LinearRegression()]
[0.3993041036448274, 0.3642629909006523, 4, 24, DecisionTreeRegressor()]
[0.26593966416948134, 0.23609275090477655, 1, 48, KNeighborsRegressor()]



# Sampling without Replacement

In [21]:
import warnings
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
# Imported here so this cell does not rely on another cell having been run first.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# One result row per base algorithm, in the order
# (LinearRegression, DecisionTreeRegressor, KNeighborsRegressor). Each row holds
# [testScore, trainScore, split random_state, bagging random_state, estimator]
# and starts out as all zeros.
rows, cols = (3, 5)
arr = [[0] * cols for _ in range(rows)]

for rs in range(1, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=rs
    )

    # Each bagged estimator is trained on sqrt(n_train) rows. This is computed
    # only after the split so it never reads an X_train left over from
    # another cell.
    sampleSize = int(round(np.sqrt(len(X_train))))

    algoLR = LinearRegression()
    algoDT = DecisionTreeRegressor()
    algoKN = KNeighborsRegressor()

    # The position of an estimator in this tuple is its row index in `arr`.
    for slot, algo in enumerate((algoLR, algoDT, algoKN)):
        best = arr[slot]
        for rst in range(1, 50):
            # bootstrap=False: rows are drawn WITHOUT replacement.
            ensembleModel = BaggingRegressor(
                n_estimators=15,
                estimator=algo,
                max_samples=sampleSize,
                bootstrap=False,
                random_state=rst,
            )
            ensembleModel.fit(X_train, y_train)
            testScore = ensembleModel.score(X_test, y_test)
            trainScore = ensembleModel.score(X_train, y_train)
            # Record a run only when the test score exceeds the train score
            # and also exceeds the best test score stored for this algorithm.
            if testScore > trainScore and best[0] < testScore:
                best[:] = [testScore, trainScore, rs, rst, algo]

print("testScore, trainScore, outerRandomState, innerRandomState, algorithm")
for row in arr:
    print(row)

Training Score is 0.8800477685265955 and Testing Score is 0.9313607870532546 and rs is 1 and rst is 1 and algo is LinearRegression()
Training Score is 0.8538527860392727 and Testing Score is 0.9218734342154324 and rs is 1 and rst is 8 and algo is LinearRegression()
Training Score is 0.9367782769871448 and Testing Score is 0.9644365504248811 and rs is 1 and rst is 9 and algo is LinearRegression()
Training Score is 0.9225074229624851 and Testing Score is 0.9336549279177995 and rs is 1 and rst is 11 and algo is LinearRegression()
Training Score is 0.8646691655429606 and Testing Score is 0.9437999697463562 and rs is 1 and rst is 13 and algo is LinearRegression()
Training Score is 0.8840531861367716 and Testing Score is 0.9386367888594562 and rs is 1 and rst is 15 and algo is LinearRegression()
Training Score is 0.8924012164321201 and Testing Score is 0.9034366955320591 and rs is 1 and rst is 16 and algo is LinearRegression()
Training Score is 0.9297493892327889 and Testing Score is 0.95783