# The train-test split is a crucial step in the model development process, and it helps in evaluating the performance of a machine learning model. This process involves splitting the dataset into two subsets: one for training the model and the other for testing or validating the model.

In [15]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
!pip install mlxtend



# Create a synthetic dataset for demonstration:

In [2]:
# Generate a synthetic two-class dataset: 1000 samples with 10 features each.
# The fixed random_state makes the generated data identical on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

In [6]:
# X is a NumPy array with one row per sample (1000) and one column per feature (10).
X

array([[ 0.96479937, -0.06644898,  0.98676805, ..., -1.2101605 ,
        -0.62807677,  1.22727382],
       [-0.91651053, -0.56639459, -1.00861409, ..., -0.98453405,
         0.36389642,  0.20947008],
       [-0.10948373, -0.43277388, -0.4576493 , ..., -0.2463834 ,
        -1.05814521, -0.29737608],
       ...,
       [ 1.67463306,  1.75493307,  1.58615382, ...,  0.69272276,
        -1.50384972,  0.22526412],
       [-0.77860873, -0.83568901, -0.19484228, ..., -0.49735437,
         2.47213818,  0.86718741],
       [ 0.24845351, -1.0034389 ,  0.36046013, ...,  0.77323999,
         0.1857344 ,  1.41641179]])

In [7]:
# Wrap the feature array in a DataFrame for a tabular view;
# with no column names given, the columns are labelled 0-9.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.181890,-1.615679,-1.210161,-0.628077,1.227274
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.820544,1.752375,-0.984534,0.363896,0.209470
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,-1.836360,1.239086,-0.246383,-1.058145,-0.297376
3,1.750412,2.023606,1.688159,0.006800,-1.607661,0.184741,-2.619427,-0.357445,-1.473127,-0.190039
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,0.597538,0.348645,-0.939156,0.175915,0.236224
...,...,...,...,...,...,...,...,...,...,...
995,-1.367638,1.462255,-1.154918,-0.290454,-0.413424,0.032396,1.545490,1.428760,1.687092,1.072542
996,-1.514876,-3.221016,-1.300744,0.395599,-0.527994,1.353069,1.777506,-1.680870,1.798510,0.034272
997,1.674633,1.754933,1.586154,0.018402,-1.514470,0.321593,-2.417694,0.692723,-1.503850,0.225264
998,-0.778609,-0.835689,-0.194842,1.097220,0.180071,-0.272933,-0.533188,-0.497354,2.472138,0.867187


# Split the dataset into training and testing sets (70-30)

In [4]:
# Hold out 30% of the samples for testing (test_size=0.30); the remaining
# 70% are used for training.
#
# random_state=42 seeds the split, so the same rows land in the same subset
# on every run -- helpful for debugging and for getting consistent results.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


In [5]:
# Peek at the training features (NumPy abbreviates the display of large arrays with "...").
X_train

array([[ 0.04446722,  0.91377162, -0.14972402, ...,  0.57407122,
        -0.66673831,  1.58983898],
       [ 0.41722807, -0.74594213,  0.6030946 , ..., -0.61453878,
         0.30463008,  0.75341946],
       [ 0.83868939,  0.86018264,  0.66388059, ...,  0.72508104,
        -1.17953018, -1.15545179],
       ...,
       [ 0.04091467,  0.42796263, -0.00235298, ..., -1.92277177,
        -0.17104772,  1.63925652],
       [ 0.3393792 , -0.64015446,  0.25253995, ..., -1.39922826,
        -0.52991248, -0.7348109 ],
       [ 1.84815207, -0.31305803,  1.78069642, ...,  0.60451533,
        -1.56102804, -1.25040773]])

In [9]:
# Sanity check: the training features should keep 70% of the 1000 rows and all 10 columns.
X_train.shape

(700, 10)

In [10]:
# Sanity check: the test features should hold the remaining 30% of the rows.
X_test.shape

(300, 10)

In [11]:
# The test labels are one-dimensional: one label per test row.
y_test.shape

(300,)

In [13]:
# The training labels are one-dimensional: one label per training row.
y_train.shape

(700,)

# Overfitting and Underfitting

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.evaluate import bias_variance_decomp

# Simulate a noisy linear relationship so the true model is known.
np.random.seed(0)  # seed NumPy's global RNG so the simulated data is reproducible
X = np.random.rand(100, 1) * 10
y = 2 * X.squeeze() + np.random.randn(100)  # True relationship is y = 2X + noise

# Split the data into training and testing sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Calculate bias and variance using the bias_variance_decomp function.
# The function repeatedly refits `model` on bootstrap resamples of the training
# set, so afterwards `model` holds the fit from the last resample, not the fit
# above. Its resampling uses its own random generator, which np.random.seed()
# does not control; random_seed is passed explicitly so the reported numbers
# are the same on every run.
mse, bias, variance = bias_variance_decomp(
    model,
    X_train,
    y_train,
    X_test,
    y_test,
    loss='mse',
    random_seed=42,
)

print('MSE (Mean Squared Error):', mse)
print('Bias^2:', bias)
print('Variance:', variance)


MSE (Mean Squared Error): 0.9365754042374553
Bias^2: 0.9173270788777023
Variance: 0.019248325359754587
