## Bias, Variance & Regularization
### This is when our data is in the form of coordinates

In [4]:
import numpy as np
import pandas as pd 
# Synthetic data: each row holds the x-coordinates of one two-point training set.
# Adjust the sampling range / sample count to match the question being solved.
np.random.seed(42)
n_samples = 10_000
x1_draws = np.random.uniform(-1, 1, size=n_samples)  # drawn first, as before
x2_draws = np.random.uniform(-1, 1, size=n_samples)
df = pd.DataFrame({'x1': x1_draws, 'x2': x2_draws})

# Target function f(x) = sin(pi * x)
def f(x):
    '''Evaluate sin(pi * x) element-wise; x may be a scalar, array or Series.'''
    angle = np.pi*x
    return np.sin(angle)

# Targets for both points of every two-point training set
df['y1'] = df['x1'].pipe(f)
df['y2'] = df['x2'].pipe(f)

'''Constant hypothesis'''
# The best constant fit to two points is the midpoint of their y-values
df['g_cons'] = df.loc[:, ['y1','y2']].mean(axis='columns')

def findEqLine(x1, x2, y1, y2, lamb=0.1):
    '''
    Fit a line through the two points (x1, y1) and (x2, y2), once by plain
    least squares and once with an L2 penalty.

    Parameters
    ----------
    x1, x2 : scalar
        x-coordinates of the two points.
    y1, y2 : scalar
        y-coordinates of the two points.
    lamb : float, default 0.1
        Regularization strength. It is added as lamb * I to X^T X, so the
        penalty is applied to both the intercept and the slope.

    Returns
    -------
    (w, w_reg) : tuple of two arrays of shape (2,)
        w     -- unregularized weights, w[0] = intercept, w[1] = slope
        w_reg -- regularized weights,   w_reg[0] = intercept, w_reg[1] = slope
    '''
    X = np.array([[1, x1], [1, x2]])  # design matrix: bias column + x column
    y = np.array([y1, y2])
    Xt = X.transpose()
    gram = Xt@X
    # The pseudo-inverse is used so that a singular X^T X (x1 == x2) does not raise
    w = np.linalg.pinv(gram)@Xt@y
    w_reg = np.linalg.pinv(gram+lamb*np.identity(2))@Xt@y
    return(w, w_reg)

# Fit both lines for every row (each row is one two-point training set). This gives
# one intercept/slope pair per row for the unregularized and the regularized model.
# findEqLine is called once per row and its (w, w_reg) result is reused for all four
# columns; each column is then written in one assignment instead of cell-by-cell
# through df.loc, which is very slow for thousands of rows.
line_fits = [
    findEqLine(x1, x2, y1, y2)
    for x1, x2, y1, y2 in df[['x1', 'x2', 'y1', 'y2']].to_numpy()
]
# reshape keeps the (n_rows, 2) layout even if df has no rows
weights_unreg = np.array([w for w, _ in line_fits]).reshape(-1, 2)
weights_reg = np.array([w_reg for _, w_reg in line_fits]).reshape(-1, 2)

# Column 0 of each weight matrix is the intercept, column 1 the slope
df['g_line_b'] = weights_unreg[:, 0]
df['g_line_m'] = weights_unreg[:, 1]
df['g_line_b_reg'] = weights_reg[:, 0]
df['g_line_m_reg'] = weights_reg[:, 1]

'''Aveerage hypothesis for each model'''

# Average each fitted parameter over all two-point training sets
g_cons_bar = df['g_cons'].mean()
g_line_m_bar = df['g_line_m'].mean()
g_line_b_bar = df['g_line_b'].mean()
g_line_m_reg_bar = df['g_line_m_reg'].mean()
g_line_b_reg_bar = df['g_line_b_reg'].mean()

# One print call so the three summaries appear as a single block of output
print(
    'For constant model, avg g(x)=', np.round(g_cons_bar,3),
    '\nFor linear model, avg g(x)=', np.round(g_line_m_bar,3),
    'x+', np.round(g_line_b_bar,3),
    '\nFor linear model with regularization, avg g(x)=', np.round(g_line_m_reg_bar,3),
    'x+', np.round(g_line_b_reg_bar,3)
)
print("---------------------------------------------")

'''Constant model'''
# Pointwise squared bias is (f(x) - g_bar(x))^2; for the constant model g_bar
# is a single number, so only f(x) = y1 varies across rows.
cons_residual = df['y1'] - g_cons_bar
bias_cons_atX = cons_residual**2
# Overall bias^2 is the expectation of the pointwise values over x
bias_cons = np.mean(bias_cons_atX)
print('Bias Sq for constant model is:', np.round(bias_cons,3))

# Variance: mean squared deviation of each fitted constant from the average constant
cons_spread = df['g_cons'] - g_cons_bar
var_cons = np.mean(cons_spread**2)
print('Variance for constant model is:', np.round(var_cons,3))
print("---------------------------------------------")

'''Unreg Linear model'''
# Average unregularized line g_bar(x) evaluated at every x1
df['g_line_bar_atX'] = g_line_b_bar + g_line_m_bar*df['x1']
#Alternatively can use np.matmul(np.array([g_line_b_bar,g_line_m_bar]),np.array([np.ones(len(df)), df['x1']]))
# bias^2 at x = (f(x) - g_bar(x))^2, then averaged over all x
bias_linear_atX = (df['y1']-df['g_line_bar_atX'])**2
bias_linear = np.mean(bias_linear_atX)
print('Bias sq for unreg linear model is:', np.round(bias_linear,3))

# Row i of g_linear_x is the i-th fitted line evaluated at every x1 (column j <-> x1[j]).
# The row of ones is sized from the data rather than a hard-coded 10000, so the product
# stays valid when the sample count changes. Note this is an n x n matrix.
g_linear_x = pd.DataFrame(np.matmul(np.array(df[['g_line_b','g_line_m']]),np.array([np.ones(len(df)), df['x1'] ])))
# To find g(x)-g_bar(x), g_bar(x) is subtracted from every row of g_linear_x;
# axis='columns' aligns the Series index with the column labels.
temp = g_linear_x.sub(df['g_line_bar_atX'], axis = 'columns')**2
# Column means give the variance at each x; their mean is the overall variance
varAt_x = temp.mean()
var_line = np.mean(varAt_x)
print('Variance for unreg linear model is:', np.round(var_line,3))
print("---------------------------------------------")

'''Reg Linear model'''
# Unlike constant model we have to evaluate g_bar at every x
df['g_line_bar_reg_atX'] = g_line_b_reg_bar + g_line_m_reg_bar*df['x1']
#Alternatively can use np.matmul(np.array([g_line_b_reg_bar,g_line_m_reg_bar]),np.array([np.ones(len(df)), df['x1']]))
# bias^2 at x = (f(x) - g_bar(x))^2, then averaged over all x
bias_linear_reg_atX = (df['y1']-df['g_line_bar_reg_atX'])**2
bias_linear_reg = np.mean(bias_linear_reg_atX)
print('Bias sq for regularized linear model is:', np.round(bias_linear_reg,3))

# Row i of g_linear_reg_x is the i-th regularized line evaluated at every x1 (column j <-> x1[j]).
# The row of ones is sized from the data rather than a hard-coded 10000, so the product
# stays valid when the sample count changes. Note this is an n x n matrix.
g_linear_reg_x = pd.DataFrame(np.matmul(np.array(df[['g_line_b_reg','g_line_m_reg']]),np.array([np.ones(len(df)), df['x1'] ])))
# To find g(x)-g_bar(x), g_bar(x) is subtracted from every row of g_linear_reg_x;
# axis='columns' aligns the Series index with the column labels.
temp = g_linear_reg_x.sub(df['g_line_bar_reg_atX'], axis = 'columns')**2
# Column means give the variance at each x; their mean is the overall variance
varAt_reg_x = temp.mean()
var_line_reg = np.mean(varAt_reg_x)
# Label fixed: this is the regularized model's variance (it was printed as "unreg")
print('Variance for regularized linear model is:', np.round(var_line_reg,3))
print("---------------------------------------------")

# Expected error of each model, reported as bias^2 + variance
total_err_cons = bias_cons+var_cons
total_err_line = bias_linear+var_line
total_err_line_reg = bias_linear_reg+var_line_reg
print("Total error for constant model is:", np.round(total_err_cons,3))
print("Total error for unreg linear model is:", np.round(total_err_line,3))
print("Total error for regularized linear model is:", np.round(total_err_line_reg,3))

For constant model, avg g(x)= -0.002 
For linear model, avg g(x)= 0.786 x+ -0.001 
For linear model with regularization, avg g(x)= 0.627 x+ -0.001
---------------------------------------------
Bias Sq for constant model is: 0.497
Variance for constant model is: 0.247
---------------------------------------------
Bias sq for unreg linear model is: 0.204
Variance for unreg linear model is: 1.66
---------------------------------------------
Bias sq for regularized linear model is: 0.23
Variance for unreg linear model is: 0.329
---------------------------------------------
Total error for constant model is: 0.744
Total error for unreg linear model is: 1.864
Total error for regularized linear model is: 0.559


### Validation

In [7]:
# using kfold with custom data:

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression 


# Sample data
# NOTE(review): pd.read_csv() is called without a file path, so this line fails as
# written -- supply the path to the custom dataset. `pd` and `np` are not imported in
# this cell; they come from the first cell of the notebook. This also rebinds `df`.
df = pd.read_csv()
# Create a logistic regression model
# NOTE(review): `L` is not defined anywhere in this cell (it looks like a truncated
# class name). The cell imports LinearRegression while the comment above says
# logistic regression -- TODO confirm which estimator is intended.
model = L

# Define the number of splits for k-fold cross-validation
n_splits = 3

# Initialize the k-fold cross-validation
# shuffle=True with a fixed random_state makes the fold assignment repeatable
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform k-fold cross-validation
# NOTE(review): X and y are never assigned in this cell; they must be built from `df`
# (or be left over from another cell) before this call can run -- TODO confirm.
cv_scores = cross_val_score(model, X, y, cv=kf)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Calculate and print the mean accuracy
# NOTE(review): the score is only an accuracy if the estimator is a classifier -- verify
# once the model above is filled in.
print("Mean accuracy:", np.mean(cv_scores))





Cross-validation scores: [1. 1. 0.]
Mean accuracy: 0.6666666666666666


In [6]:
# using kfold with iris data:

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Iris dataset: feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Estimator under evaluation: logistic regression with default settings
model = LogisticRegression()

# Number of folds
n_splits = 5

# Shuffled k-fold splitter; the fixed seed keeps the fold assignment repeatable
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One score per fold
cv_scores = cross_val_score(model, X, y, cv=kf)

# Per-fold scores
print("Cross-validation scores:", cv_scores)

# Average score across the folds
print("Mean accuracy:", cv_scores.mean())


Cross-validation scores: [1.         1.         0.93333333 0.96666667 0.96666667]
Mean accuracy: 0.9733333333333334


In [10]:

# using kfold with a dataset:

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Read the dataset from the CSV file
data = pd.read_csv("example.csv")

# The label column is assumed to be called 'target'; change the name here if yours differs
X = data.drop(columns='target')
y = data['target']

# Estimator under evaluation: logistic regression with default settings
model = LogisticRegression()

# Number of folds
n_splits = 5

# Shuffled k-fold splitter; the fixed seed keeps the fold assignment repeatable
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One score per fold
cv_scores = cross_val_score(model, X, y, cv=kf)

# Per-fold scores
print("Cross-validation scores:", cv_scores)

# Average score across the folds
print("Mean accuracy:", cv_scores.mean())

FileNotFoundError: [Errno 2] No such file or directory: 'example.csv'