In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load your dataset - assume glass_data.csv contains the data with relevant columns
# Let's assume the 4th column is aluminum content (x) and another column is refractive index (y)
data = pd.read_csv('glass.dat.txt',sep='\s+')
# print(data)
y=data['RI']
x=data['Al']


       RI     Na    Mg    Al     Si     K    Ca    Ba   Fe  type
1    3.01  13.64  4.49  1.10  71.78  0.06  8.75  0.00  0.0  WinF
2   -0.39  13.89  3.60  1.36  72.73  0.48  7.83  0.00  0.0  WinF
3   -1.82  13.53  3.55  1.54  72.99  0.39  7.78  0.00  0.0  WinF
4   -0.34  13.21  3.69  1.29  72.61  0.57  8.22  0.00  0.0  WinF
5   -0.58  13.27  3.62  1.24  73.08  0.55  8.07  0.00  0.0  WinF
..    ...    ...   ...   ...    ...   ...   ...   ...  ...   ...
210 -1.77  14.14  0.00  2.88  72.61  0.08  9.18  1.06  0.0  Head
211 -1.15  14.92  0.00  1.99  73.06  0.00  8.40  1.59  0.0  Head
212  2.65  14.36  0.00  2.02  73.42  0.00  8.44  1.64  0.0  Head
213 -1.49  14.38  0.00  1.94  73.61  0.00  8.48  1.57  0.0  Head
214 -0.89  14.23  0.00  2.08  73.36  0.00  8.62  1.67  0.0  Head

[214 rows x 10 columns]


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load your dataset - assume glass_data.csv contains the data with relevant columns
# Let's assume the 4th column is aluminum content (x) and another column is refractive index (y)
data = pd.read_csv('glass.dat.txt',sep='\s+')
# print(data)
y=data['RI']
x=data['Al']

def gaussian_kernel(u):
    return (1 / np.sqrt(2 * np.pi)) * np.exp(-0.5 * u ** 2)

def epanechnikov_kernel(u):
    return 0.75 * (1 - u ** 2) * (np.abs(u) <= 1)

def nadaraya_watson(x_train, y_train, x_point, h, kernel):
    w=kernel((x_point-x_train)/h)
    return np.sum(w*y_train)/np.sum(w)

def lpocv(x, y, bandwidth, p=4):
    n = len(x)
    errors = []
    
    for i in range(n):
        if not i>=211:
            idx_val = np.arange(i, min(i + p, n)) 
        idx_val=idx_val.astype(int)
        idx_train = np.delete(np.arange(n), idx_val)
        
        x_train, y_train = x[idx_train], y[idx_train]
        x_val, y_val = x[idx_val], y[idx_val]
        
        y_pred = np.array([nadaraya_watson(x_train, y_train, x_v, bandwidth, gaussian_kernel) for x_v in x_val])
        
        error = np.mean((y_pred - y_val) ** 2)
        errors.append(error)
    
    
    return np.mean(errors)

def find_optimal_bandwidth(x, y, kernel, h_values):
    risks = []
    
    for h in h_values:
        risk = lpocv(x, y, kernel, h)
        risks.append(risk)
    
    optimal_h = h_values[np.argmin(risks)]
    
    return optimal_h, risks

def generate_plots(x, y, kernel, h_values, kernel_name):
    # optimal bandwidth
    optimal_h, risks = find_optimal_bandwidth(x, y, kernel, h_values)
    
    # Oversmoothed
    large_h = max(h_values)
    
    # Undersmoothed
    small_h = min(h_values)
    
    # Just Right
    just_right_h = optimal_h
    
    # Create subplots
    plt.figure(figsize=(10, 8))
    
    # Oversmoothed
    plt.subplot(2, 2, 1)
    y_pred_large_h = [nadaraya_watson(x, y, x_point, large_h, kernel) for x_point in x]
    plt.scatter(x, y, label='Data')
    plt.plot(x, y_pred_large_h, label=f'Oversmoothed (h={large_h})', color='red')
    plt.title('Oversmoothed')
    plt.legend()
    
    # Undersmoothed
    plt.subplot(2, 2, 2)
    y_pred_small_h = [nadaraya_watson(x, y, x_point, small_h, kernel) for x_point in x]
    plt.scatter(x, y, label='Data')
    plt.plot(x, y_pred_small_h, label=f'Undersmoothed (h={small_h})', color='blue')
    plt.title('Undersmoothed')
    plt.legend()
    
    # Just Right
    plt.subplot(2, 2, 3)
    y_pred_just_right_h = [nadaraya_watson(x, y, x_point, just_right_h, kernel) for x_point in x]
    plt.scatter(x, y, label='Data')
    plt.plot(x, y_pred_just_right_h, label=f'Just Right (h={just_right_h})', color='green')
    plt.title('Just Right')
    plt.legend()
    
    # Cross-Validation Curve
    plt.subplot(2, 2, 4)
    plt.plot(h_values, risks, label='Cross-Validation Risk')
    plt.scatter([optimal_h], [min(risks)], color='red', label=f'Optimal h = {optimal_h}')
    plt.title('Risk vs Bandwidth')
    plt.xlabel('Bandwidth (h)')
    plt.ylabel('Risk (MSE)')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(f"{kernel_name}_kernel_regression.png")
    plt.show()


h_values = np.linspace(0.01, 1, 50)

generate_plots(x,y,gaussian_kernel,h_values,'gaussian')

generate_plots(x,y,epanechnikov_kernel,h_values,'epanechnikov')


KeyError: "None of [Index([0], dtype='int32')] are in the [index]"