<a href="https://colab.research.google.com/github/aarontavel/DATA441/blob/main/Tavel_HW2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy usearch

Collecting usearch
  Downloading usearch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: usearch
Successfully installed usearch-2.9.0


In [None]:
# Imports

import numpy as np
from xgboost import XGBRegressor
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy import linalg
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV

from scipy.spatial.distance import cdist

# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from math import ceil

from usearch.index import search, MetricKind, Matches, BatchMatches, Index

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. This relies on the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1) Gradient Boosting Class

In [None]:
# Gaussian Kernel
def Gaussian(w):
  """Standard normal density, truncated to zero beyond four bandwidths.

  Parameters
  ----------
  w : scalar or array-like
      Scaled distances (distance divided by bandwidth). Only the magnitude
      matters because the kernel is symmetric.

  Returns
  -------
  numpy.ndarray
      ``exp(-w**2 / 2) / sqrt(2*pi)`` where ``|w| <= 4`` and 0 elsewhere.
  """
  w = np.asarray(w)
  # Truncate on |w| so that large negative arguments are zeroed as well.
  return np.where(np.abs(w)>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

# Tricubic Kernel
def Tricubic(w):
  """Tricubic kernel ``70/81 * (1 - |w|**3)**3`` on ``|w| <= 1``, 0 elsewhere.

  Parameters
  ----------
  w : scalar or array-like
      Scaled distances (distance divided by bandwidth).

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as ``w``.
  """
  # Work with |w|: the odd power would otherwise make the kernel asymmetric
  # (and larger than its peak) for negative arguments.
  a = np.abs(w)
  return np.where(a>1,0,70/81*(1-a**3)**3)

# Quartic Kernel
def Quartic(w):
  """Quartic (biweight) kernel ``15/16 * (1 - w**2)**2`` on ``|w| <= 1``, 0 elsewhere.

  Parameters
  ----------
  w : scalar or array-like
      Scaled distances (distance divided by bandwidth).

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as ``w``.
  """
  # Cut off on |w| so arguments below -1 are zeroed instead of growing.
  a = np.abs(w)
  return np.where(a>1,0,15/16*(1-a**2)**2)

# Epanechnikov Kernel
def Epanechnikov(w):
  """Epanechnikov kernel ``3/4 * (1 - w**2)`` on ``|w| <= 1``, 0 elsewhere.

  Parameters
  ----------
  w : scalar or array-like
      Scaled distances (distance divided by bandwidth).

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as ``w``.
  """
  # Cut off on |w| so arguments below -1 yield 0 rather than negative weights.
  a = np.abs(w)
  return np.where(a>1,0,3/4*(1-a**2))

In [None]:
def lw_ag_md(x, y, xnew,f=2/3,iter=3, intercept=True):
  """Robust locally weighted linear regression for 1-D or multi-D inputs.

  A ridge-stabilised weighted linear model is solved around every training
  point. The weights come from the Epanechnikov kernel applied to the
  distances scaled by that point's bandwidth (the distance to its
  ``ceil(f * n)``-th sorted neighbour). After each pass the residuals are
  turned into bisquare robustness weights for the next pass. The fitted
  training values are finally interpolated at ``xnew``.

  Parameters
  ----------
  x : numpy.ndarray, shape (n,) or (n, p)
      Training inputs.
  y : numpy.ndarray, shape (n,) or (n, 1)
      Training targets.
  xnew : array-like
      Query points. With a single feature it is handed to a 1-D linear
      interpolator as is; otherwise each row is one query point (a single
      1-D vector is treated as one point).
  f : float, default 2/3
      Fraction of the training set that defines the local bandwidth.
  iter : int, default 3
      Number of fitting passes (robustness weights are updated between passes).
  intercept : bool, default True
      Whether to prepend a column of ones to the design matrix.

  Returns
  -------
  numpy.ndarray
      Predictions at ``xnew``. Points where linear interpolation yields NaN
      are filled from the nearest training point's fitted value.
  """
  xnew = np.asarray(xnew)
  n = len(x)
  # Index of the sorted neighbour that sets the bandwidth; capped so that f
  # close to 1 cannot index past the last distance.
  r = min(int(ceil(f * n)), n - 1)
  yest = np.zeros(n)

  if len(y.shape)==1: # here we make column vectors
    y = y.reshape(-1,1)

  if len(x.shape)==1:
    x = x.reshape(-1,1)

  if intercept:
    x1 = np.column_stack([np.ones((len(x),1)),x])
  else:
    x1 = x

  # Pairwise Euclidean distances between all training points.
  distances = cdist(x, x)
  # Bandwidth of point i = distance from x[i] to its r-th sorted neighbour.
  h = np.sort(distances, axis=1)[:, r]
  # Duplicated points can give a zero bandwidth; avoid dividing by zero.
  h = np.where(h > 0, h, np.finfo(float).eps)
  # Row i holds the distances from x[i] scaled by x[i]'s own bandwidth, so
  # w[i] below is the weight vector for the local fit centred at x[i].
  w = np.clip(distances / h[:, None], 0.0, 1.0)
  w = Epanechnikov(w)

  # Robustness weights, refreshed after every pass over the data.
  delta = np.ones(n)
  # Small L2 penalty that keeps the normal equations solvable.
  ridge = 0.0001*np.eye(x1.shape[1])

  for iteration in range(iter):
    for i in range(n):
      # Product of the two diagonal weightings, kept as a vector rather than
      # materialising n-by-n diagonal matrices for every point.
      weights = delta * w[i]
      xw = x1 * weights[:, None]
      A = xw.T.dot(x1) + ridge
      b = xw.T.dot(y)
      beta = linalg.solve(A, b)
      yest[i] = np.dot(x1[i],beta.ravel())

    residuals = y.ravel() - yest
    s = np.median(np.abs(residuals))
    if s == 0:
      # At least half of the points are fitted exactly; the bisquare update
      # would divide by zero, so keep the current fit.
      break

    delta = np.clip(residuals / (6.0 * s), -1, 1)
    delta = (1 - delta ** 2) ** 2

  # Predictions for xnew are interpolated from the fitted training values.
  if x.shape[1]==1:
    interpolate = interp1d(x.flatten(),yest,fill_value='extrapolate')
    output = interpolate(xnew)
  else:
    if xnew.ndim == 1:
      xnew = xnew.reshape(1,-1)
    output = np.zeros(len(xnew))
    # Interpolate in at most 3 principal components (fewer for 2-feature data).
    n_components = min(3, x.shape[1])
    # A Delaunay triangulation in d dimensions needs at least d + 2 points.
    k = min(n, max(r, n_components + 2))
    for i in range(len(xnew)):
      ind = np.argsort(np.sqrt(np.sum((x-xnew[i])**2,axis=1)))[:k]
      pca = PCA(n_components=n_components)
      x_pca = pca.fit_transform(x[ind])
      tri = Delaunay(x_pca,qhull_options='QJ Pp')
      local_interp = LinearNDInterpolator(tri,yest[ind])
      output[i] = local_interp(pca.transform(xnew[i].reshape(1,-1)))[0]
      # the output may have NaN's where the data points from xnew are outside the convex hull of X

  nan_mask = np.isnan(output)
  if nan_mask.any():
    g = NearestNDInterpolator(x,yest.ravel())
    output[nan_mask] = g(xnew[nan_mask])
  return output

In [None]:
class Lowess_AG_MD: #Base class pulled from Professor's GitHub
    """Estimator-style wrapper around the ``lw_ag_md`` function.

    ``fit`` only stores the training data; the locally weighted regression
    itself is computed inside ``predict``, which forwards everything to
    ``lw_ag_md``.

    Parameters
    ----------
    f : float, default 1/10
        Passed to ``lw_ag_md`` as ``f``.
    iter : int, default 3
        Passed to ``lw_ag_md`` as ``iter``.
    intercept : bool, default True
        Passed to ``lw_ag_md`` as ``intercept``.
    """

    def __init__(self, f = 1/10, iter = 3,intercept=True):
        self.f = f
        self.iter = iter
        self.intercept = intercept
        # Initialised here so is_fitted() is safe to call before fit().
        self._is_fitted = False

    def fit(self, x, y):
        """Store the training inputs ``x`` and targets ``y``; returns ``self``."""
        # Trailing-underscore attributes are what check_is_fitted looks for.
        self.xtrain_ = x
        self.yhat_ = y
        self._is_fitted = True
        return self

    def is_fitted(self):
        """Return True once ``fit`` has been called, False otherwise."""
        return getattr(self, "_is_fitted", False)

    def predict(self, x_new):
        """Run ``lw_ag_md`` on the stored training data and evaluate it at ``x_new``.

        ``check_is_fitted`` raises if ``fit`` has not been called yet.
        """
        check_is_fitted(self)
        return lw_ag_md(
            self.xtrain_, self.yhat_, x_new, self.f, self.iter, self.intercept
        ) # this is actually our defined function of Lowess

    def get_params(self, deep=True):
        """Return the hyper-parameters ``f``, ``iter`` and ``intercept`` as a dict."""
        return {"f": self.f, "iter": self.iter,"intercept":self.intercept}

    def set_params(self, **parameters):
        """Set each given keyword as an attribute of the same name; returns ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
class GradientBoosting:
  """Boosted locally weighted regression built from pairs of ``Lowess_AG_MD`` learners.

  Every boosting round fits one learner to the current residuals, subtracts
  its training predictions, and fits a second learner to what is left. The
  ensemble prediction is the sum of all learners' predictions.

  Parameters
  ----------
  f : float, default 1/3
      ``f`` passed to every ``Lowess_AG_MD`` learner.
  epoch : int, default 2
      Passed to every learner as its ``iter`` argument.
  intercept : bool, default True
      ``intercept`` passed to every learner.
  n_estimators : int, default 3
      Number of boosting rounds (each round holds two learners).
  scaler : object or None, default None
      Optional transformer exposing ``fit_transform`` and ``transform``.
  """

  def __init__(self, f = 1/3, epoch = 2, intercept = True, n_estimators = 3, scaler = None):
    self.f = f
    self.epoch = epoch
    self.intercept = intercept
    self.n_estimators = n_estimators
    self.estimators = []
    self.scaler = scaler
    self._is_fitted = False

  def fit(self, X, y, X_new=None):
    """Fit the boosting rounds on ``X`` / ``y``; returns ``self``.

    ``X_new`` is accepted only for backward compatibility and is ignored:
    the scaler must be fitted on the training data alone, otherwise test
    statistics leak into the model and ``predict`` would later transform
    with a scaler fitted on the wrong data.
    """
    self.estimators = []
    # Flat float vector so subtracting 1-D predictions never broadcasts.
    residuals = np.asarray(y, dtype=float).ravel()

    if self.scaler is not None:
      self.X_scaled = self.scaler.fit_transform(X)
    else:
      self.X_scaled = X

    for stage in range(self.n_estimators): #loops for every estimator you want
      model1 = Lowess_AG_MD(f=self.f, iter=self.epoch, intercept=self.intercept)
      model1.fit(self.X_scaled, residuals)
      residuals = residuals - model1.predict(self.X_scaled)

      model2 = Lowess_AG_MD(f=self.f, iter=self.epoch, intercept=self.intercept)
      model2.fit(self.X_scaled, residuals)

      self.estimators.append((model1, model2))

      # Remove the second learner's contribution too, so the next round
      # targets what is still unexplained instead of refitting the same
      # residuals. Skipped after the last round, where it would be unused.
      if stage < self.n_estimators - 1:
        residuals = residuals - model2.predict(self.X_scaled)

    self._is_fitted = True
    return self

  def is_fitted(self):
    """Return True once ``fit`` has completed."""
    return self._is_fitted

  def predict(self, X):
    """Return the ensemble prediction for ``X`` as a 1-D array.

    The result is the sum of the predictions of every stored learner. When
    the model has not been fitted, ``False`` is returned instead.
    """
    if not self.is_fitted():
      return(self.is_fitted())

    if self.scaler is not None:
      self.X_new_scaled = self.scaler.transform(X)
    else:
      self.X_new_scaled = X

    total = np.zeros(len(self.X_new_scaled))
    for model1, model2 in self.estimators:
      total = total + model1.predict(self.X_new_scaled) + model2.predict(self.X_new_scaled)

    return total

In [None]:
# Read the concrete dataset from the Drive-hosted CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Sophomore Year/DATA/DATA-310/Module 3/Regression Problems/Data/concrete.csv"
)

In [None]:
# Features: every column from 'cement' through 'age'; target: 'strength'.
x = data.loc[:, 'cement':'age'].to_numpy()
y = data['strength'].to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = tts(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=123,
)

In [None]:
#Standard Scaler
scaler = StandardScaler()
model = GradientBoosting(scaler = scaler)
model.fit(X = xtrain, y = ytrain, X_new = xtest)
# A bare expression in the middle of a cell is silently discarded, so the
# fitted flag is asserted instead of merely evaluated.
assert model.is_fitted()
model.predict(xtest)

In [None]:
#MinMax Scaler
scaler = MinMaxScaler()
model = GradientBoosting(scaler = scaler)
model.fit(X = xtrain, y = ytrain, X_new = xtest)
# A bare expression in the middle of a cell is silently discarded, so the
# fitted flag is asserted instead of merely evaluated.
assert model.is_fitted()
model.predict(xtest)

In [None]:
#Quantile Scaler
# n_quantiles defaults to 1000; cap it at the number of training rows so the
# transformer does not warn and silently clip it when there are fewer samples.
scaler = QuantileTransformer(n_quantiles=min(1000, len(xtrain)))
model = GradientBoosting(scaler = scaler)
model.fit(X = xtrain, y = ytrain, X_new = xtest)
# A bare expression in the middle of a cell is silently discarded, so the
# fitted flag is asserted instead of merely evaluated.
assert model.is_fitted()
model.predict(xtest)

In [None]:
# 10-fold cross-validated comparison of the boosted Lowess model and XGBoost.
mse_lwr = [] #Lowess
mse_xgb = [] #XGBRegressor
scale = QuantileTransformer(n_quantiles=900)

kf = KFold(n_splits=10,shuffle=True,random_state=1234)

model_xgb = XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=4)

# Each fold is already transformed by `scale` inside the loop, so no second
# scaler is handed to the booster (previously this reused whatever `scaler`
# an earlier cell had left behind, scaling the data twice).
model_lwr = GradientBoosting(scaler = None)

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  # Fit the transformer on the training fold only, then apply it to the test fold.
  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  model_xgb.fit(xtrain,ytrain)
  yhat_xgb = model_xgb.predict(xtest)

  mse_xgb.append(mse(ytest,yhat_xgb))

  model_lwr.fit(X = xtrain, y = ytrain, X_new = xtest)
  yhat_lwr = np.asarray(model_lwr.predict(xtest))
  if yhat_lwr.ndim > 1:
    # One row per boosting stage: add the stages up to get a single
    # prediction per test sample.
    yhat_lwr = yhat_lwr.sum(axis=0)

  # list.append takes exactly one argument: store the fold's MSE itself.
  mse_lwr.append(mse(ytest,yhat_lwr))

print('The Cross-validated Mean Squared Error for Locally Weighted Regression is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for XGB: '+str(np.mean(mse_xgb)))

### 2) USearch


In [None]:
class kNN:
    """k-nearest-neighbour regressor with inverse-distance weighting.

    Features are standardised with a ``StandardScaler`` fitted on the training
    data. Neighbours are found with an exact search using the ``L2sq`` metric,
    and the reported distances are used directly as the weighting distance.

    Parameters
    ----------
    k : int
        Number of neighbours requested per query point.
    """

    def __init__(self, k):
        self.k = k
        self.ss = StandardScaler()

    def fit(self, X, y):
        """Standardise and store the training inputs and targets; returns ``self``."""
        self.X_train = self.ss.fit_transform(X)
        # Stored as an array so neighbours can be selected by index arrays.
        self.y_train = np.asarray(y)
        return self

    @staticmethod
    def _weighted_average(distances, targets):
        """Inverse-distance weighted mean of ``targets``.

        Neighbours at distance zero are exact matches: when any exist, the
        prediction is the plain mean of their targets. This avoids dividing by
        zero and avoids giving an exact match less weight than a merely close
        neighbour.
        """
        distances = np.asarray(distances, dtype=float)
        targets = np.asarray(targets, dtype=float)

        exact = distances == 0
        if exact.any():
            return np.mean(targets[exact])

        weights = 1 / distances
        weighted_sum = np.sum(weights)
        if weighted_sum != 0 and np.isfinite(weighted_sum):
            return np.sum(weights * targets) / weighted_sum
        # Degenerate weights: fall back to the unweighted neighbour mean.
        return np.mean(targets)

    def predict(self, X):
        """Predict one value per row of ``X``; returns a list of floats."""
        # Scale all query rows once instead of once per row.
        X_scaled = self.ss.transform(X)

        predictions = []
        for row in X_scaled:
            one_in_many = search(self.X_train, row.reshape(1, -1), self.k, MetricKind.L2sq, exact=True)

            # Rows of (neighbour index, distance); converted to an array once.
            matches = np.array(one_in_many.to_list())
            nearest_indices = matches[:, 0].astype('int64')
            nearest_distances = matches[:, 1].astype(float)

            predictions.append(
                self._weighted_average(nearest_distances, self.y_train[nearest_indices])
            )

        return predictions


In [None]:
# Feature matrix and target for the k-NN section. The stray bare `data`
# expression was removed: only a cell's last expression is displayed, so it
# had no effect.
x = data.loc[:, 'cement':'age'].to_numpy()

y = data['strength'].to_numpy()

In [None]:
# Regressor that uses the five nearest neighbours.
model = kNN(k=5)

In [None]:
# Fit on the full dataset.
model.fit(X=x, y=y)

In [None]:
# Show only the first few predictions instead of dumping one value per row.
model.predict(x)[:10]