<a href="https://colab.research.google.com/github/aegisen/DATA441/blob/main/HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup (Imports, Supporting Functions)

In [132]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy import linalg
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.decomposition import PCA

from scipy.spatial.distance import cdist

# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

import xgboost


In [133]:
!pip install usearch



In [134]:
# Gaussian Kernel
def Gaussian(w):
  """Truncated standard-normal kernel.

  Evaluates 1/sqrt(2*pi) * exp(-w**2/2) element-wise and sets the weight to 0
  wherever |w| > 4. The kernel is symmetric, so the cutoff is applied to the
  absolute value of w; negative arguments are handled like their positive
  counterparts.

  Parameters
  ----------
  w : array-like or scalar
      Scaled distances.

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as w.
  """
  w = np.abs(w)
  return np.where(w>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

# Tricubic Kernel
def Tricubic(w):
  """Tricubic kernel: 70/81 * (1 - |w|**3)**3 for |w| <= 1, and 0 elsewhere.

  The absolute value is taken first so that the kernel is symmetric; without
  it a negative argument would produce weights above the kernel's maximum.

  Parameters
  ----------
  w : array-like or scalar
      Scaled distances.

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as w.
  """
  w = np.abs(w)
  return np.where(w>1,0,70/81*(1-w**3)**3)

# Quartic Kernel
def Quartic(w):
  """Quartic (biweight) kernel: 15/16 * (1 - w**2)**2 for |w| <= 1, and 0 elsewhere.

  The cutoff is applied to |w| so that arguments below -1 receive zero weight
  instead of a large positive one.

  Parameters
  ----------
  w : array-like or scalar
      Scaled distances.

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as w.
  """
  w = np.abs(w)
  return np.where(w>1,0,15/16*(1-w**2)**2)

# Epanechnikov Kernel
def Epanechnikov(w):
  """Epanechnikov kernel: 3/4 * (1 - w**2) for |w| <= 1, and 0 elsewhere.

  The cutoff is applied to |w|; otherwise arguments below -1 would yield
  negative weights.

  Parameters
  ----------
  w : array-like or scalar
      Scaled distances.

  Returns
  -------
  numpy.ndarray
      Kernel weights with the same shape as w.
  """
  w = np.abs(w)
  return np.where(w>1,0,3/4*(1-w**2))


In [135]:
# Load the cars dataset (columns shown below: MPG, CYL, ENG, WGT).
# NOTE(review): the path is relative and presumably requires Google Drive to be
# mounted under the current working directory — confirm before a fresh run.
data = pd.read_csv('drive/MyDrive/Adv. Appl. Machine Learning/data/cars.csv')
data

Unnamed: 0,MPG,CYL,ENG,WGT
0,18.0,8,307.0,3504
1,15.0,8,350.0,3693
2,18.0,8,318.0,3436
3,16.0,8,304.0,3433
4,17.0,8,302.0,3449
...,...,...,...,...
387,27.0,4,140.0,2790
388,44.0,4,97.0,2130
389,32.0,4,135.0,2295
390,28.0,4,120.0,2625


In [136]:
def dist(u,v):
  """Euclidean distances from every row of u to every row of v.

  A 1-D v is treated as a single point. The result has one row per point in v
  and one column per row of u.
  """
  if v.ndim == 1:
    v = v[np.newaxis, :]
  rows = []
  for point in v:
    squared_diff = np.square(u - point)
    rows.append(np.sqrt(squared_diff.sum(axis=1)))
  return np.array(rows)

In [137]:
# Standalone standard scaler instance.
# NOTE(review): no later cell in this notebook appears to use `scale` — confirm it is still needed.
scale = StandardScaler()

In [138]:
def weight_function(u,v,kern=Gaussian,tau=0.5):
    """Kernel weights between every row of u and every row of v.

    The pairwise Euclidean distances are divided by 2*tau and passed through
    `kern`; entry [i, j] of the result is the weight linking u[i] to v[j].
    """
    pairwise = cdist(u, v, metric='euclidean')
    scaled = pairwise/(2*tau)
    return kern(scaled)

# 1. Boosting with Lowess

In [139]:
class Lowess:
    """Locally weighted linear regression with a scikit-learn style fit/predict API.

    The model is a lazy learner: `fit` only memorises the training data, and
    `predict` fits one small ridge regression per query point on the training
    rows scaled by their kernel weight with respect to that point.

    Parameters
    ----------
    kernel : callable
        Maps scaled distances to weights (e.g. Gaussian, Epanechnikov).
    tau : float
        Bandwidth passed to `weight_function`.
    """
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        """Store the training features `x` and targets `y`; returns self."""
        # Trailing-underscore attributes are what check_is_fitted looks for.
        self.xtrain_ = x
        self.yhat_ = y
        return self

    def predict(self, x_new):
        """Predict the target for each query point in `x_new`.

        Parameters
        ----------
        x_new : scalar or 2-D array-like
            A scalar is treated as one query for single-feature training data.

        Returns
        -------
        numpy.ndarray or scalar
            A flat array with one prediction per query row, or a single value
            when `x_new` was a scalar.
        """
        check_is_fitted(self)
        x = np.asarray(self.xtrain_)
        y = np.asarray(self.yhat_)
        lm = linear_model.Ridge(alpha=0.0001)

        scalar_query = np.isscalar(x_new)
        if scalar_query:
            # A single number is one query point with one feature.
            x_new = np.array([[x_new]])
        else:
            x_new = np.asarray(x_new)
        if x.ndim == 1:
            # Single-feature training data given as a flat vector.
            x = x.reshape(-1, 1)

        # w[j, i] is the weight of training row j for query point i.
        w = weight_function(x, x_new, self.kernel, self.tau)

        yest_test = []
        # Looping through all x-points
        for i in range(len(x_new)):
            wi = w[:, i]
            # Scaling the rows by wi gives the same values as np.diag(wi) @ ...
            # without materialising an n-by-n matrix for every query point.
            # NOTE(review): both x and y are scaled by w (not sqrt(w)) and the
            # intercept is fitted on the scaled data, which differs from textbook
            # weighted least squares; kept unchanged to preserve results.
            lm.fit(wi[:, np.newaxis] * x, (wi * y.T).T)
            yest_test.append(lm.predict(x_new[i].reshape(1, -1)))

        yest = np.array(yest_test).flatten()
        return yest[0] if scalar_query else yest


In [140]:
# Scalers

# One instance of each feature scaler compared in the cross-validation below.
quantileScaler = QuantileTransformer(n_quantiles=10, random_state=445)
stdScaler = StandardScaler()
mmScaler = MinMaxScaler()

In [141]:
# Number of boosting steps (residual-fitting rounds stacked on top of the base model)
steps = 5

# Hold models: the base learner followed by the residual learners of the latest fit
models = []

# Setup lists to hold MSEs for each Scaler
quant_mse_lwr = []
quant_mse_rf = []

std_mse_lwr = []
std_mse_rf = []

mm_mse_lwr = []
mm_mse_rf = []

# Iterate through this list to track MSE for each scaler
SCALERS = [[quantileScaler, quant_mse_lwr, quant_mse_rf],
           [stdScaler, std_mse_lwr, std_mse_rf],
           [mmScaler, mm_mse_lwr, mm_mse_rf]]

# Set X and Y
x = data.loc[:,"CYL":"WGT"].values
y = data['MPG'].values

kf = KFold(n_splits=10,shuffle=True,random_state=445)

# Setup models
# NOTE(review): max_depth of the random forest is tied to the number of boosting
# steps; confirm this coupling is intended.
model_rf = xgboost.XGBRFRegressor(n_estimators=150,max_depth=steps)
# model_1 is the base learner; model_2 is the template (kernel and bandwidth)
# from which a fresh residual learner is created in every boosting round.
model_1 = Lowess(kernel= Epanechnikov,tau=0.14)
model_2 = Lowess(kernel= Gaussian)

# At least one residual round is always performed.
n_rounds = max(steps, 1)

# Do KFold split
for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain].ravel()
  ytest = y[idxtest].ravel()
  xtest = x[idxtest]

  # Try different scalers + track MSEs
  for scalerSet in SCALERS:
    scaled_xtrain = scalerSet[0].fit_transform(xtrain)
    scaled_xtest = scalerSet[0].transform(xtest)

    # Base learner: predict train (for the residuals) and test exactly once each.
    model_1.fit(scaled_xtrain, ytrain)
    yhat_train = model_1.predict(scaled_xtrain)
    yhat_lw = model_1.predict(scaled_xtest)
    models = [model_1]

    # Each round fits a NEW learner on the residuals of the ensemble so far and
    # adds its prediction to the running totals, so every correction is kept.
    for step in range(n_rounds):
      residuals_train = ytrain - yhat_train
      booster = Lowess(kernel=model_2.kernel, tau=model_2.tau)
      booster.fit(scaled_xtrain, residuals_train)
      models.append(booster)

      yhat_lw = yhat_lw + booster.predict(scaled_xtest)
      # The train prediction is only needed if another round follows.
      if step < n_rounds - 1:
        yhat_train = yhat_train + booster.predict(scaled_xtrain)

    model_rf.fit(scaled_xtrain,ytrain)
    yhat_rf = model_rf.predict(scaled_xtest)

    # Track MSE for each scaler
    scalerSet[1].append(mse(ytest,yhat_lw))
    scalerSet[2].append(mse(ytest,yhat_rf))


print("** QUANTILE SCALE **\nThe Cross-validated Mean Squared Error for Locally Weighted Regression is : "+str(np.mean(SCALERS[0][1])))
print("\nThe Cross-validated Mean Squared Error for a DT-based method: "+str(np.mean(SCALERS[0][2])))

print("\n\n")

print("** STANDARD SCALE **\nThe Cross-validated Mean Squared Error for Locally Weighted Regression is : "+str(np.mean(SCALERS[1][1])))
print("\nThe Cross-validated Mean Squared Error for a DT-based method: "+str(np.mean(SCALERS[1][2])))

print("\n\n")

print("** MINMAX SCALE **\nThe Cross-validated Mean Squared Error for Locally Weighted Regression is : "+str(np.mean(SCALERS[2][1])))
print("\nThe Cross-validated Mean Squared Error for a DT-based method: "+str(np.mean(SCALERS[2][2])))

KeyboardInterrupt: 

# 2. USearch KNN Regression

## Preliminary Steps (imports)

In [142]:
from usearch.index import search, MetricKind, Matches, BatchMatches
from sklearn.model_selection import train_test_split


## Main

In [143]:
# Generate 10'000 random vectors with 1024 dimensions.
# A dedicated seeded generator keeps the demo reproducible across kernel restarts
# without touching NumPy's global random state.
demo_rng = np.random.default_rng(445)
vectors = demo_rng.random((10000, 1024), dtype=np.float32)
vector = demo_rng.random(1024, dtype=np.float32)

# Exact search for the 50 nearest vectors to `vector` under the squared-L2 metric.
one_in_many: Matches = search(vectors, vector, 50, MetricKind.L2sq, exact=True)
# many_in_many: BatchMatches = search(vectors, vectors, 50, MetricKind.L2sq, exact=True)


In [144]:
# Features are the CYL..WGT columns, the target is MPG.
x = data.loc[:,"CYL":"WGT"].values
y = data['MPG'].values

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=445)

In [145]:
# Inspect the feature matrix (CYL, ENG, WGT columns).
x

array([[   8.,  307., 3504.],
       [   8.,  350., 3693.],
       [   8.,  318., 3436.],
       ...,
       [   4.,  135., 2295.],
       [   4.,  120., 2625.],
       [   4.,  119., 2720.]])

In [146]:
# Exact search for 3 neighbours per query under the squared-L2 metric.
# NOTE(review): presumably x_train is the searched dataset and x_test holds the
# queries (usearch `search` argument order) — confirm. Features are unscaled here.
many_in_many: BatchMatches = search(x_train, x_test, 3, MetricKind.L2sq, exact=True)

In [147]:
# Flatten the batch result into a list of (key, distance) tuples and display it.
output = many_in_many.to_list()
output

[(130, 100.0),
 (2, 1922.0),
 (185, 3226.0),
 (144, 205.0),
 (204, 4280.0),
 (115, 4644.0),
 (128, 36.0),
 (39, 82.0),
 (161, 425.0),
 (22, 221.0),
 (182, 296.0),
 (237, 356.0),
 (139, 49.0),
 (96, 281.0),
 (183, 360.0),
 (99, 61.0),
 (242, 100.0),
 (17, 169.0),
 (157, 484.0),
 (205, 1764.0),
 (119, 2809.0),
 (37, 26.0),
 (82, 36.0),
 (136, 58.25),
 (91, 373.0),
 (218, 533.0),
 (105, 776.0),
 (205, 356.0),
 (157, 1156.0),
 (102, 1965.0),
 (47, 20449.0),
 (12, 34514.0),
 (172, 38746.0),
 (220, 0.0),
 (242, 58.0),
 (17, 89.0),
 (109, 37.0),
 (83, 436.0),
 (207, 1070.0),
 (144, 65.0),
 (204, 2438.0),
 (115, 3078.0),
 (92, 388.0),
 (166, 818.0),
 (95, 925.0),
 (185, 1345.0),
 (130, 1649.0),
 (2, 2117.0),
 (87, 625.0),
 (150, 972.0),
 (79, 1388.0),
 (189, 104.0),
 (64, 180.0),
 (165, 244.0),
 (179, 1902.0),
 (227, 2998.0),
 (222, 3205.0),
 (77, 1369.0),
 (40, 2525.0),
 (66, 2777.0),
 (40, 400.0),
 (86, 626.0),
 (44, 1508.0),
 (212, 3796.0),
 (233, 5200.0),
 (157, 6760.0),
 (15, 306.0),
 (25

In [148]:
# Split the flattened (key, distance) tuples into neighbour indices and distances.
# NOTE(review): this rebinds `dist`, hiding the dist(u, v) helper defined earlier in the notebook.
ind = np.array(output)[:,0].astype('int64')
dist = np.array(output)[:,1].astype(float)

# Inverse-distance weights for entries 5..9 of the flattened list.
# NOTE(review): with 3 neighbours per query this slice appears to straddle
# several queries — confirm the intended grouping.
weights = 1/dist[5:10]
# A zero distance yields an infinite weight; cap it at 100 before normalising.
weights[weights==np.inf]=100
# The value of the next expression is neither stored nor displayed.
np.sum(weights)
# Normalise so the weights sum to 1.
weights = weights/np.sum(weights)

#weights

# Display the matches of the single-query search on the random vectors.
one_in_many.to_list()


[(4715, 150.3211669921875),
 (2724, 151.3099822998047),
 (9974, 151.51263427734375),
 (5361, 151.82955932617188),
 (1910, 151.96566772460938),
 (8897, 152.19728088378906),
 (7180, 152.3291015625),
 (5165, 153.63076782226562),
 (8393, 154.0254669189453),
 (5225, 154.2865447998047),
 (1460, 154.8953399658203),
 (5450, 155.20074462890625),
 (9267, 155.21224975585938),
 (79, 155.463623046875),
 (180, 155.49050903320312),
 (6760, 155.69512939453125),
 (1247, 155.71969604492188),
 (141, 155.9477081298828),
 (8651, 156.06082153320312),
 (9746, 156.1276092529297),
 (7296, 156.48178100585938),
 (7568, 156.58126831054688),
 (589, 156.6236572265625),
 (8459, 156.70033264160156),
 (2734, 156.718017578125),
 (6100, 156.85723876953125),
 (6966, 156.92471313476562),
 (795, 156.94361877441406),
 (3815, 157.16461181640625),
 (6054, 157.22767639160156),
 (3889, 157.4033203125),
 (3003, 157.40594482421875),
 (5189, 157.42568969726562),
 (406, 157.4899444580078),
 (486, 157.50656127929688),
 (3735, 157.61

In [149]:
# Inspect the neighbour distances extracted from `output`.
dist

array([1.0000e+02, 1.9220e+03, 3.2260e+03, 2.0500e+02, 4.2800e+03,
       4.6440e+03, 3.6000e+01, 8.2000e+01, 4.2500e+02, 2.2100e+02,
       2.9600e+02, 3.5600e+02, 4.9000e+01, 2.8100e+02, 3.6000e+02,
       6.1000e+01, 1.0000e+02, 1.6900e+02, 4.8400e+02, 1.7640e+03,
       2.8090e+03, 2.6000e+01, 3.6000e+01, 5.8250e+01, 3.7300e+02,
       5.3300e+02, 7.7600e+02, 3.5600e+02, 1.1560e+03, 1.9650e+03,
       2.0449e+04, 3.4514e+04, 3.8746e+04, 0.0000e+00, 5.8000e+01,
       8.9000e+01, 3.7000e+01, 4.3600e+02, 1.0700e+03, 6.5000e+01,
       2.4380e+03, 3.0780e+03, 3.8800e+02, 8.1800e+02, 9.2500e+02,
       1.3450e+03, 1.6490e+03, 2.1170e+03, 6.2500e+02, 9.7200e+02,
       1.3880e+03, 1.0400e+02, 1.8000e+02, 2.4400e+02, 1.9020e+03,
       2.9980e+03, 3.2050e+03, 1.3690e+03, 2.5250e+03, 2.7770e+03,
       4.0000e+02, 6.2600e+02, 1.5080e+03, 3.7960e+03, 5.2000e+03,
       6.7600e+03, 3.0600e+02, 1.7640e+03, 2.4040e+03, 5.8600e+02,
       2.9160e+03, 3.5090e+03, 5.0000e+01, 1.2500e+02, 3.5600e

In [150]:
# Inspect the normalised inverse-distance weights.
weights

array([0.00457509, 0.59018703, 0.2591065 , 0.04999231, 0.09613906])

In [151]:
# Inspect the feature matrix (CYL, ENG, WGT columns).
x

array([[   8.,  307., 3504.],
       [   8.,  350., 3693.],
       [   8.,  318., 3436.],
       ...,
       [   4.,  135., 2295.],
       [   4.,  120., 2625.],
       [   4.,  119., 2720.]])

## Wrapping USearch in a KNN Regressor Class

In [152]:
from usearch.index import Index



In [160]:
class USearchKNN:
  """k-nearest-neighbour regressor backed by a USearch index.

  Parameters
  ----------
  k : int, default 3
      Number of neighbours whose targets are averaged for each prediction.
  metric : MetricKind or None, default None
      Metric handed to the USearch index. None selects MetricKind.Cos.
  """
  def __init__(self, k = 3, metric = None):
    self.k = k
    self.metric = metric
    self.index = None

  def fit(self, x, y):
    """Build the index from the rows of `x` and remember the targets `y`.

    Returns self so that calls can be chained.
    """
    # Plain arrays guarantee positional indexing of the targets and row-wise
    # iteration even when pandas objects are passed in.
    self.x = np.asarray(x)
    self.y = np.asarray(y)

    metric = MetricKind.Cos if self.metric is None else self.metric
    self.index = Index(ndim = self.x.shape[1], metric = metric, expansion_search=64)
    # Keys are the row positions, so search results index directly into self.y.
    self.index.add(np.arange(len(self.x)), self.x)
    return self

  def predict(self, x):
    """Return a list with the mean target of the k nearest neighbours of each row of `x`.

    Raises
    ------
    RuntimeError
        If called before `fit`.
    """
    if self.index is None:
      raise RuntimeError("USearchKNN.predict called before fit")

    preds = []
    for item in np.asarray(x):
      pair = self.index.search(item.reshape(1, -1), count = self.k)
      index = pair.keys

      pred = np.mean(self.y[index])
      preds.append(pred)

    return preds

In [154]:
# Inspect the feature matrix (CYL, ENG, WGT columns).
x

array([[   8.,  307., 3504.],
       [   8.,  350., 3693.],
       [   8.,  318., 3436.],
       ...,
       [   4.,  135., 2295.],
       [   4.,  120., 2625.],
       [   4.,  119., 2720.]])

In [None]:
# Inspect the target vector (MPG).
y

In [156]:
# Setup Data
# Features are the CYL..WGT columns, the target is MPG.
x = data.loc[:,"CYL":"WGT"].values
y = data['MPG'].values

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=445)

In [161]:
# Train model and predict
# NOTE(review): the variable shares its name with the `usearch` package; only
# names from `usearch.index` are imported in this notebook, so nothing is shadowed.
usearch = USearchKNN()

usearch.fit(x_train, y_train)

y_pred = usearch.predict(x_test)

# Arguments in (y_true, y_pred) order, matching the cross-validation cell above;
# the squared error is symmetric, so the reported value is unchanged.
print("MSE: ", mse(y_test, y_pred))

MSE:  40.74267493691195
