# Testing out minimal numpy implementations of ML algorithms

In [1]:
import numpy as np
import pandas as pd

import sklearn.datasets as skd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

# Display settings: 4 digits of precision for printed numpy arrays, and
# width-6, two-decimal, comma-grouped floats for pandas frames.
np.set_printoptions(precision=4)
pd.options.display.float_format = '{:6,.2f}'.format

# Seed numpy's global RNG so that Restart & Run All reproduces the numbers
# below. scikit-learn's generators and train_test_split fall back to this
# global state when no random_state is passed; numpy_ml's randomized
# estimators presumably draw from it as well (TODO confirm).
SEED = 42
np.random.seed(SEED)

In [2]:
import numpy_ml as nml

## regression

### well posed

In [7]:
# Well-posed case: many more samples (1000) than features (10), with 5 of the
# 10 true coefficients informative. random_state pins the draw so the table
# below is identical every time this cell is run.
X, y, b = skd.make_regression(n_samples=1000, n_features=10, n_informative=5,
                              coef=True, random_state=0)

# One column per estimator, one row per coefficient. `b` is the ground-truth
# coefficient vector that make_regression returns because coef=True.
pd.DataFrame(
    np.c_[b,
          nml.linear_models.LinearRegression(fit_intercept=False).fit(X, y).beta.flatten(),
          np.linalg.lstsq(X, y, rcond=None)[0],
          nml.linear_models.BayesianLinearRegressionUnknownVariance(fit_intercept=False).fit(X, y).posterior['b | sigma**2'].mean,
          nml.linear_models.RidgeRegression(alpha=1.0, fit_intercept=False).fit(X, y).beta,
    ],
    columns=['True', 'nlm', 'np_lstsq', 'bayes', 'ridge']
)

Unnamed: 0,True,nlm,np_lstsq,bayes,ridge
0,18.72,18.72,18.72,18.71,18.71
1,89.91,89.91,89.91,89.81,89.81
2,3.98,3.98,3.98,3.98,3.98
3,0.0,0.0,-0.0,-0.0,-0.0
4,0.0,-0.0,-0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,-0.0,-0.0
7,17.7,17.7,17.7,17.68,17.68
8,0.0,0.0,0.0,-0.01,-0.01
9,72.07,72.07,72.07,71.99,71.99


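The two OLS solutions (numpy_ml and `np.linalg.lstsq`) are identical and recover the true coefficients. Ridge (`alpha=1.0`) and the Bayesian posterior mean also agree with each other; both are shrunk slightly toward zero relative to OLS.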

### ill posed

In [4]:
# Ill-posed case: 2000 features but only 100 samples, so the linear system is
# underdetermined. random_state pins the draw so the norms below are
# reproducible when this cell is re-run.
X, y, b = skd.make_regression(n_samples=100, n_features=2000, n_informative=100,
                              coef=True, random_state=0)

# Columns: ground-truth coefficients, numpy_ml's least-squares fit, and
# numpy's lstsq solution.
min_norm_sol = pd.DataFrame(
    np.c_[b,
          nml.linear_models.LinearRegression(fit_intercept=False).fit(X, y).beta.T.flatten(),
          np.linalg.lstsq(X, y, rcond=None)[0],
    ],
    columns=['true_coef', 'np1', 'np2']
)

# Euclidean norm of each coefficient vector, in column order.
np.linalg.norm(min_norm_sol.true_coef), np.linalg.norm(min_norm_sol.np1), np.linalg.norm(min_norm_sol.np2)

(616.7496712030379, 139.11304600660412, 139.08121952806906)

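With more features than samples the problem is underdetermined. Both solvers return (approximately) the minimum-norm interpolant, estimated via the pseudoinverse, whose norm is far smaller than that of the true coefficient vector.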

### nonlinear

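The data come from `sklearn.datasets.make_friedman3`, whose target is a nonlinear function of four features:

```
y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1).
```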

In [8]:
# Friedman #3 benchmark: 1000 samples with noise level 0.1. Both the generator
# and the split are seeded so the train/test partition, and therefore every
# MSE computed from it, is the same on each run.
X, y = skd.make_friedman3(n_samples=1000, noise=0.1, random_state=0)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [9]:
def estimate_mse(estimator):
    """Fit `estimator` on the training split and return its test-set MSE.

    The data are not passed in: this reads the notebook-level globals
    X_train, y_train, X_test and y_test, so the cell that creates the split
    must have been run first.

    Parameters
    ----------
    estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
        called on the object that is passed in.

    Returns
    -------
    The value of ``mean_squared_error(y_test, predictions)``, where the
    predictions are made on X_test.
    """
    estimator.fit(X_train, y_train)
    return mean_squared_error(y_test, estimator.predict(X_test))

In [9]:
# Held-out MSE for each regressor, keyed by a display label. The estimators
# are listed in the order they are fitted and reported.
{
    label: estimate_mse(model)
    for label, model in (
        ('LinearRegression',
         nml.linear_models.LinearRegression(fit_intercept=False)),
        ('RidgeRegression',
         nml.linear_models.RidgeRegression(alpha=1.0, fit_intercept=False)),
        ('KernelRegression',
         nml.nonparametric.KernelRegression(kernel='rbf')),
        # NOTE(review): the positional 100, 4, 4 are presumably
        # n_trees, max_depth, n_feats -- confirm against numpy_ml.
        ('RandomForest',
         nml.trees.RandomForest(100, 4, 4, classifier=False, criterion='mse')),
        ('GradientBoosting',
         nml.trees.GradientBoostedDecisionTree(classifier=False, n_iter=100, loss='mse')),
    )
}

{'LinearRegression': 0.12265230895675583,
 'RidgeRegression': 0.12281528055071242,
 'KernelRegression': 0.19501779804529262,
 'RandomForest': 0.020847490024374363,
 'GradientBoosting': 0.0830203419901581}