<a href="https://colab.research.google.com/github/angelaaaateng/ftw_python/blob/main/AdvanceRegressionTechniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Regularization reduces model complexity and prevents overfitting.

# Ridge Regression (L2 Regularization)

Use when the independent variables are highly correlated (multicollinearity).

In [None]:
 import numpy as np
 from sklearn.linear_model import Ridge

 # Small synthetic problem: 10 samples with 5 features each.
 n_samples = 10
 n_features = 5

 # Seeded generator so the example is reproducible. The targets are drawn
 # before the features; swapping the order would change both arrays.
 rng = np.random.RandomState(0)
 y = rng.standard_normal(n_samples)
 X = rng.standard_normal((n_samples, n_features))

 # Fit a ridge (L2-penalised) linear model; the fitted estimator is the
 # value of the last expression, so the notebook displays it.
 clf = Ridge(alpha=1.0)
 clf.fit(X, y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

# Lasso Regression (L1 Regularization)

Works well for feature selection in case we have a huge number of features.

In [None]:
from sklearn.linear_model import Lasso

# Fit an L1-penalised linear model on a tiny dataset whose two features
# are identical copies of each other.
clf = Lasso(alpha=0.1)
clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])

# Inspect the learned weights and the bias term of the fitted model.
print(clf.coef_)
print(clf.intercept_)

[0.85 0.  ]
0.15000000000000002


# ElasticNet Regression (L1+L2 regularization)

A mix of both regularization types (L1 and L2). Robust in production, but tends to have poorer metrics.

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet

# Generate a reproducible synthetic regression problem with two features.
X, y = make_regression(
    n_features=2,
    random_state=0,
)

# Build the estimator with a fixed seed, then fit it. The fitted estimator
# is the value of the last expression, so the notebook displays it.
regr = ElasticNet(random_state=0)
regr.fit(X, y)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=0, selection='cyclic', tol=0.0001, warm_start=False)

# Instance-Based Regression
Builds up a database of example data and compares new data to the database using a similarity measure in order to find the best match and make a prediction.

## k-Nearest Neighbor (kNN)


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Four one-dimensional training points and their target values.
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]

# Regress using the two nearest training points.
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X, y)

# Predict for a query that lies midway between the samples at 1 and 2.
print(
    neigh.predict([[1.5]])
)

[0.5]


## Support Vector Machines

In [None]:
import numpy as np
from sklearn.svm import SVR

# Small synthetic problem: 10 samples with 5 features each.
n_samples = 10
n_features = 5

# Seeded generator for reproducibility. The targets are drawn first and the
# features second; the draw order determines the values in each array.
rng = np.random.RandomState(0)
y = rng.standard_normal(n_samples)
X = rng.standard_normal((n_samples, n_features))

# Support vector regressor with explicit C and epsilon settings. The fitted
# estimator is the value of the last expression, so the notebook displays it.
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X, y)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Decision Tree Regression

Interpretable, robust to multicollinearity and variance.

Often fast and accurate and a big favorite in machine
learning. 

In [None]:
from sklearn.datasets import load_diabetes
from sklearn import tree

# This section is about regression, so use a dataset with a continuous
# target and the regressor variant of the decision tree (the previous
# version fitted a DecisionTreeClassifier on the iris class labels).
X, y = load_diabetes(return_X_y=True)

# Fixed seed so repeated runs build the same tree. The variable keeps its
# original name `clf` so that anything referring to it still works.
clf = tree.DecisionTreeRegressor(random_state=0)
clf = clf.fit(X, y)

# Ensemble Methods Regression
Composed of multiple weaker models that are
independently trained and whose predictions are
combined in some way to make the overall prediction.

### Random Forest

Uses multiple decision trees and obtains a vote from each one (for regression, the trees' predictions are averaged).

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Synthetic regression problem: 4 features, of which 2 are informative.
# shuffle=False keeps the generated feature columns in their original order.
X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)

# Shallow forest with a fixed seed so the results are reproducible.
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X, y)

# Relative importance of each of the four features. The recorded cell
# output lists these values, but the print statement was missing.
print(regr.feature_importances_)

# Prediction for a single all-zeros sample.
print(regr.predict([[0, 0, 0, 0]]))

[0.18146984 0.81473937 0.00145312 0.00233767]
[-8.32987858]


### XGBoost
Unlike Random Forest, which trains each decision tree independently, each new model trained in XGBoost corrects the errors made by the previous ones.

In [None]:
# Libraries used by the XGBoost example.
import xgboost as xgb
from sklearn import datasets

# Load the iris dataset and unpack its feature matrix and target vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
# Split the data into training (80%) and test (20%) subsets.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle deterministic, so the split (and
# everything computed from it below) is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Wrap each subset in a DMatrix, the container type that xgb.train and
# Booster.predict operate on.
D_train = xgb.DMatrix(data=X_train, label=Y_train)
D_test = xgb.DMatrix(data=X_test, label=Y_test)

In [None]:
# Booster configuration for the XGBoost model.
param = dict(
    eta=0.3,                     # step-size shrinkage applied at each round
    max_depth=3,                 # maximum depth of each tree
    objective='multi:softprob',  # multiclass objective
    num_class=3,                 # number of target classes
)

# The number of training iterations
steps = 20

In [None]:
# Train the booster on the training DMatrix for `steps` boosting rounds.
model = xgb.train(
    params=param,
    dtrain=D_train,
    num_boost_round=steps,
)

In [None]:
# Score the held-out test set.
preds = model.predict(D_test)

# Take the index of the largest entry in each row of `preds` as the
# predicted class. This assumes `preds` is 2-D with one row per sample and
# one column per class (as produced by the 'multi:softprob' objective).
# A single vectorised argmax replaces the Python-level loop over rows.
best_preds = np.argmax(preds, axis=1)

In [None]:
# Display the predicted class index for each test sample.
print(best_preds)

[2 2 2 0 1 2 2 2 0 2 1 2 0 2 1 2 0 0 0 1 1 0 1 0 1 0 1 0 0 2]
