# Linear Regression with Normal Equation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing  import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_boston

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

# Hold out a test set; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by ordinary least-squares regression.
reg_norm_pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('norm_reg', LinearRegression()),
])
reg_norm_pipe.fit(X_train, y_train)

# Predicted targets for both splits (despite the X_ prefix in the names).
X_train_preds = reg_norm_pipe.predict(X_train)
X_test_preds = reg_norm_pipe.predict(X_test)

# Report MSE / RMSE / R^2 per split, separated by one blank line.
split_reports = []
for split_name, y_true, y_pred in (
    ('train', y_train, X_train_preds),
    ('test', y_test, X_test_preds),
):
    split_mse = mean_squared_error(y_true, y_pred)
    split_reports.append(
        f'{split_name} mse: {split_mse}\n'
        f'{split_name} rmse: {sqrt(split_mse)}\n'
        f'{split_name} r2: {r2_score(y_true, y_pred)}'
    )
print('\n\n'.join(split_reports))

# Fitted regression coefficients and intercept of the linear step.
linear_step = reg_norm_pipe.named_steps['norm_reg']
print('regression co-efficients :', linear_step.coef_)
print('\n intercept', linear_step.intercept_)

# 5-fold cross-validation of the whole pipeline on the training split;
# no `scoring` is given, so the pipeline's own default score is reported.
scores = cross_val_score(reg_norm_pipe, X_train, y_train, cv=5)
print(scores)
print(scores.mean())

train mse: 19.640519427908043
train rmse: 4.431762564477935
train r2: 0.7697699488741149

test mse: 29.78224509230241
test rmse: 5.457311159564059
test r2: 0.6354638433202123
regression co-efficients : [-10.47489456   4.40174969  -0.15735494   2.39341594  -7.57645867
  19.67024242  -0.68311581 -15.71607313   5.52186497  -5.91977522
  -9.26413928   3.34889385 -17.59386711]

 intercept 27.492727907952286
[0.76260062 0.56956022 0.77935231 0.70774465 0.79031905]
0.721915369370549



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

# Linear Regression with Stochastic Gradient Descent (scikit-learn) and GridSearchCV

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split into training and test sets; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by linear regression trained with SGD.
# random_state is fixed on the regressor as well: without it the SGD fit is
# not repeatable, so the selected eta0 and the reported metrics would drift
# between runs even though the train/test split itself is seeded.
reg_sgd_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('sgd_reg', SGDRegressor(max_iter=10000, tol=1e-6, random_state=0)),
])

# Grid over the initial learning rate only.
param_sgd = {'sgd_reg__eta0': [0.01, 0.05, 0.1, 0.5]}

# Candidates are ranked by negative mean absolute error, so `best_score_`
# printed below is a negated MAE (a negative number), not an R^2 or an MSE.
grid_sgd = GridSearchCV(
    reg_sgd_pipe,
    param_sgd,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    scoring='neg_mean_absolute_error',
)

# Fit the grid search on the training split.
grid_sgd.fit(X_train, y_train)

# Predicted targets for both splits (despite the X_ prefix in the names).
X_train_preds = grid_sgd.predict(X_train)
X_test_preds = grid_sgd.predict(X_test)

# Check model performance on both splits.
print(f'train mse: {mean_squared_error(y_train, X_train_preds)}')
print(f'train rmse: {sqrt(mean_squared_error(y_train, X_train_preds))}')
print(f'train r2: {r2_score(y_train, X_train_preds)}')
print()
print(f'test mse: {mean_squared_error(y_test, X_test_preds)}')
print(f'test rmse: {sqrt(mean_squared_error(y_test, X_test_preds))}')
print(f'test r2: {r2_score(y_test, X_test_preds)}')

# Summary of the search: winning parameters, their CV score, refit pipeline.
print("Best parameters: {}".format(grid_sgd.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_sgd.best_score_))

print("Best estimator:\n{}".format(grid_sgd.best_estimator_))


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

train mse: 21.159373060735554
train rmse: 4.599931853923007
train r2: 0.7519656463544101

test mse: 32.38357507121193
test rmse: 5.6906568224776946
test r2: 0.6036234353916414
Best parameters: {'sgd_reg__eta0': 0.05}
Best cross-validation score: -3.21
Best estimator:
Pipeline(steps=[('scaler', MinMaxScaler()),
                ('sgd_reg',
                 SGDRegressor(eta0=0.05, max_iter=10000, tol=1e-06))])


##  SGD with regularization

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split into training and test sets; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by linear regression trained with SGD.
# random_state is fixed on the regressor as well: without it the SGD fit is
# not repeatable, so the selected hyper-parameters and the reported metrics
# would drift between runs even though the train/test split itself is seeded.
reg_sgd_pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('sgd_reg', SGDRegressor(max_iter=10000, tol=1e-6, random_state=0)),
])

# Grid over initial learning rate, penalty type and regularisation strength.
param_sgd = {
    'sgd_reg__eta0': [0.01, 0.05, 0.1, 0.5],
    'sgd_reg__penalty': ['l1', 'l2'],
    'sgd_reg__alpha': [0.1, 0.01, 0.001],
}

# No `scoring` is passed, so candidates are ranked by the pipeline's default
# score rather than by an explicitly chosen metric.
grid_sgd = GridSearchCV(
    reg_sgd_pipe,
    param_sgd,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# Fit the grid search on the training split.
grid_sgd.fit(X_train, y_train)

# Predicted targets for both splits (despite the X_ prefix in the names).
X_train_preds = grid_sgd.predict(X_train)
X_test_preds = grid_sgd.predict(X_test)

# Check model performance on both splits.
print(f'train mse: {mean_squared_error(y_train, X_train_preds)}')
print(f'train rmse: {sqrt(mean_squared_error(y_train, X_train_preds))}')
print(f'train r2: {r2_score(y_train, X_train_preds)}')
print()
print(f'test mse: {mean_squared_error(y_test, X_test_preds)}')
print(f'test rmse: {sqrt(mean_squared_error(y_test, X_test_preds))}')
print(f'test r2: {r2_score(y_test, X_test_preds)}')

# Summary of the search: winning parameters, their CV score, refit pipeline.
print("Best parameters: {}".format(grid_sgd.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_sgd.best_score_))

print("Best estimator:\n{}".format(grid_sgd.best_estimator_))


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

train mse: 22.621585066839636
train rmse: 4.756215414259497
train r2: 0.7348253081796532

test mse: 35.02931215908023
test rmse: 5.918556594228041
test r2: 0.5712394822474924
Best parameters: {'sgd_reg__alpha': 0.01, 'sgd_reg__eta0': 0.5, 'sgd_reg__penalty': 'l1'}
Best cross-validation score: 0.72
Best estimator:
Pipeline(steps=[('scaler', MinMaxScaler()),
                ('sgd_reg',
                 SGDRegressor(alpha=0.01, eta0=0.5, max_iter=10000,
                              penalty='l1', tol=1e-06))])


# Polynomial Regression

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split into training and test sets; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Polynomial expansion -> min-max scaling -> ordinary least squares.
# include_bias=False: LinearRegression already fits an intercept, and a
# constant bias column is flattened to all zeros by MinMaxScaler, so keeping
# it only adds a dead feature whose coefficient is always 0.
pipe_poly = Pipeline([
    ('polynomialfeatures', PolynomialFeatures(include_bias=False)),
    ('scaler', MinMaxScaler()),
    ('norm_reg', LinearRegression()),
])

# Search over the polynomial degree, 1 through 4.
param_poly = {'polynomialfeatures__degree': range(1, 5)}

# No `scoring` is passed, so candidates are ranked by the pipeline's default score.
grid_poly = GridSearchCV(pipe_poly, param_poly, cv=5, n_jobs=-1, return_train_score=True)

grid_poly.fit(X_train, y_train)

# Predicted targets for both splits (despite the X_ prefix in the names).
X_train_preds = grid_poly.predict(X_train)
X_test_preds = grid_poly.predict(X_test)

# Check model performance on both splits.
print(f'train mse: {mean_squared_error(y_train, X_train_preds)}')
print(f'train rmse: {sqrt(mean_squared_error(y_train, X_train_preds))}')
print(f'train r2: {r2_score(y_train, X_train_preds)}')
print()
print(f'test mse: {mean_squared_error(y_test, X_test_preds)}')
print(f'test rmse: {sqrt(mean_squared_error(y_test, X_test_preds))}')
print(f'test r2: {r2_score(y_test, X_test_preds)}')

# Winning degree and its cross-validation score.
print('Best parameters: ', grid_poly.best_params_)

print("Best cross-validation score: {:.2f}".format(grid_poly.best_score_))

# Number of expanded features and the coefficients of the refit best pipeline.
print('Poly features: ', grid_poly.best_estimator_.named_steps['polynomialfeatures'].n_output_features_)
print('Coefficients: ', grid_poly.best_estimator_.named_steps['norm_reg'].coef_)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

train mse: 19.640519427908043
train rmse: 4.431762564477935
train r2: 0.7697699488741149

test mse: 29.78224509230243
test rmse: 5.45731115956406
test r2: 0.6354638433202121
Best parameters:  {'polynomialfeatures__degree': 1}
Best cross-validation score: 0.72
Poly features:  14
Coefficients:  [  0.         -10.47489456   4.40174969  -0.15735494   2.39341594
  -7.57645867  19.67024242  -0.68311581 -15.71607313   5.52186497
  -5.91977522  -9.26413928   3.34889385 -17.59386711]


# Ridge Regression

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

# Hold out a test set; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by ridge regression.
ridge_pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('ridge', Ridge()),
])

# Candidate regularisation strengths for the ridge step.
param_ridge = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold grid search, fitted on the training split (fit returns the search itself).
grid_ridge = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=param_ridge,
    cv=5,
    return_train_score=True,
).fit(X_train, y_train)

# Score of the refit best pipeline on each split.
grid_ridge_train_score = grid_ridge.score(X_train, y_train)
grid_ridge_test_score = grid_ridge.score(X_test, y_test)

print('Training set score: ', grid_ridge_train_score)
print('Test set score: ', grid_ridge_test_score)

# Winning alpha and its mean cross-validation score.
print('best parameters:', grid_ridge.best_params_)
print('Best cross-validation score:', grid_ridge.best_score_)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Training set score:  0.7656443349364728
Test set score:  0.6214583227921031
best parameters: {'ridge__alpha': 1}
Best cross-validation score: 0.7244032269386581


# Lasso Regression

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

# Hold out a test set; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by lasso regression.
lasso_pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('lasso', Lasso(max_iter=1000, tol=1e-5)),
])

# Candidate regularisation strengths for the lasso step.
param_lasso = {
    'lasso__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10],
}

# 5-fold grid search, fitted on the training split (fit returns the search itself).
grid_lasso = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=param_lasso,
    cv=5,
    return_train_score=True,
).fit(X_train, y_train)

# Score of the refit best pipeline on each split.
grid_lasso_train_score = grid_lasso.score(X_train, y_train)
grid_lasso_test_score = grid_lasso.score(X_test, y_test)

print('Training set score: ', grid_lasso_train_score)
print('Test score: ', grid_lasso_test_score)

# Winning alpha and its mean cross-validation score.
print('Best parameters: ', grid_lasso.best_params_)
print('Best cross-validation score:', grid_lasso.best_score_)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Training set score:  0.7687862301878807
Test score:  0.6310389471799465
Best parameters:  {'lasso__alpha': 0.01}
Best cross-validation score: 0.7230563618131272


# ElasticNet 

In [None]:
# Load the Boston housing data: features as a DataFrame, target as an array.
data = load_boston()
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

# Hold out a test set; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Min-max scaling followed by elastic-net regression.
elasticnet = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('elastic', ElasticNet()),
])

# Grid over regularisation strength and the L1/L2 mixing ratio.
param_elasticnet = {
    'elastic__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10],
    'elastic__l1_ratio': [0.2, 0.4, 0.6, 0.8],
}

# 5-fold grid search, fitted on the training split (fit returns the search itself).
grid_elasticnet = GridSearchCV(
    estimator=elasticnet,
    param_grid=param_elasticnet,
    cv=5,
    return_train_score=True,
).fit(X_train, y_train)

# Score of the refit best pipeline on each split.
grid_elasticnet_train_score = grid_elasticnet.score(X_train, y_train)
grid_elasticnet_test_score = grid_elasticnet.score(X_test, y_test)

print('Training set score: ', grid_elasticnet_train_score)
print('Test score: ', grid_elasticnet_test_score)

# Winning parameter combination and its mean cross-validation score.
print('Best parameters: ', grid_elasticnet.best_params_)
print('Best cross-validation score:', grid_elasticnet.best_score_)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Training set score:  0.7650885772611327
Test score:  0.6204728438000011
Best parameters:  {'elastic__alpha': 0.01, 'elastic__l1_ratio': 0.8}
Best cross-validation score: 0.7239800576089702


# Logistic Regression

In [None]:
from sklearn.datasets import load_breast_cancer

# Breast-cancer dataset: features as a named-column DataFrame, labels as an array.
data = load_breast_cancer()
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

In [None]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# NOTE(review): `scaler` is not referenced again in this cell; the pipeline
# below constructs its own MinMaxScaler instance.
scaler = MinMaxScaler()

# Min-max scaling followed by logistic regression.
pipe_logreg = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('logreg', LogisticRegression(max_iter=1000)),
])

# Candidate values for C, the inverse regularisation strength.
param_logreg = {
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# 5-fold grid search, fitted on the training split (fit returns the search itself).
grid_logreg = GridSearchCV(
    estimator=pipe_logreg,
    param_grid=param_logreg,
    cv=5,
    return_train_score=True,
).fit(X_train, y_train)

# Score of the refit best pipeline on each split.
print('train score: ', grid_logreg.score(X_train, y_train))
print('test score: ', grid_logreg.score(X_test, y_test))

# Winning C and its mean cross-validation score.
print('Best parameters: ', grid_logreg.best_params_)
print('Best cross-validation score:', grid_logreg.best_score_)


train score:  0.9824175824175824
test score:  0.9649122807017544
Best parameters:  {'logreg__C': 10}
Best cross-validation score: 0.9802197802197803


# Softmax Regression

In [None]:
# Load the iris dataset bundled with scikit-learn; `iris` is indexed by key
# ("data", "target") in the cells that follow.
from sklearn.datasets import load_iris

iris = load_iris()

In [None]:
# Keep only columns 2 and 3 of the feature matrix: petal length and petal width.
X = iris.data[:, [2, 3]]
# Class labels for every sample.
y = iris.target

In [None]:
# Hold out a test set; random_state pins the split so reruns match.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Multinomial (softmax) logistic regression fitted with the lbfgs solver.
softmax_reg = LogisticRegression(
    C=10,
    solver="lbfgs",
    multi_class="multinomial",
)
# Last expression of the cell: fit returns the estimator, which is displayed.
softmax_reg.fit(X_train, y_train)

LogisticRegression(C=10, multi_class='multinomial')

In [None]:
# Score of the fitted softmax model on each split, to four decimals.
print(f'Train score: {softmax_reg.score(X_train, y_train):.4f}')
print(f'Test score: {softmax_reg.score(X_test, y_test):.4f}')

Train score: 0.9643
Test score: 0.9737


In [None]:
# One query sample with two feature values, in the same column order as X.
query_point = [[5, 2]]
# Predicted class label for the query sample.
softmax_reg.predict(query_point)

array([2])

In [None]:
# One query sample with two feature values, in the same column order as X.
query_point = [[5, 2]]
# Per-class probabilities for the query sample.
softmax_reg.predict_proba(query_point)

array([[1.33384149e-06, 7.90876321e-02, 9.20911034e-01]])