<a href="https://colab.research.google.com/github/arpit4101/Machine-Learning-Concepts/blob/main/Regularisation/Lasso/Assignment_Task_55_LASSO_RIDGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### `Task` Train a Lasso And Ridge Regression model on Boston House Price Datset. And also show how it reduces overfitting in compare to normal Linear regression model.

Data set Link - https://www.kaggle.com/datasets/altavish/boston-housing-dataset

DataSet description:
```
ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS: proportion of non-retail business acres per town
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX: nitric oxides concentration (parts per 10 million)
RM: average number of rooms per dwelling
AGE: proportion of owner-occupied units built prior to 1940
DIS: weighted distances to ﬁve Boston employment centers
RAD: index of accessibility to radial highways
TAX: full-value property-tax rate per $10,000
PTRATIO: pupil-teacher ratio by town 12. B: 1000(Bk−0.63)2 where Bk is the proportion of blacks by town 13. LSTAT: % lower status of the population
MEDV: Median value of owner-occupied homes in $1000s (Target)
```

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/HousingData.csv.xls')
df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [None]:
df.isna().sum()

df.interpolate(inplace=True)
df.isna().sum()

Unnamed: 0,0
CRIM,0
ZN,0
INDUS,0
CHAS,0
NOX,0
RM,0
AGE,0
DIS,0
RAD,0
TAX,0


In [None]:
df.shape

(506, 14)

In [None]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

# checking Linear regression model score to determine the overfitting

In [None]:
#Splitting data into training & test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state =13)
X_train.shape

(404, 13)

In [None]:
#Checking the diffrence in MSE to determine overfitting

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

lr = LinearRegression()

lr.fit(X_train, y_train)

print('MSE for train data : ', mean_squared_error(y_train, lr.predict(X_train)))
print('MSE for test data : ', mean_squared_error(y_test, lr.predict(X_test)))
print('R2 score for train data : ', r2_score(y_train, lr.predict(X_train)))
print('R2 score for test data : ', r2_score(y_test, lr.predict(X_test)))

MSE for train data :  21.969590903679748
MSE for test data :  24.07536692195306
R2 score for train data :  0.7336429226009785
R2 score for test data :  0.7344455909125649


In [None]:
# Standardizing Features because Lasso is sensitive to the scale of features
from sklearn.preprocessing import StandardScaler

columns = X_train.columns

sc = StandardScaler()
X_train = sc.fit_transform(X_train,y_train)
X_test = sc.transform(X_test)



In [None]:
X_train = pd.DataFrame(X_train, columns = columns)
X_test = pd.DataFrame(X_test, columns = columns)

# Now, we have to tune the hyperparameter lambda/alpha

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

#define a range of alphas to test
alpha_grid = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30]}

lasso = Lasso()

#setup GridSearchCV
grid_search = GridSearchCV(lasso, alpha_grid, scoring='neg_mean_squared_error', cv=5 )
grid_search.fit(X_train, y_train)

print('best alpha : ', grid_search.best_params_)
best_alpha = grid_search.best_params_['alpha']

best alpha :  {'alpha': 0.01}


# Applying Lasso

In [None]:
lasso = Lasso(alpha = best_alpha)

lasso.fit(X_train, y_train)

y_pred = lasso.predict(X_test)

print("Lasso's MSE : ", mean_squared_error(y_test, y_pred))
print("R2 score : ", r2_score(y_test, y_pred))

Lasso's MSE :  23.93729192228684
R2 score :  0.7359685760062075


In [None]:
print('MSE for train data : ', mean_squared_error(y_train, lasso.predict(X_train)))
print('MSE for test data : ', mean_squared_error(y_test, lasso.predict(X_test)))
print('R2 score for train data : ', r2_score(y_train, lasso.predict(X_train)))
print('R2 score for test data : ', r2_score(y_test, lasso.predict(X_test)))

MSE for train data :  21.97457129198944
MSE for test data :  23.93729192228684
R2 score for train data :  0.7335825408815237
R2 score for test data :  0.7359685760062075


# Applying Ridge

In [None]:
#tuning hyperparameter alpha/lambda
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

param_grid = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30, 100]}

ridge = Ridge()

#setting up GridSearchCV

grid_search1 = GridSearchCV(ridge, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search1.fit(X_train, y_train)

best_alpha1 = grid_search1.best_params_['alpha']
print(best_alpha1)


10


In [None]:
ridge = Ridge(alpha=best_alpha1)

ridge.fit(X_train, y_train)

print('MSE for train data : ', mean_squared_error(y_train, ridge.predict(X_train)))
print('MSE for test data : ', mean_squared_error(y_test, ridge.predict(X_test)))
print('R2 score for train data : ', r2_score(y_train, ridge.predict(X_train)))
print('R2 score for test data : ', r2_score(y_test, ridge.predict(X_test)))

MSE for train data :  22.09442969881625
MSE for test data :  23.49153772897863
R2 score for train data :  0.7321293897926364
R2 score for test data :  0.7408853023757761


### `Task` perform Lasso and Ridge in Wine Quality dataset and show how it's different from Linear Regression.

Data link : https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv



In [None]:
# Loading Wine dataset
import pandas as pd
wine_data_path = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQDVwxneOKOaJL13QMhkAhYrgWlH1tICY7RacUnj_lL8m9uUWaaUf3p7bScNyh_D2Rvt7nc1q11adSy/pub?gid=647503637&single=true&output=csv"
data = pd.read_csv(wine_data_path)

In [None]:
data.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


**Encoding Categorical Vaiables**

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data['type'] = le.fit_transform(data['type'])

In [None]:
data.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,1,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,1,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,1,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


**checking missing values & filling them**

In [None]:
data.isna().sum()

Unnamed: 0,0
type,0
fixed acidity,10
volatile acidity,8
citric acid,3
residual sugar,2
chlorides,2
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,9


In [None]:
data.interpolate(method = 'linear', inplace=True)
data.isna().sum()

Unnamed: 0,0
type,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0


In [None]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
columns = X.columns

In [None]:
X.shape

(6497, 12)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=3)

In [None]:
X.shape

(6497, 12)

***Applying Logistic Regression***

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lgr = LogisticRegression(max_iter = 500)
lgr.fit(X_train,y_train)

print('Accuracy score for train data : ', accuracy_score(y_train,lgr.predict(X_train)))
print('Accuracy score for test data : ', accuracy_score(y_test,lgr.predict(X_test)))


Accuracy score for train data :  0.5170290552241678
Accuracy score for test data :  0.5146153846153846


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**scaling the features**

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train,y_train)
X_test = sc.transform(X_test)

In [None]:
X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)

In [None]:
X_train.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-1.731385,1.932778,-0.272904,0.566323,-0.827035,0.454099,-0.920191,-1.6531,0.287821,-0.48631,-0.147002,0.346929
1,-1.731385,2.47776,0.550552,1.254529,-0.659307,0.317101,-0.976755,-1.688271,0.434745,-0.735332,2.256676,0.766443
2,-1.731385,0.920668,2.28896,-0.810089,-0.638341,0.536297,-1.146448,-1.35415,0.885536,0.572033,0.720993,-0.91161
3,-1.731385,1.621359,-0.425396,0.635144,-0.722205,0.947291,-1.372706,-1.741027,1.102583,0.385266,1.121606,-0.659902
4,0.577572,-0.091442,1.282513,-1.291833,-0.785103,0.454099,-0.52424,0.879182,-0.233093,-1.544653,0.32038,-1.079416


## **Applying Ridge Classifier**

***Tuning Hyperparameter Lambda/alpha***

In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier

ridgeC = RidgeClassifier()

alphas = np.logspace(-2,2,100) # takes 100 values from 0.01 to 100

param_grid = {'alpha' : alphas}

grid_search = GridSearchCV(ridgeC, param_grid, scoring = 'accuracy', cv=kf, n_jobs=-1)

grid_search.fit(X_train, y_train)

grid_search.best_params_

{'alpha': np.float64(24.770763559917114)}

In [None]:
best_alpha = grid_search.best_params_['alpha']

In [None]:
ridge_classifier = RidgeClassifier(alpha= best_alpha, max_iter=500 )

ridge_classifier.fit(X_train, y_train)
print('Accuracy score for train data : ', accuracy_score(y_train,ridge_classifier.predict(X_train)))
print('Accuracy score for test data : ', accuracy_score(y_test,ridge_classifier.predict(X_test)))

Accuracy score for train data :  0.5372330190494516
Accuracy score for test data :  0.5223076923076924


# *Appying Lasso*

tuning hyperparameter alpha/lambda

In [None]:
from sklearn.linear_model import LassoCV


lasso1 = LassoCV(max_iter=1000, cv=5, random_state=42)

lasso1.fit(X_train, y_train)

best_alpha = lasso1.alpha_
print('best alpha : ', best_alpha)

lasso_logistic = LogisticRegression(penalty='l1', solver='liblinear', C=1/best_alpha)
lasso_logistic.fit(X_train, y_train)

print('Accuracy score for train data : ', accuracy_score(y_train,lasso_logistic.predict(X_train)))
print('Accuracy score for test data : ', accuracy_score(y_test,lasso_logistic.predict(X_test)))


best alpha :  0.00039009188237179706
Accuracy score for train data :  0.5487781412353281
Accuracy score for test data :  0.5292307692307693
