In [35]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

In [36]:
concrete_compressive_strength = fetch_ucirepo(id=165) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets
lr = LinearRegression()
lr.fit(X, y)
lr.intercept_, lr.coef_

(array([-23.33121358]),
 array([[ 0.11980433,  0.10386581,  0.08793432, -0.14991842,  0.2922246 ,
          0.01808621,  0.02019035,  0.11422207]]))

In [37]:
ridge = Ridge()
ridge.fit(X, y)
ridge.intercept_, ridge.coef_

(array([-23.32957301]),
 array([[ 0.11980439,  0.10386586,  0.08793476, -0.14992243,  0.2922038 ,
          0.01808557,  0.0201901 ,  0.11422205]]))

Linear Regression

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [39]:
r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.5771752777048793, np.float64(109.05454495326019))

Ridge

In [40]:
ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)


In [41]:
r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.5771749099675625, np.float64(109.05463979971461))

Ridge + Polynomial Features

In [42]:
poly = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)
ridge.fit(X_poly_trn, y_train)
y_pred = ridge.predict(X_poly_tst)

In [43]:
r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.7787728514690129, np.float64(57.05869298133164))

In [44]:
poly = PolynomialFeatures(degree=3,include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)
ridge.fit(X_poly_trn, y_train)
y_pred = ridge.predict(X_poly_tst)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [45]:
r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.86950650692776, np.float64(33.65675599361417))

Filtered the extracted features based on coefficient values

In [46]:
df_coef = pd.DataFrame({'col_names': list(X_poly_trn.columns),
                        'coef':list(ridge.coef_[0])})

In [47]:
print(df_coef.shape)
df_coef[df_coef['coef'] > 0.0001]

(164, 2)


Unnamed: 0,col_names,coef
1,Blast Furnace Slag,0.636145
5,Coarse Aggregate,2.612272
6,Fine Aggregate,0.909095
7,Age,0.123619
8,Cement^2,0.010445
9,Cement Blast Furnace Slag,0.031224
10,Cement Fly Ash,0.008666
14,Cement Fine Aggregate,0.023399
16,Blast Furnace Slag^2,0.004927
17,Blast Furnace Slag Fly Ash,0.015132


Considering different values of *alpha*

In [14]:
ridge = Ridge(alpha=0.22)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test, y_pred)

0.5771751967997469

Tuning for alpha = [0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 4, 10]

In [48]:
alphas = [0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 4, 10]
scores = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append(r2_score(y_test, y_pred))
print(scores)

[0.5771752740273375, 0.5771752409296147, 0.5771751673801078, 0.577175057058402, 0.5771749099675625, 0.5771747261116775, 0.5771745422643093, 0.5771738069600132, 0.5771716018651836]


In [49]:
i_max = np.argmax(scores)
print("Best alpha:", alphas[i_max])
print("Best Score:", scores[i_max])

Best alpha: 0.01
Best Score: 0.5771752740273375


Tuning for alpha

In [19]:
np.linspace(0.0001, 10, 20)

array([1.00000000e-04, 5.26410526e-01, 1.05272105e+00, 1.57903158e+00,
       2.10534211e+00, 2.63165263e+00, 3.15796316e+00, 3.68427368e+00,
       4.21058421e+00, 4.73689474e+00, 5.26320526e+00, 5.78951579e+00,
       6.31582632e+00, 6.84213684e+00, 7.36844737e+00, 7.89475789e+00,
       8.42106842e+00, 8.94737895e+00, 9.47368947e+00, 1.00000000e+01])

In [20]:
alphas = np.linspace(0.0001, 10, 20)
scores = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

In [21]:
i_max = np.argmax(scores)
print("Best alpha:", alphas[i_max])
print("Best Score:", scores[i_max])

Best alpha: 0.0001
Best Score: 0.5771752776681036
