In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
boston = fetch_openml(name='boston', version=1, parser='pandas')
df = boston.data
df['MEDV'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
X = df.drop(columns=['MEDV']).apply(pd.to_numeric)
y = df['MEDV'].apply(pd.to_numeric)

In [4]:
#univeriate selection = yauta matrai value herxa
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(score_func = f_regression, k=5)
selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

sc = pd.DataFrame(data={'cols':X.columns, 'scores': selector.scores_})
print(sc.sort_values(by = 'scores', ascending=False))
print(selected_features)

       cols      scores
12    LSTAT  601.617871
5        RM  471.846740
10  PTRATIO  175.105543
2     INDUS  153.954883
9       TAX  141.761357
4       NOX  112.591480
0      CRIM   89.486115
8       RAD   85.914278
6       AGE   83.477459
1        ZN   75.257642
11        B   63.054229
7       DIS   33.579570
3      CHAS   15.971512
Index(['INDUS', 'RM', 'TAX', 'PTRATIO', 'LSTAT'], dtype='object')


In [5]:
#Bivariate Statistics Filter Method

cor = df.corr()['MEDV'].abs().sort_values(ascending=False)

threshhold = 0.5
selected_features = cor[cor>threshhold]
print(cor)
print("selected Features: \n", selected_features)

new_df = df[selected_features.index]
new_df.corr()

MEDV       1.000000
LSTAT      0.737663
RM         0.695360
PTRATIO    0.507787
INDUS      0.483725
TAX        0.468536
NOX        0.427321
CRIM       0.388305
RAD        0.381626
AGE        0.376955
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
Name: MEDV, dtype: float64
selected Features: 
 MEDV       1.000000
LSTAT      0.737663
RM         0.695360
PTRATIO    0.507787
Name: MEDV, dtype: float64


Unnamed: 0,MEDV,LSTAT,RM,PTRATIO
MEDV,1.0,-0.737663,0.69536,-0.507787
LSTAT,-0.737663,1.0,-0.613808,0.374044
RM,0.69536,-0.613808,1.0,-0.355501
PTRATIO,-0.507787,0.374044,-0.355501,1.0


In [6]:
#wrapper method
import statsmodels.api as sm

#add a sonstant column to the features
X_with_const = sm.add_constant(X)

#Fit the model with all features
model = sm.OLS(y, X_with_const).fit()
print(model.summary())

#Perform backward elimination
while len(model.pvalues[model.pvalues > 0.05]) > 0:
    feature_to_remove = model.pvalues[model.pvalues > 0.05].idxmax()
    X_with_const = X_with_const.drop(columns=[feature_to_remove])
    model = sm.OLS(y, X_with_const).fit()
    
print("Remaining Features after Backward Elimination: ", X_with_const.columns)

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Fri, 02 Aug 2024   Prob (F-statistic):          6.72e-135
Time:                        19:09:38   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4595      5.103      7.144      0.0

In [7]:
#RFE = recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

#Instantiate a linear regression model
model = LinearRegression()

#Apply RFE
rfe = RFE(model, n_features_to_select = 7)
rfe_model = rfe.fit(X, y)

#Get selected features
selected_features = X.columns[rfe_model.support_]
print("Selected Features using RFEL: ", selected_features)

Selected Features using RFEL:  Index(['CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'PTRATIO', 'LSTAT'], dtype='object')


In [8]:
#Embedded Method
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

#Instatiate a Lasso model
lasso = Lasso(alpha = 0.1)

#Fit the model
lasso.fit(X, y)

#Use SelectFramModel to select features
model = SelectFromModel(lasso, prefit=True)
selected_features = X.columns[(model.get_support())]
print("Selected Features usinf Embedded Method(Lasso): ", selected_features)

Selected Features usinf Embedded Method(Lasso):  Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')
