<a href="https://colab.research.google.com/github/thatvernon-yes/CCMACLRL_EXERCISES_COM222_T2/blob/main/Exercise1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 1

Use all feature selection methods to find the best features

## Dataset Information

## Features

Number of Instances: 20640

Number of Attributes: 8 numeric, predictive attributes and the target

Attribute Information:

MedInc - median income in block group

HouseAge - median house age in block group

AveRooms - average number of rooms per household

AveBedrms - average number of bedrooms per household

Population - block group population

AveOccup - average number of household members

Latitude - block group latitude

Longitude - block group longitude

## Target
The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000).

##DATASET

In [247]:
from sklearn.datasets import fetch_california_housing
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

In [248]:
housing = fetch_california_housing(as_frame=True)
housing_features = pd.DataFrame(housing.data, columns=housing.feature_names)
housing_target = pd.DataFrame(housing, columns=["target"])
df = pd.concat([housing_features, housing_target], axis=1)


###Base Model

In [249]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


X = df.drop(["target"],axis=1)
y = df['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

mlr_model = LinearRegression()

mlr_model.fit(X_train, y_train)
y_pred = mlr_model.predict(X_train)

base = root_mean_squared_error(y_test,y_pred)

score_list = {} #for saving score variables into it

score_list["Base Model"] = base
print(f"Score is {base,b2}")

Score is (1.4667070548344803, 2.1512295847012353)


##1. Use any filter method to select the best features

###Mutual Info Reg

In [250]:
from sklearn.feature_selection import mutual_info_regression

threshold = 5  # the number of most relevant features
high_score_features = []
feature_scores = mutual_info_regression(housing.data, housing.target, random_state=0)

In [251]:
for score, f_name in sorted(zip(feature_scores, housing_features.columns), reverse=True)[:threshold]:
        print(f_name, score)
        high_score_features.append(f_name)
housing_mir = housing_features[high_score_features]

Longitude 0.4018195828289901
MedInc 0.38764599736258987
Latitude 0.3706566756286751
AveRooms 0.10362300978707584
AveOccup 0.07254907984228254


###Model using MIR

In [252]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = df.drop(["HouseAge","AveBedrms","Population","target"],axis=1)
y = df['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

mlr_model = LinearRegression()

mlr_model.fit(X_train, y_train)
y_pred = mlr_model.predict(X_train)

filter_mse = root_mean_squared_error(y_test,y_pred)

score_list["MIR Model"] = filter_mse
print(f"Score is {filter_mse}")

Score is 1.4538265095530913


##2. Use any wrapper method to select the best features

In [253]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

threshold = 5 # the number of most relevant features
model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
selector = RFE(model_rf, n_features_to_select=5, step=1)

selector = selector.fit(housing.data, housing.target)
selector_ind = selector.get_support()
housing_wrapper_rfr = housing_features.iloc[:, selector_ind]
housing_wrapper_rfr.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveOccup', 'Latitude'], dtype='object')

###Wrapper Model

In [254]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = df.drop(["AveBedrms","Population","Longitude","target"],axis=1)
y = df['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

mlr_model = LinearRegression()

mlr_model.fit(X_train, y_train)
y_pred = mlr_model.predict(X_train)

wrap_mse = root_mean_squared_error(y_test,y_pred)


score_list["WRP Model"] = wrap_mse
print(f"Score is {wrap_mse}")

Score is 1.4310636457939558


##3. Use any embedded methood to select the best features

In [255]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor


model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_rf.fit(housing.data,housing.target)

sel_sfm = SelectFromModel(model_rf, prefit=True)
sel_sfm_index = sel_sfm.get_support()
housing_sfm = housing_features.iloc[:, sel_sfm_index]
housing_sfm.columns


Index(['MedInc', 'AveOccup'], dtype='object')

In [256]:
from sklearn.model_selection import train_test_split

X = df.drop(["target"],axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

model_rf = RandomForestRegressor(n_estimators=500, random_state=0, max_depth = 3)
model_rf.fit(X_train,y_train)

embeded_rmse = root_mean_squared_error(y_test,model_rf.predict(X_test))


score_list["RFR Model"] = embeded_rmse
print(f"Score is {embeded_rmse}")


Score is 0.7689261239415338


##Scores

In [257]:
score_list = list(score_list.items())

for alg,score in score_list:
    print(f"{alg} Score is {str(score)[:6]} ")

Base Model Score is 1.4667 
MIR Model Score is 1.4538 
WRP Model Score is 1.4310 
RFR Model Score is 0.7689 
