Feature Selection => Backward Selection
Data Ingestion

In [1]:
from warnings import filterwarnings
# Silence all Python warnings for the rest of the session to keep cell output
# clean. NOTE(review): this also hides deprecation warnings from the libraries
# used below.
filterwarnings('ignore')

In [2]:
import pandas as pd

In [None]:
# Location of the training CSV. NOTE(review): absolute, machine-specific
# Windows path — adjust it when running the notebook on another machine.
path = r"D:\Machine-Learning\repository\Data_Processing\Cars93.csv"

In [None]:
# Load the dataset. Only empty fields and the literal "NA" are treated as
# missing; pandas' built-in NA markers are switched off (presumably so other
# text values are kept as ordinary strings — confirm against the data).
df = pd.read_csv(
    path,
    keep_default_na=False,
    na_values=["", "NA"],
)
df.head()

Perform basic data quality checks

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Count missing values per column, then show only the columns that have any.
m = df.isna().sum()
m.loc[m > 0]

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

Drop insignificant columns

In [None]:
# Remove the "id" column from df in place. NOTE(review): re-running this cell
# raises KeyError because the column is already gone after the first run.
df.drop(columns="id",inplace=True)

Separate X and Y

In [None]:
# "Weight" is the regression target; every other remaining column is a
# candidate feature.
X = df.drop(columns="Weight")
# Double brackets keep Y as a one-column DataFrame rather than a Series.
Y = df[["Weight"]]

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Preview the first rows of the target frame.
Y.head()

Data PreProcessing and Data Cleaning

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [None]:
# Partition the feature names by dtype: object columns are handled as
# categorical, everything else as continuous.
cat = X.columns[X.dtypes == "object"].tolist()
con = X.columns[X.dtypes != "object"].tolist()

In [None]:
# Categorical branch for the selection stage: fill gaps with the most frequent
# value, then ordinal-encode so each feature stays a single numeric column.
cat_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder())

In [None]:
# Continuous branch: fill gaps with the column mean, then standardise.
con_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [None]:
# Route categorical and continuous columns through their own pipelines.
pre = ColumnTransformer(
    [
        ("cat", cat_pipe, cat),
        ("con", con_pipe, con),
    ]
)
# Return DataFrames from transform(); set_output hands back the same
# transformer, so rebinding keeps `pre` pointing at it.
pre = pre.set_output(transform="pandas")

In [None]:
# Display the selection-stage preprocessor.
pre

In [None]:
# Fit the selection-stage preprocessor on every row of X and transform it.
X_pre = pre.fit_transform(X)

In [None]:
# Display the preprocessed feature frame.
X_pre

Feature Selection => Backward Selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Estimator that the selector scores while deciding which features to drop.
base_model = LinearRegression()

# Backward selection: start from all features and remove them step by step.
# The stray "auto" literal that followed the closing parenthesis was a syntax
# error; it is now passed as n_features_to_select, matching the selector
# configured in the following cell.
for_sel = SequentialFeatureSelector(
    base_model, direction="backward", n_features_to_select="auto"
)

for_sel.fit(X_pre, Y)

# Call the method so the cell shows the selected names rather than the bound
# method object.
for_sel.get_feature_names_out()

In [None]:
# Linear regression is the estimator scored during feature selection.
base_model = LinearRegression()
# Backward selection (despite the `for_sel` name). Per the scikit-learn docs,
# "auto" with no `tol` keeps half of the features — verify for the installed
# version.
for_sel= SequentialFeatureSelector(base_model,direction="backward",n_features_to_select="auto")

In [None]:
# Run the backward selection on the preprocessed features against the target.
for_sel.fit(X_pre,Y)

In [None]:
# Names of the features the selector kept. They are columns of X_pre, so they
# still carry the ColumnTransformer prefix that is stripped further below.
imp_cols = for_sel.get_feature_names_out()
imp_cols

In [None]:
# Number of features kept by the selector.
len(imp_cols)

In [None]:
# Show the selected (still prefixed) feature names again.
imp_cols

In [None]:
# First selected name, still carrying its transformer prefix.
imp_cols[0]

In [None]:
# Splitting on "__" separates the transformer prefix from the column name.
imp_cols[0].split("__")

In [None]:
# Element 1 of the split is the part after the prefix.
imp_cols[0].split("__")[1]

In [None]:
# Strip the "<transformer>__" prefix to recover the original column names.
# maxsplit=1 cuts only at the first "__", so a column whose own name contains
# "__" is returned whole instead of being truncated.
sel_cols = [col.split("__", 1)[1] for col in imp_cols]
sel_cols

In [None]:
# Keep only the selected columns of the raw (un-preprocessed) feature frame.
X_sel = X[sel_cols]
X_sel

Data Preprocessing : 2nd step

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Partition the selected columns by dtype: object columns are categorical,
# everything else continuous.
X_sel_cat = X_sel.columns[X_sel.dtypes == "object"].tolist()
X_sel_con = X_sel.columns[X_sel.dtypes != "object"].tolist()

In [None]:
# Categorical branch for the final model: fill gaps with the most frequent
# value, then one-hot encode into a dense array. Categories not seen during
# fit are ignored at transform time instead of raising.
cat_sel_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

In [None]:
# Continuous branch for the final model: mean-impute, then standardise.
num_sel_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [None]:
# Final preprocessor: selected categorical and continuous columns each go
# through their own pipeline.
pre1 = ColumnTransformer(
    [
        ("cat", cat_sel_pipe, X_sel_cat),
        ("con", num_sel_pipe, X_sel_con),
    ]
)
# Return DataFrames from transform(); set_output hands back the same
# transformer, so rebinding keeps `pre1` pointing at it.
pre1 = pre1.set_output(transform='pandas')

In [None]:
# Display the final preprocessor.
pre1

In [None]:
# Fit the final preprocessor on the selected columns and transform them.
# NOTE(review): this is fit on all rows before the train/test split below, so
# the imputation and scaling statistics also include the rows that end up in
# the test set.
X_sel_pre = pre1.fit_transform(X_sel)
X_sel_pre

Train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
xtrain,xtest,ytrain,ytest = train_test_split(X_sel_pre,Y,train_size=0.8,random_state=21)

In [None]:
# Preview the training features.
xtrain.head()

In [None]:
# Preview the test features.
xtest.head()

In [None]:
# Preview the training target.
ytrain.head()

In [None]:
# Preview the test target.
ytest.head()

Build a model

In [None]:
# Fit a plain linear regression on the training split.
model = LinearRegression()
model.fit(xtrain,ytrain)

In [None]:
# R2 score on the training split.
model.score(xtrain,ytrain)

In [None]:
# R2 score on the held-out test split.
model.score(xtest,ytest)

Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [None]:
# Predictions on the training split (not the test split).
ypred = model.predict(xtrain)

In [None]:
# Mean squared error on the training split. NOTE(review): no error metric is
# computed for the test split, and the imported mean_absolute_error and
# r2_score are not used in the cells shown here.
mse = mean_squared_error(ytrain,ypred)
mse

In [None]:
# Root mean squared error: square root of the training MSE, in the same units
# as the target.
rmse = mse ** 0.5
rmse

In [None]:
# Actual training targets, for comparison with the predictions shown next.
ytrain.head()

In [None]:
# First five training predictions.
ypred[:5]

In [None]:
# Predictions on the held-out test split; show the first five.
ypred_test = model.predict(xtest)
ypred_test[:5]

In [None]:
# Actual test targets, for comparison with the test predictions above.
ytest.head()

The model is suitable for out-of-sample predictions: the training R2 score is around 98% and the testing R2 score is around 92%.
Out-of-sample predictions

In [None]:
# Load the unseen sample with the same missing-value rules as the training
# data. NOTE(review): absolute, machine-specific Windows path.
xnew = pd.read_csv(r"D:\Machine-Learning\repository\Data_Processing\sample_cars93.csv",
                   na_values=["","NA"],keep_default_na=False)

In [None]:
# Preview the unseen sample.
xnew.head()

In [None]:
# Display the already-fitted final preprocessor that is reused below.
pre1

In [None]:
# Apply the fitted preprocessor with transform only (no re-fitting), so the
# new rows use the imputation and scaling learned earlier.
# Assumes xnew contains every column listed in sel_cols — TODO confirm.
xnew_pre = pre1.transform(xnew)
xnew_pre

In [None]:
# Predicted weights for the unseen sample.
preds = model.predict(xnew_pre)
preds

Save the predicted results to dataframe and then to csv file

In [None]:
# Attach the predictions to the sample as a new column.
# NOTE(review): preds is presumably shaped (n, 1) because the model was fit on
# a one-column DataFrame target — confirm the assignment yields a flat column.
xnew["WeightPredicted"] = preds
xnew

In [None]:
# Write the sample with its predictions to the current working directory,
# without the index column.
xnew.to_csv("BackwardSelectionResults.csv",index=False)