In [39]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r
from sklearn.metrics import mean_absolute_error as mae
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
path="missing_value_handled_data.csv"

df=pd.read_csv(path)

In [3]:
df["no_of_seats"]=df["no_of_seats"].fillna(df["no_of_seats"].mode()[0])

In [4]:
df=df.drop(columns=["Unnamed: 0"])

In [5]:
# while deploying model feature name should be logical
# (means user can give input bythemsself esaily or we have to make feature in such way that user can select)


In [6]:
df

Unnamed: 0,company,year_of_purchase,km_driven,fuel_type,transmission_type,owner,no_of_seats,selling_price
0,Maruti,2014,145500,Diesel,Manual,First Owner,5.0,450000
1,Skoda,2014,120000,Diesel,Manual,Second Owner,5.0,370000
2,Honda,2006,140000,Petrol,Manual,Third Owner,5.0,158000
3,Hyundai,2010,127000,Diesel,Manual,First Owner,5.0,225000
4,Maruti,2007,120000,Petrol,Manual,First Owner,5.0,130000
...,...,...,...,...,...,...,...,...
7901,Hyundai,2013,110000,Petrol,Manual,First Owner,5.0,320000
7902,Hyundai,2007,119000,Diesel,Manual,Fourth & Above Owner,5.0,135000
7903,Maruti,2009,120000,Diesel,Manual,First Owner,5.0,382000
7904,Tata,2013,25000,Diesel,Manual,First Owner,5.0,290000


In [7]:
label_dataframe=df.copy()

In [8]:
# but this technique only useful for non linear algorithm feature selection

In [9]:
columns=label_dataframe.select_dtypes(include="object").columns

In [10]:
for col in columns:
    
    oe=OrdinalEncoder()
    
    label_dataframe[col]=oe.fit_transform(label_dataframe[[col]])
    
    print(oe.categories)
    

auto
auto
auto
auto


In [11]:
label_dataframe

Unnamed: 0,company,year_of_purchase,km_driven,fuel_type,transmission_type,owner,no_of_seats,selling_price
0,20.0,2014,145500,1.0,1.0,0.0,5.0,450000
1,26.0,2014,120000,1.0,1.0,2.0,5.0,370000
2,10.0,2006,140000,3.0,1.0,4.0,5.0,158000
3,11.0,2010,127000,1.0,1.0,0.0,5.0,225000
4,20.0,2007,120000,3.0,1.0,0.0,5.0,130000
...,...,...,...,...,...,...,...,...
7901,11.0,2013,110000,3.0,1.0,0.0,5.0,320000
7902,11.0,2007,119000,1.0,1.0,1.0,5.0,135000
7903,20.0,2009,120000,1.0,1.0,0.0,5.0,382000
7904,27.0,2013,25000,1.0,1.0,0.0,5.0,290000


In [12]:
x_label=label_dataframe.drop(columns=["selling_price"])
y_label=label_dataframe["selling_price"]

# random forest regrresor

In [13]:
rf_label=RandomForestRegressor(n_estimators=100)
rf_label.fit(x_label,y_label)

In [14]:
df2=pd.DataFrame({
"feature":x_label.columns,
"importance":rf_label.feature_importances_
}).sort_values(by="importance",ascending=False)

In [15]:
df2

Unnamed: 0,feature,importance
4,transmission_type,0.346303
0,company,0.341224
1,year_of_purchase,0.173179
2,km_driven,0.072962
3,fuel_type,0.032655
6,no_of_seats,0.028796
5,owner,0.004881


# gradient boosting feature importance

In [16]:
gb_label=GradientBoostingRegressor(n_estimators=100)
gb_label.fit(x_label,y_label)

In [17]:
df3=pd.DataFrame({
"feature":x_label.columns,
"importance":gb_label.feature_importances_
}).sort_values(by="importance",ascending=False)

In [18]:
df3

Unnamed: 0,feature,importance
4,transmission_type,0.374627
0,company,0.318733
1,year_of_purchase,0.179651
2,km_driven,0.051178
3,fuel_type,0.050949
6,no_of_seats,0.02311
5,owner,0.001752


# permutation importance

In [19]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

In [20]:
x_train_label,x_test_label,y_train_label,y_test_label=train_test_split(x_label,y_label,test_size=0.2)

In [21]:
rf_label=RandomForestRegressor(n_estimators=100)
rf_label.fit(x_train_label,y_train_label)

importance=permutation_importance(rf_label,x_test_label,y_test_label,n_repeats=30)

In [22]:
df4=pd.DataFrame({
"feature":x_label.columns,
"importance":importance.importances_mean
}).sort_values(by="importance",ascending=False)

In [23]:
df4

Unnamed: 0,feature,importance
0,company,0.707139
4,transmission_type,0.61386
1,year_of_purchase,0.344783
6,no_of_seats,0.060245
3,fuel_type,0.059005
2,km_driven,0.030449
5,owner,0.008466


In [24]:
final=df2.merge(df3,on="feature").merge(df4,on="feature").set_index("feature")

In [25]:
final

Unnamed: 0_level_0,importance_x,importance_y,importance
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
transmission_type,0.346303,0.374627,0.61386
company,0.341224,0.318733,0.707139
year_of_purchase,0.173179,0.179651,0.344783
km_driven,0.072962,0.051178,0.030449
fuel_type,0.032655,0.050949,0.059005
no_of_seats,0.028796,0.02311,0.060245
owner,0.004881,0.001752,0.008466


In [26]:
normalize=final.divide(final.sum(axis=0),axis=1)

In [27]:
normalize.columns

Index(['importance_x', 'importance_y', 'importance'], dtype='object')

In [28]:
normalize[normalize.columns].mean(axis=1).sort_values(ascending=False)

feature
transmission_type    0.352495
company              0.349218
year_of_purchase     0.180620
km_driven            0.046945
fuel_type            0.038651
no_of_seats          0.028312
owner                0.003758
dtype: float64

# but we cannot randomly delete column without proving it is not useful

In [29]:
# proving by training and testing model by keeping and removing column

In [30]:
from sklearn.model_selection import cross_val_score

In [31]:
rf=RandomForestRegressor(n_estimators=100)
scores=cross_val_score(rf,x_label,y_label,cv=5,scoring="r2")

In [32]:
before=scores.mean()

In [33]:
rf=RandomForestRegressor(n_estimators=100)
scores=cross_val_score(rf,x_label.drop(columns=["owner"]),y_label,cv=5,scoring="r2")

In [34]:
after=scores.mean()

In [35]:
diff=before-after

In [36]:
print(f'difference between before and after removing owner r2 score is {diff}')

difference between before and after removing owner r2 score is 0.0002144963315857673


dropping owner column

In [37]:
df=df.drop(columns=["owner"])

In [38]:
df.to_csv("selected_features.csv")