In [1]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r
from sklearn.metrics import mean_absolute_error as mae
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
import numpy as np
import pickle
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
path="selected_features.csv"

In [3]:
df=pd.read_csv(path,index_col=0)

In [4]:
k_folds=10

# creating train test split

In [5]:
x=df.drop(columns=["selling_price"])
y=df["selling_price"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [6]:
for index,col in enumerate(df.columns):
    
    print(index,col)

0 company
1 year_of_purchase
2 km_driven
3 fuel_type
4 transmission_type
5 no_of_seats
6 selling_price


# feature engineering

In [7]:
#filling all categories with ordinal encoding

In [8]:
feature_engineering=ColumnTransformer([
    ("order2",OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=-1),["company","fuel_type","transmission_type"]),
    ("scale",StandardScaler(),[1,2])],remainder="passthrough",verbose_feature_names_out=False)

In [9]:
pipe=Pipeline([
    ("feature_engineering",feature_engineering),
    ("reg", LinearRegression())
])

In [10]:
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
scores=cross_val_score(pipe,x,y,cv=kf,scoring="r2")

In [11]:
scores.mean()

0.4711179583596504

In [12]:
scores.std()

0.040394428841327754

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [14]:
pipe.fit(x_train,y_train)

y_pred=pipe.predict(x_test)

r(y_test,y_pred)

mae(y_test,y_pred)

336522.04766174516

In [15]:
model_dict = {
    "linear_reg": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "decision_tree": DecisionTreeRegressor(),
    "random_forest": RandomForestRegressor(),
    "extra_trees": ExtraTreesRegressor(),
    "gradient_boosting": GradientBoostingRegressor(),
    "adaboost": AdaBoostRegressor(),
    "xgboost": XGBRegressor()
}

In [16]:
def scorer(model_name,model):
    
    output=[]
    
    output.append(model_name)
    
    pipe=Pipeline([
    ("feature_engineering",feature_engineering),
    ("reg", model)
    ])
    
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    scores=cross_val_score(pipe,x,y,cv=kf,scoring="r2")
    
    output.append(scores.mean())
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

    pipe.fit(x_train,y_train)

    y_pred=pipe.predict(x_test)

    r(y_test,y_pred)

    output.append(mae(y_test,y_pred))
    
    return output


In [17]:
model_output=[]

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [18]:
pd.DataFrame(model_output,columns=["model","rscore","mae"]).sort_values(by="rscore",ascending=False)

Unnamed: 0,model,rscore,mae
8,xgboost,0.940301,108159.132548
4,random_forest,0.932456,110012.965534
5,extra_trees,0.921562,106251.81367
3,decision_tree,0.917007,118882.269464
6,gradient_boosting,0.889802,147576.52185
7,adaboost,0.578242,319351.108869
2,ridge,0.471124,314979.79717
0,linear_reg,0.471118,310281.852255
1,svr,-0.060525,375485.534345


# filling nomial values with onehot and ordinal with ordinalencoder

In [19]:
feature_engineering=ColumnTransformer([
    ("onehot",OneHotEncoder(sparse=False,handle_unknown='ignore'),["company","fuel_type","transmission_type"]) 
],remainder="passthrough",verbose_feature_names_out=False)

In [20]:
pipe=Pipeline([
    ("feature_engineering",feature_engineering),
    ("reg",RandomForestRegressor(max_depth=15,min_samples_split=2,n_estimators=80))
])

In [21]:
kf = KFold(n_splits=k_folds, shuffle=True)
scores=cross_val_score(pipe,x,y,cv=kf,scoring="r2")

In [22]:
scores.mean()

0.9216223772757349

In [23]:
scores.std()

0.048741130566138186

In [None]:
model_output=[]

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [None]:
pd.DataFrame(model_output,columns=["model","rscore","mae"]).sort_values(by="rscore",ascending=False)

# target encoding column having very high no of categories

In [None]:
#pip install category_encoders

In [None]:
import category_encoders as ce

In [None]:
feature_engineering=ColumnTransformer([
    ("onehot",OneHotEncoder(sparse=False,handle_unknown='ignore'),["fuel_type","transmission_type"]),
    ("target_enc",ce.TargetEncoder(),["company"])
],remainder="passthrough",verbose_feature_names_out=False)

In [None]:
pipe=Pipeline([
    ("feature_engineering",feature_engineering),
    ("reg",RandomForestRegressor(max_depth=15,min_samples_split=2,n_estimators=80))
])

In [None]:
k_folds=10

In [None]:
kf = KFold(n_splits=k_folds, shuffle=True)
scores=cross_val_score(pipe,x,y,cv=kf,scoring="r2")

In [None]:
scores.mean()

In [None]:
scores.std()

In [None]:
scores

In [None]:
model_output=[]

for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))

In [None]:
pd.DataFrame(model_output,columns=["model","rscore","mae"]).sort_values(by="rscore",ascending=False)

# chosing model,category handling method which giving more r2score and less mae

In [None]:
feature_engineering=ColumnTransformer([
    ("onehot",OneHotEncoder(sparse=False,handle_unknown='ignore'),["fuel_type","transmission_type"]),
    ("target_enc",ce.TargetEncoder(),["company"])
],remainder="passthrough",verbose_feature_names_out=False)

In [None]:
reg=XGBRegressor()

In [None]:
pipe=Pipeline([
    ("feature_engineering",feature_engineering),
    ("reg",reg)
])

In [None]:
pipe.fit(x_train,y_train)

In [None]:
pickle.dump(pipe,open("pipe.pkl","wb"))

In [None]:
pickle.dump(df,open("dataframe.pkl","wb"))