# **Voting Regressor**

## **Importing Libararies**

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

## **Data Overview**

In [24]:
df = pd.read_csv('Car_Data.csv')
df.head()

Unnamed: 0,ID,Brand,Model,Year,Color,Mileage,Price,Condition
0,1,Mazda,Generic Model 3,2003,Blue,22932,65732,New
1,2,Jaguar,Generic Model 2,2003,Silver,89937,58620,New
2,3,Land Rover,Generic Model 2,2022,Green,36616,50574,New
3,4,Porsche,Generic Model 2,1997,Black,82812,35436,Used
4,5,Land Rover,Generic Model 3,2000,Black,184616,63880,Used


In [25]:
df.columns

Index(['ID', 'Brand', 'Model', 'Year', 'Color', 'Mileage', 'Price',
       'Condition'],
      dtype='object')

In [26]:
df.isnull().sum()

ID           0
Brand        0
Model        0
Year         0
Color        0
Mileage      0
Price        0
Condition    0
dtype: int64

In [27]:
df.describe

<bound method NDFrame.describe of            ID       Brand            Model  Year   Color  Mileage  Price  \
0           1       Mazda  Generic Model 3  2003    Blue    22932  65732   
1           2      Jaguar  Generic Model 2  2003  Silver    89937  58620   
2           3  Land Rover  Generic Model 2  2022   Green    36616  50574   
3           4     Porsche  Generic Model 2  1997   Black    82812  35436   
4           5  Land Rover  Generic Model 3  2000   Black   184616  63880   
...       ...         ...              ...   ...     ...      ...    ...   
99995   99996       Lexus  Generic Model 1  2018     Red    24034  31762   
99996   99997      Nissan  Generic Model 1  2015   Green    30029  78376   
99997   99998  Land Rover  Generic Model 1  2009   White    42313  45681   
99998   99999      Toyota           Tacoma  2010   White   120989  15085   
99999  100000  Volkswagen  Generic Model 2  2017    Blue   138318  22866   

      Condition  
0           New  
1           New  

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ID         100000 non-null  int64 
 1   Brand      100000 non-null  object
 2   Model      100000 non-null  object
 3   Year       100000 non-null  int64 
 4   Color      100000 non-null  object
 5   Mileage    100000 non-null  int64 
 6   Price      100000 non-null  int64 
 7   Condition  100000 non-null  object
dtypes: int64(4), object(4)
memory usage: 6.1+ MB


In [29]:
df['Price'].mean()

42534.51937

In [30]:
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
categorical_columns = df.select_dtypes(exclude=['number']).columns.tolist()

In [31]:
numerical_columns

['ID', 'Year', 'Mileage', 'Price']

In [32]:
categorical_columns

['Brand', 'Model', 'Color', 'Condition']

In [33]:
numerical_columns = np.delete(numerical_columns, 0)
numerical_columns = np.delete(numerical_columns, len(numerical_columns)-1)

In [34]:
numerical_columns

array(['Year', 'Mileage'], dtype='<U7')

## **Train Test Split**

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Price']), df['Price'],
                                                    test_size=0.2,
                                                    random_state=42)

## **Data Preprocessing**

In [36]:
handle_numerical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

In [37]:
handle_categorical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='most_frequent')),
    ('encode_categorical', OrdinalEncoder())
])

In [38]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical', handle_numerical, numerical_columns),
    ('categorical', handle_categorical, categorical_columns)
], remainder='passthrough')

## **Separate Models**

In [39]:
model_lr = LinearRegression()
model_gb = GradientBoostingRegressor()

In [40]:
pipe_lr = make_pipeline(preprocessing, model_lr)
pipe_gb = make_pipeline(preprocessing, model_gb)

In [41]:
pipe_lr.fit(X_train, y_train)

In [42]:
pipe_gb.fit(X_train, y_train)

In [43]:
y_pred_lr = pipe_lr.predict(X_test)
y_pred_gb = pipe_gb.predict(X_test)

In [47]:
print(r2_score(y_pred_lr, y_test))

-8036.6883918305075


In [48]:
print(r2_score(y_pred_gb, y_test))

-944.0084200355121


## **Voting Regressor**

In [52]:
voting = VotingRegressor(estimators=[
    ('lr', pipe_lr),
    ('gb', pipe_gb)
])

In [53]:
voting.fit(X_train, y_train)

In [54]:
y_pred_voting = voting.predict(X_test)

In [55]:
print(r2_score(y_pred_voting, y_test))

-2890.553800288122
