**Load Packages and Import Data**

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


df = pd.read_csv('carData.csv')
df.loc[df['fuel_type'].isnull() & df['engine'].str.contains('Electric Motor', case=False, na=False), 'fuel_type'] = 'Electric'

df.loc[df['fuel_type'].isnull()].head()
df.loc[df['fuel_type'].isnull() & (df['engine'].str.lower() == 'electric'), 'fuel_type'] = 'Electric'
df.loc[df['fuel_type'].isnull() & (df['brand'].str.lower() == 'tesla'), 'fuel_type'] = 'Electric'
df.loc[df['fuel_type'].isnull() & df['engine'].str.contains('battery', case=False, na=False), 'fuel_type'] = 'Electric'

df['milage'] = df['milage'].str.replace(r'[^\d]', '', regex=True).astype(int)
df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(int)

df['fuel_type'] = df['fuel_type'].replace('Plug-In Hybrid', 'Hybrid')
df['clean_title'] = df['clean_title'].fillna('No')
df['accident'] = df['accident'].fillna('Unknown')
df.to_csv('cleaned_car_data.csv',index=False)

df = pd.read_csv('cleaned_car_data.csv')



# df.head()

**Split Data**

In [56]:
target = 'price'
categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col','accident']
binary_features = ['clean_title']
numerical_features = ['model_year', 'milage', 'engine']


X = df[categorical_features + binary_features + numerical_features]

y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [57]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', 'passthrough', numerical_features)
])

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('rf', RandomForestRegressor())
])

pipeline.fit(X_train, y_train)


ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [None]:
y_pred = pipeline.predict(X_test)
print(mean_squared_error(y_test, y_pred, squared=False))  # RMSE
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
