**Load Packages and Import Data**

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


# Load the raw listings. Every run starts again from the untouched source
# file, so this cell can be re-executed safely.
df = pd.read_csv('carData.csv')

# Back-fill a missing fuel_type for vehicles that are evidently electric.
# A row qualifies when its engine text mentions an electric motor or a
# battery, is exactly "electric", or the brand is Tesla (all checks are
# case-insensitive). Only rows whose fuel_type is null are touched.
engine_text = df['engine']
looks_electric = (
    engine_text.str.contains('Electric Motor', case=False, na=False)
    | (engine_text.str.lower() == 'electric')
    | (df['brand'].str.lower() == 'tesla')
    | engine_text.str.contains('battery', case=False, na=False)
)
df.loc[df['fuel_type'].isnull() & looks_electric, 'fuel_type'] = 'Electric'

# Drop every non-digit from milage and the "$" / "," characters from price
# so both columns can be stored as integers.
df['milage'] = df['milage'].str.replace(r'[^\d]', '', regex=True).astype(int)
df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(int)

# Fold plug-in hybrids into the general hybrid category and give the two
# flag columns an explicit value wherever they are missing.
df['fuel_type'] = df['fuel_type'].replace('Plug-In Hybrid', 'Hybrid')
df['clean_title'] = df['clean_title'].fillna('No')
df['accident'] = df['accident'].fillna('Unknown')

# Persist the cleaned table, then reload it so the rest of the notebook
# works from exactly what was written to disk.
df.to_csv('cleaned_car_data.csv', index=False)
df = pd.read_csv('cleaned_car_data.csv')

# Summary statistics of the numeric columns (displayed as the cell output).
df.describe()

Unnamed: 0,model_year,milage,price
count,4009.0,4009.0,4009.0
mean,2015.51559,64717.55101,44553.19
std,6.104816,52296.599459,78710.64
min,1974.0,100.0,2000.0
25%,2012.0,23044.0,17200.0
50%,2017.0,52775.0,31000.0
75%,2020.0,94100.0,49990.0
max,2024.0,405000.0,2954083.0


**Split Data**

In [11]:
# Column to predict.
target = 'price'

# Feature groups. 'engine' holds a free-text description (for example
# "378.0HP 4.6L 8 Cylinder Engine Gasoline Fuel"), so it is listed with the
# categorical features rather than the numerical ones.
categorical_cols = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col']
binary_cols = ['accident', 'clean_title']
numerical_cols = ['model_year', 'milage']

# .copy() gives X its own data, so the assignments below neither write into
# df nor raise SettingWithCopyWarning.
X = df[categorical_cols + binary_cols + numerical_cols].copy()

# Defensive fills, using the same placeholder values as the cleaning step
# ('Unknown' for accident, 'No' for clean_title).
X['accident'] = X['accident'].fillna('Unknown')
X['clean_title'] = X['clean_title'].fillna('No')

y = df[target]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
2537,Hyundai,Equus Signature,2011,"143,898 mi.",Gasoline,378.0HP 4.6L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Gray,Brown,None reported,Yes
788,Mercedes-Benz,AMG C 43 Base 4MATIC,2017,"54,500 mi.",Gasoline,362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,9-Speed A/T,White,Black,None reported,Yes
3737,Ford,F-150 Tremor,2023,"5,500 mi.",Gasoline,400.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Gray,Black,None reported,Yes
1266,INFINITI,Q50 Premium,2014,"79,785 mi.",Gasoline,328.0HP 3.7L V6 Cylinder Engine Gasoline Fuel,7-Speed A/T,Red,White,At least 1 accident or damage reported,Yes
1612,Subaru,Impreza 2.0i Premium,2013,"112,000 mi.",Gasoline,148.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,CVT Transmission,Black,Beige,None reported,Yes


In [None]:
# Decide how each column is treated from its dtype in the training frame:
# non-numeric columns are one-hot encoded and numeric columns are passed
# through unchanged. Deriving the lists here keeps this cell independent of
# how the feature lists were named earlier and guarantees that no text
# column reaches the forest un-encoded.
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

preprocessor = ColumnTransformer([
    # handle_unknown='ignore': categories that appear only in the test split
    # are encoded as all zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', 'passthrough', numerical_features)
])

pipeline = Pipeline([
    ('preprocess', preprocessor),
    # Fixed seed so repeated runs of the notebook give the same model.
    ('rf', RandomForestRegressor(random_state=0))
])

pipeline.fit(X_train, y_train)


In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)

# RMSE is taken as the square root of the MSE. The `squared=False` shortcut
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)  # RMSE
print(mean_absolute_error(y_test, y_pred))  # MAE
print(r2_score(y_test, y_pred))  # R^2
