In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

In [15]:
# Load the EDA output, then keep only the listings whose price is strictly
# below the 95th percentile of the `price` column.
# NOTE(review): the cutoff is computed on the whole file, i.e. before the
# train/test split that happens later in the notebook.
df = pd.read_csv('EDA_data1.csv')
q95 = df['price'].quantile(0.95)
df_filtered = df.loc[df['price'].lt(q95)].copy()  # .copy(): new columns are added to this frame later

In [16]:
# Keep the 50 most frequent model names as their own category and collapse
# every other value into the single bucket 'Other'.
top_models = df_filtered['model'].value_counts().nlargest(50).index
is_top_model = df_filtered['model'].isin(top_models)
df_filtered['model_grouped'] = df_filtered['model'].where(is_top_model, 'Other')

In [17]:
def extract_hp(text):
    """Pull the horsepower figure out of a free-text engine description.

    Searches for the first number (integer or decimal) that is followed by
    the unit ``HP``. The unit is matched case-insensitively and may be
    separated from the number by whitespace, so ``"300HP"``, ``"300 HP"``
    and ``"300.0hp"`` all yield ``300.0``.

    Parameters
    ----------
    text : object
        Engine description. It is converted with ``str()`` before the
        search, so non-string values (e.g. NaN) are accepted.

    Returns
    -------
    float or None
        The horsepower value, or ``None`` when no ``<number>HP`` token is
        present in the text.
    """
    # \s* tolerates "300 HP"; IGNORECASE tolerates "300hp" / "300Hp".
    match = re.search(r'(\d+\.?\d*)\s*HP', str(text), flags=re.IGNORECASE)
    return float(match.group(1)) if match else None

# Parse horsepower out of the `engine` text; rows where nothing could be
# parsed are filled with the median of the values that were parsed.
# NOTE(review): the median is taken over every row of df_filtered, i.e. before
# the train/test split later in the notebook.
hp_parsed = df_filtered['engine'].apply(extract_hp)
df_filtered['horsepower'] = hp_parsed.fillna(hp_parsed.median())

In [18]:
# Columns handed to the model, split by how the preprocessing step treats them.
categorical_features = ['brand', 'model_grouped', 'fuel_type', 'transmission']
# NOTE(review): 'accident' is treated as numeric — presumably already encoded
# as a number in the input file; confirm.
numerical_features = ['model_year', 'milage', 'accident', 'horsepower']

# Full feature list (fixes the column order of X) and the regression target.
features = [
    'brand',
    'model_grouped',
    'model_year',
    'milage',
    'fuel_type',
    'transmission',
    'accident',
    'horsepower',
]
target = 'price'

# Design matrix and target vector.
X = df_filtered[features]
y = df_filtered[target]

In [19]:
# One-hot encode the categorical columns and pass the numeric ones through
# unchanged. handle_unknown='ignore' keeps predict from failing on categories
# that were not present when the encoder was fitted.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_encoder, categorical_features),
    ('num', 'passthrough', numerical_features),
])

# Chain preprocessing and the regressor so both are fitted in a single call.
forest = RandomForestRegressor(n_estimators=200, random_state=42)  # increased trees
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Fit the whole pipeline (preprocessing + random forest) on the training split only.
model.fit(X_train, y_train)

In [22]:
# Predict prices for the held-out rows using the fitted pipeline.
y_pred = model.predict(X_test)

In [23]:
# Coefficient of determination of the predictions on the held-out split.
score = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f"New R-squared Score: {score}"
)

New R-squared Score: 0.760339801787544
