**Clean Data**

In [None]:
import pandas as pd


df = pd.read_csv('carData.csv')
df.loc[df['fuel_type'].isnull() & df['engine'].str.contains('Electric Motor', case=False, na=False), 'fuel_type'] = 'Electric'

df.loc[df['fuel_type'].isnull()].head()
df.loc[df['fuel_type'].isnull() & (df['engine'].str.lower() == 'electric'), 'fuel_type'] = 'Electric'
df.loc[df['fuel_type'].isnull() & (df['brand'].str.lower() == 'tesla'), 'fuel_type'] = 'Electric'
df.loc[df['fuel_type'].isnull() & df['engine'].str.contains('battery', case=False, na=False), 'fuel_type'] = 'Electric'

df['milage'] = df['milage'].str.replace(r'[^\d]', '', regex=True).astype(int)
df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(int)

df['fuel_type'] = df['fuel_type'].replace('Plug-In Hybrid', 'Hybrid')
df['clean_title'] = df['clean_title'].fillna('No')
df['accident'] = df['accident'].fillna('Unknown')
df.to_csv('cleaned_car_data.csv',index=False)

df = pd.read_csv('cleaned_car_data.csv')
df['model'].unique().shape[0]


# df.head()

1898

In [50]:
import pandas as pd
import plotly.express as px


df = pd.read_csv('cleaned_car_data.csv')
df.columns


px.histogram(df, x='price',nbins=25)

**Split Data**

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# 1. Load the dataset
df = pd.read_csv('cleaned_car_data.csv')
df = df[df['price'] < 250000]  # Optional cutoff to reduce extreme outliers

# 2. Convert 'clean_title' to binary
df['clean_title'] = df['clean_title'].map({'Yes': 1, 'No': 0}).astype(int)

# 3. Target encode 'brand' and 'model'
df['brand_encoded'] = df.groupby('brand')['price'].transform('mean')
df['model_encoded'] = df.groupby('model')['price'].transform('mean')

# 4. List of features
numerical_features = ['model_year', 'milage']
categorical_features = ['fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident']  # removed 'model'
binary_features = ['clean_title']
target_encoded_features = ['brand_encoded', 'model_encoded']

# 5. One-hot encode remaining categorical features
df_encoded = pd.get_dummies(df[categorical_features], drop_first=True)

# 6. Combine all features into X
X = pd.concat([df[numerical_features + binary_features + target_encoded_features], df_encoded], axis=1)
y = df['price']

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 8. Train the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# 9. Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")

# 10. Feature importances
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

feature_importance_df.to_csv('test.csv', index=False)


Mean Squared Error: 91154503.38
Mean Absolute Error: 5130.60
R^2 Score: 0.93


In [11]:
import plotly.express as px
import pandas as pd
# Create a DataFrame for comparison
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred,
    'Brand': df.loc[y_test.index, 'brand']
})



fig = px.scatter(
    results_df,
    x='Actual Price',
    y='Predicted Price',
    title='Actual vs Predicted Car Prices',
    labels={'Actual Price': 'Actual Price ($)', 'Predicted Price': 'Predicted Price ($)'},
    opacity=0.6,
    trendline="ols",
    hover_name='Brand'
)


fig.add_shape(
    type="line",
    x0=results_df['Actual Price'].min(), y0=results_df['Actual Price'].min(),
    x1=results_df['Actual Price'].max(), y1=results_df['Actual Price'].max(),
    line=dict(color="red", dash="dash"),
    name='Perfect Prediction'
)


fig.show()


In [12]:

# 1. Correct error calculation
results_df['Error'] = results_df['Actual Price'] - results_df['Predicted Price']

# 2. Compute average error per brand
avg_errors = results_df.groupby('Brand')['Error'].mean().reset_index(name='Avg Error')

# 3. Merge average error back into results_df (optional)
results_df = results_df.merge(avg_errors, on='Brand')
# Group to get one avg error per brand
avg_error_df = results_df.groupby('Brand', as_index=False)['Error'].mean()
avg_error_df.rename(columns={'Error': 'Avg Error'}, inplace=True)
avg_error_df = avg_error_df.sort_values(by='Avg Error')
# Now plot
fig = px.bar(
    avg_error_df,
    x='Brand',
    y='Avg Error',
    color = 'Brand',
    title='Average Prediction Error by Brand',
    labels={'Avg Error': 'Average Error ($)'},
)
fig.show()