# Car Price Prediction

Predict the selling price of a used car from features such as year, present price, kilometers driven, fuel type, seller type, transmission, and owner.


In [None]:
import pandas as pd

# Read the raw CSV (file name contains a space) into a DataFrame
df = pd.read_csv(filepath_or_buffer='car data.csv')
# Preview the first five rows as the cell output
df.head(n=5)

In [None]:
# Quick structural overview of the loaded data:
# dimensions, column names, and the number of nulls in each column
print('Shape:', df.shape)
print('\nColumns:', list(df.columns))
print('\nMissing values:\n', df.isna().sum())


In [None]:
# Feature engineering
# Derive the car's age relative to a fixed reference year of 2020
# (this assumes no row in the dataset has a Year later than 2020).
df['Car_Age'] = 2020 - df['Year']

# Year is now represented by Car_Age, and the car name is not used as a
# feature, so both columns are removed.
df = df.drop(columns=['Year', 'Car_Name'])

# Put the target first, then the numeric features, then the categoricals.
cols = [
    'Selling_Price',
    'Present_Price',
    'Driven_kms',
    'Owner',
    'Car_Age',
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]
df = df[cols]
df.head(n=5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the target column with a kernel density estimate overlaid
plt.figure(figsize=(6, 4))
sns.histplot(data=df, x='Selling_Price', kde=True)
plt.title('Distribution of Selling Price')
plt.show()

# Correlation heatmap
# Restrict to numeric columns before computing correlations: the categorical
# columns (one-hot encoded only in a later cell) are presumably still strings
# here, and pandas >= 2.0 raises a ValueError from DataFrame.corr() on
# non-numeric data instead of silently dropping it as older versions did.
plt.figure(figsize=(8,6))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Expand the categorical columns into indicator columns; drop_first=True
# omits the first level of each one.
df_encoded = pd.get_dummies(df, drop_first=True)

# Separate the target from the predictors
y = df_encoded['Selling_Price']
X = df_encoded.drop(columns=['Selling_Price'])

# Hold out 20% of the rows for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print('X_train shape:', X_train.shape)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fit the linear baseline (fit() returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# Fit a 100-tree random forest with a fixed seed for reproducible results
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out test rows with both fitted models
y_pred_lr, y_pred_rf = lr.predict(X_test), rf.predict(X_test)

# Evaluation function
def eval_model(y_true, y_pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean
        squared error, the square root of the mean squared error, and
        the R2 score.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )

# Score each model's test-set predictions; each result is the
# (mae, mse, rmse, r2) tuple returned by eval_model.
rf_metrics = eval_model(y_test, y_pred_rf)
lr_metrics = eval_model(y_test, y_pred_lr)

# Report MAE, RMSE and R2 only: index 0 is MAE, and the slice [2:] skips
# the raw MSE at index 1 and yields (rmse, r2).
print('Linear Regression MAE: {:.3f}, RMSE: {:.3f}, R2: {:.3f}'.format(lr_metrics[0], *lr_metrics[2:]))
print('Random Forest MAE: {:.3f}, RMSE: {:.3f}, R2: {:.3f}'.format(rf_metrics[0], *rf_metrics[2:]))


In [None]:
# Rank the features by the importance scores of the fitted Random Forest
# and show them as a bar chart, most important first.
import pandas as pd
feat_imp = pd.Series(data=rf.feature_importances_, index=X.columns)
feat_imp = feat_imp.sort_values(ascending=False)
feat_imp.plot.bar(figsize=(10, 5))
plt.title('Feature Importances')
plt.show()


In [None]:
# Write the fitted Random Forest to disk with joblib
import joblib
joblib.dump(value=rf, filename='rf_car_price_model.joblib')
print('Saved model to rf_car_price_model.joblib')


## Conclusion

Random Forest typically outperforms Linear Regression on this dataset. Compare the R² and RMSE values printed above to decide which model to use; note that the notebook saves only the Random Forest model. Save the notebook and include it in your GitHub repo along with `car data.csv`.