In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [4]:
# Location of the raw dataset. The default is the path this notebook was
# originally written against; set the CARS_CSV environment variable to run the
# notebook on a machine where the file lives elsewhere.
CARS_CSV_PATH = os.environ.get("CARS_CSV", "D:\\car.csv")

# Load the raw used-car listings into a DataFrame.
cars_data = pd.read_csv(CARS_CSV_PATH)


In [7]:
# Preview the first two rows to confirm the CSV was parsed into the expected columns.
cars_data.head(2)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0


In [4]:
# (rows, columns) of the raw data, before any cleaning.
cars_data.shape


(8128, 12)

In [None]:
#preprocessing


In [None]:
#Null Check

In [5]:
# Count the missing values in each column.
cars_data.isnull().sum()

name                    0
year                    0
selling_price           0
km_driven               0
fuel                    0
seller_type             0
transmission            0
owner                   0
mileage(km/ltr/kg)    221
engine                221
max_power             215
seats                 221
dtype: int64

In [6]:
# Discard every row that has at least one missing value.
cars_data = cars_data.dropna()

In [7]:
# (rows, columns) remaining after rows with missing values were dropped.
cars_data.shape


(7907, 12)

In [None]:
# Duplicate check

In [8]:
# Count rows that repeat an earlier row in every column.
cars_data.duplicated().sum()


np.int64(1189)

In [9]:
# Keep the first occurrence of each fully identical row and discard the repeats.
cars_data = cars_data.drop_duplicates()

In [10]:
# (rows, columns) remaining after duplicate rows were dropped.
cars_data.shape


(6718, 12)

In [11]:
# Display the frame after null and duplicate removal (pandas truncates the
# middle rows when rendering a long frame). The row labels keep their original
# values, so they are no longer contiguous.
cars_data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.40,1248.0,74,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.70,1497.0,78,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.00,1396.0,90,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.10,1298.0,88.2,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8121,Maruti Wagon R VXI BS IV with ABS,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.90,998.0,67.1,5.0
8122,Hyundai i20 Magna 1.4 CRDi,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54,1396.0,88.73,5.0
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.50,1197.0,82.85,5.0
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.80,1493.0,110,5.0


In [18]:
# Summarize each column's dtype and non-null count; useful for spotting numeric
# columns that were read as text (object dtype).
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                6718 non-null   object 
 1   year                6718 non-null   int64  
 2   selling_price       6718 non-null   int64  
 3   km_driven           6718 non-null   int64  
 4   fuel                6718 non-null   object 
 5   seller_type         6718 non-null   object 
 6   transmission        6718 non-null   object 
 7   owner               6718 non-null   object 
 8   mileage(km/ltr/kg)  6718 non-null   float64
 9   engine              6718 non-null   float64
 10  max_power           6718 non-null   object 
 11  seats               6718 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 682.3+ KB


In [None]:
# Data analysis

In [12]:
# Print the distinct values of every column to spot inconsistent spellings,
# unexpected categories, and numeric columns stored as text.
for col in cars_data.columns:
    # The original label was built by concatenation without a separating
    # space, which printed e.g. "unique value ofname".
    print(f'unique value of {col}')
    print(cars_data[col].unique())
    print("============")

unique value ofname
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
unique value ofyear
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2019 2008 2020 1999 2000 2003 2004 1994 1998 1997 1995 1996]
unique value ofselling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000   730000   650000   330000   366000
  1149000   425000  2100000   925000   675000   819999   390000  1500000
   700000  1450000  1090000   850000  1650000  1750000  1590000  1689999
  1425000   265000   190000   630000   540000   448000   745000  1025000
   235000  1700000  1200000   610000  2500000   484999   315000   29000

In [None]:
#Explore and Understand the Data

In [None]:
# Standardize categorical columns
def standardize_text(series):
    """Normalize a string Series into lowercase alphanumeric labels.

    Steps, in order: trim surrounding whitespace, lowercase, delete every
    character that is not ``a-z``, ``0-9`` or a space, collapse runs of spaces
    into a single space, and trim again.

    The last two steps matter because deleting punctuation can leave doubled
    or leading spaces behind: ``"Fourth & Above Owner"`` would otherwise become
    ``"fourth  above owner"`` (two spaces).

    Parameters
    ----------
    series : pandas.Series
        Series of strings. Missing values are passed through unchanged by the
        ``.str`` accessor.

    Returns
    -------
    pandas.Series
        Series of the same length and index with normalized text.
    """
    return (
        series.str.strip()
        .str.lower()
        .str.replace(r'[^a-z0-9 ]', '', regex=True)
        .str.replace(r' {2,}', ' ', regex=True)
        .str.strip()
    )

# Normalize the text of the categorical columns that are present in the frame.
for col in ['fuel', 'seller_type', 'transmission', 'owner']:
    if col in cars_data.columns:
        cars_data[col] = standardize_text(cars_data[col].astype(str))

# Convert year, selling_price, km_driven to numeric (if not already).
# Values that cannot be parsed become NaN and are removed by the range filter
# below, because NaN never falls inside a range.
numeric_cols = [col for col in ['year', 'selling_price', 'km_driven'] if col in cars_data.columns]
for col in numeric_cols:
    cars_data[col] = pd.to_numeric(cars_data[col], errors='coerce')

# Handle outliers: keep only rows where year, selling_price and km_driven each
# lie within that column's 1st-99th percentile range (bounds inclusive).
# Every column's bounds are computed on the same, not-yet-trimmed frame and the
# conditions are combined into one mask. Filtering column by column would
# compute the later percentiles on already-trimmed data and drop more rows
# than intended.
within_bounds = pd.Series(True, index=cars_data.index)
for col in numeric_cols:
    q_low = cars_data[col].quantile(0.01)
    q_high = cars_data[col].quantile(0.99)
    within_bounds &= cars_data[col].between(q_low, q_high)

# .copy() makes the result an independent frame so that later column
# assignments do not raise SettingWithCopyWarning.
# NOTE: this rebinds cars_data, so re-running the cell trims the data again.
cars_data = cars_data[within_bounds].copy()

# Show cleaned data info
cars_data.info()
cars_data.head()

### Using LLMs for EDA Insights

**Sample LLM Prompt:**
> Given the following distributions and relationships in the car dataset, suggest what patterns or issues to look for. Recommend additional visualizations if needed.

**Simulated LLM Response:**
- Look for skewness in `selling_price` and `km_driven`.
- Check if newer cars have higher selling prices.
- Investigate if outliers exist in `selling_price` or `km_driven`.
- Consider visualizing categorical features (e.g., boxplots of `selling_price` by `fuel` or `transmission`).

**How LLM Helped:**
- Provided interpretation of visual patterns.
- Suggested further visualizations for deeper insights.

In [None]:
# Visualize distributions of numerical features
num_cols = ['year', 'selling_price', 'km_driven']

# Resolve once which of the requested columns exist, and use that list for
# every plot. Previously only the histogram/boxplot loops tolerated a missing
# column; the pairplot and heatmap would fail with a KeyError.
plot_cols = [col for col in num_cols if col in cars_data.columns]

# Histograms (with a KDE overlay) of each numeric column
for col in plot_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(cars_data[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

# Boxplots for outlier detection
for col in plot_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=cars_data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()
    plt.close(fig)

if plot_cols:
    # Pairplot to visualize relationships
    sns.pairplot(cars_data[plot_cols])
    plt.show()

    # Correlation heatmap
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cars_data[plot_cols].corr(), annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Heatmap')
    plt.show()
    plt.close(fig)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Prepare features and target
target = 'selling_price'

# Work on a copy so the cleaned frame stays untouched and this cell can be
# re-run without side effects on cars_data.
model_frame = cars_data.copy()

# max_power is stored as text (object dtype in cars_data.info()). Left as-is it
# would be one-hot encoded as if every distinct power figure were its own
# category. Parse it as a number and drop the rows that cannot be parsed.
if 'max_power' in model_frame.columns and not pd.api.types.is_numeric_dtype(model_frame['max_power']):
    model_frame['max_power'] = pd.to_numeric(model_frame['max_power'], errors='coerce')
    model_frame = model_frame.dropna(subset=['max_power'])

# name is free text (make + model + variant). One-hot encoding it directly
# yields one dummy column per distinct listing name, many of which occur in
# only the train or only the test split. Reduce it to the make (first word).
if 'name' in model_frame.columns:
    model_frame['brand'] = model_frame['name'].astype(str).str.split().str[0].str.lower()
    model_frame = model_frame.drop(columns=['name'])

features = [col for col in model_frame.columns if col != target]

# One-hot encode categorical features. "Categorical" means any non-numeric
# column, which is more robust than comparing the dtype to 'object' (text
# columns may also use pandas' dedicated string dtype).
categorical_features = [
    col for col in features if not pd.api.types.is_numeric_dtype(model_frame[col])
]
data = pd.get_dummies(model_frame, columns=categorical_features, drop_first=True)

X = data.drop(columns=[target])
y = data[target]

# Split data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Train Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Train Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Evaluate models
import numpy as np
def print_metrics(model, X_test, y_test, name):
    """Print MAE, RMSE, R2 and MAPE of a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X_test)``.
    X_test : array-like or DataFrame
        Feature rows to predict on; passed to ``model.predict`` unchanged.
    y_test : array-like or Series
        True target values, in the same row order as ``X_test``.
    name : str
        Label printed in the report header.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Imported here so the helper only needs scikit-learn when it is called.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Work on plain float arrays so the arithmetic below is strictly positional
    # and never depends on pandas index alignment between targets and predictions.
    y_true = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(model.predict(X_test), dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Calculate MAPE (Mean Absolute Percentage Error).
    # A percentage error is undefined when the true value is 0, so those rows
    # are skipped instead of turning the whole metric into inf/nan.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = float('nan')

    print(f"--- {name} ---")
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("R2:", r2)
    print(f"MAPE: {mape:.2f}% (Mean Absolute Percentage Error)")

# Report the hold-out metrics of every fitted model, in training order.
for model_label, fitted_model in (
    ("Linear Regression", lr),
    ("Random Forest", rf),
    ("Decision Tree", dt),
):
    print_metrics(fitted_model, X_test, y_test, model_label)

# Compare each model's prediction with the actual price for the first five
# rows of the test set.
sample_X = X_test.iloc[:5]
sample_y = y_test.iloc[:5]
print("\nPredicted selling prices (Linear Regression):", lr.predict(sample_X))
print("Predicted selling prices (Random Forest):", rf.predict(sample_X))
print("Predicted selling prices (Decision Tree):", dt.predict(sample_X))
print("Actual selling prices:", sample_y.values)

## Predicting Car Selling Prices

In this section, we will:
- Prepare the data for modeling
- Train three regression models (Linear Regression, Random Forest, and Decision Tree)
- Evaluate their performance
- Predict how much a used car will sell for using the trained models

This step answers the main project goal: predicting the selling price of a used car.

## Exploratory Data Analysis (EDA)

In this section, we will:
- Visualize the distributions of key features
- Explore relationships between variables
- Use LLMs to suggest and interpret visualizations

Understanding the data visually helps identify patterns, trends, and potential issues before modeling.

## Data Cleaning and Preprocessing

In this section, we will:
- Standardize categorical columns (e.g., fuel, seller_type, transmission, owner)
- Convert columns to appropriate data types
- Handle outliers in numerical columns (e.g., year, selling_price, km_driven)
- Ensure the data is ready for analysis and modeling

These steps are based on LLM suggestions and best practices for tabular data.