In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found beneath the Kaggle input directory,
# one path per line, in the order os.walk visits them.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="background:linear-gradient(90deg, #FFF3E0, #EDE7F6); padding:20px; border-radius:12px; text-align:center;">
<span style="color:#FF6B00; font-size:32px; font-weight:bold;">Indian Car Dataset Analysis</span><br>
<span style="color:#2C3E50; font-size:18px;">A complete exploratory analysis of Indian cars - uncovering pricing, mileage, and brand performance trends.</span><br>
</div>


![Indian Car Dataset Analysis](https://www.evolve-h2020.eu/image_temp/1200X628_1200X628_crop_e158717b10b7c79082e001d3c6fd4902.png)

## <span style="color:#FF6B00;  font-size: 30px">📘 Introduction</span>
<span style="color:#FFFFFF;">
This project explores the **Indian Car Dataset**, focusing on car pricing, fuel types, and brand analysis.  
We aim to identify key factors that influence price and mileage, while visualizing important trends across brands.
</span>

## <span style="color:#FF6B00;  font-size: 30px ">📂 Dataset Overview</span>
<span style="color:#FFFFFF;">
    
- **Source:** Kaggle (Indian Cars)  
    
- **Attributes:** Brand, Model, Price, Mileage, Fuel Type, Transmission, Seats, etc.
   
- **Goal:** Understand how various features affect car pricing and performance.
</span>


In [None]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this also hides any deprecation warnings later cells may raise.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw car listing, decoding the file as ISO-8859-1 and letting pandas
# infer column types from the whole file at once (low_memory=False).
Data_df = pd.read_csv(
    r"/kaggle/input/all-cars-in-india-price-and-specifications/All_cars_dataset.csv",
    encoding="ISO-8859-1",
    low_memory=False,
)

# Quick confirmation: row/column counts and the column labels.
raw_shape = Data_df.shape
print("✅ Dataset loaded successfully!")
print("Shape:", raw_shape)
print(Data_df.columns)

In [None]:
# Preview the first rows of the raw frame (rendered as the cell output).
Data_df.head()

In [None]:
# Print column names, non-null counts and dtypes of the raw frame.
Data_df.info()



## <span style="color:#FF6B00;  font-size: 30px">🧹 Data Cleaning</span>
<span style="color:#FFFFFF;">
    
✅ Handled missing values (median fill for Engine and Mileage; rows without a Price dropped)  

✅ Renamed inconsistent columns  

✅ Removed duplicate rows  

✅ Handled outliers (using IQR method) 

✅ Converted price & mileage columns to numeric formats  
</span>

In [None]:
def clean_price(value):
    """Convert a price label such as '₹ 5 Lakh' or '1.5 Crore' to rupees.

    The currency markers "₹" and "Rs", thousands commas and the word "Lakh"
    are stripped. A bare number (or one followed by "Lakh") is interpreted as
    lakhs and multiplied by 100,000; a trailing "Crore" / "Cr" (any case) is
    interpreted as crores and multiplied by 10,000,000.

    Parameters
    ----------
    value : object
        Raw cell value (string, number, or a missing marker).

    Returns
    -------
    float
        Price in rupees, or ``np.nan`` when the value is missing or the
        remaining text is not a number.
    """
    if pd.isna(value):
        return np.nan

    text = str(value)
    for token in ("₹", ",", "Rs", "Lakh"):
        text = text.replace(token, "")
    text = text.strip()

    # Default unit is lakh, matching values written with or without "Lakh".
    multiplier = 100000
    lowered = text.lower()
    # "crore" is tested before "cr" so the longer suffix is removed whole.
    for suffix in ("crore", "cr"):
        if lowered.endswith(suffix):
            text = text[: -len(suffix)].strip()
            multiplier = 10000000
            break

    try:
        return float(text) * multiplier
    except ValueError:
        # Only unparseable text becomes NaN; other exceptions propagate.
        return np.nan

In [None]:
def clean_range_value(value):
    """Convert '9 - 40' → 24.5 or '18.2 kmpl' → 18.2.

    Every non-negative decimal number found in the text is extracted and the
    arithmetic mean of those numbers is returned, so a single value maps to
    itself and a range maps to its midpoint. Numbers are extracted
    individually rather than by deleting the characters between them, so
    text separating two numbers can no longer fuse their digits together.

    Parameters
    ----------
    value : object
        Raw cell value (string, number, or a missing marker).

    Returns
    -------
    float
        Mean of the numbers found, or ``np.nan`` when the value is missing
        or contains no number.
    """
    if pd.isna(value):
        return np.nan

    # Drop thousands separators so "1,200" is read as one number.
    text = str(value).replace(",", "")
    numbers = [float(token) for token in re.findall(r"\d+(?:\.\d+)?|\.\d+", text)]
    if not numbers:
        # Nothing numeric (e.g. "-" or "unknown"): avoid averaging an empty list.
        return np.nan
    return float(np.mean(numbers))

In [None]:
def clean_numeric(value, unit):
    """Strip a unit label (like 'cc' or 'L') and return the number.

    Every occurrence of ``unit`` and every thousands comma is removed before
    parsing; commas are dropped for consistency with ``clean_price``.

    Parameters
    ----------
    value : object
        Raw cell value (string, number, or a missing marker).
    unit : str
        Exact, case-sensitive unit text to remove.

    Returns
    -------
    float
        Parsed number, or ``np.nan`` when the value is missing or the
        remaining text is not a number.
    """
    if pd.isna(value):
        return np.nan
    text = str(value).replace(unit, "").replace(",", "").strip()
    try:
        return float(text)
    except ValueError:
        # Only unparseable text becomes NaN; other exceptions propagate.
        return np.nan

<span style="color:#F1C40F; font-size:18px;">→ Clean Price</span>

In [None]:
# Convert the Price column to numeric rupees when the column is present.
price_col = "Price"
if price_col in Data_df.columns:
    Data_df[price_col] = Data_df[price_col].map(clean_price)

<span style="color:#F1C40F; font-size:18px;">→ Clean Mileage</span>

In [None]:

# Convert the Mileage column to a single number per row when the column is present.
mileage_col = "Mileage"
if mileage_col in Data_df.columns:
    Data_df[mileage_col] = Data_df[mileage_col].map(clean_range_value)

<span style="color:#F1C40F; font-size:18px;">→  Clean Engine</span>

In [None]:
# Strip the "cc" label from ENGINE and parse the rest as a number.
engine_col = "ENGINE"
if engine_col in Data_df.columns:
    Data_df[engine_col] = Data_df[engine_col].apply(clean_numeric, args=("cc",))

<span style="color:#F1C40F; font-size:18px;">→ Clean Top Speed</span>


In [None]:
# Convert the Top_Speed column to a single number per row when the column is present.
top_speed_col = "Top_Speed"
if top_speed_col in Data_df.columns:
    Data_df[top_speed_col] = Data_df[top_speed_col].map(clean_range_value)


<span style="color:#F1C40F; font-size:18px;">→ Clean Fuel Capacity</span>

In [None]:
# Strip the "L" label from the fuel capacity column and parse the rest as a number.
fuel_capacity_col = "Fuel Capacity (L)"
if fuel_capacity_col in Data_df.columns:
    Data_df[fuel_capacity_col] = Data_df[fuel_capacity_col].apply(clean_numeric, args=("L",))


<span style="color:#F1C40F; font-size:18px;">→ Clean Seating Capacity</span>

In [None]:
# Coerce Seating Capacity to numeric; entries that cannot be parsed become NaN.
seating_col = "Seating Capacity"
if seating_col in Data_df.columns:
    Data_df[seating_col] = pd.to_numeric(Data_df[seating_col], errors="coerce")


<span style="color:#F1C40F; font-size:18px;">→ Handle missing values</span>

In [None]:
# Fill missing Engine with median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas and does not update the frame under Copy-on-Write.
if "ENGINE" in Data_df.columns:
    Data_df["ENGINE"] = Data_df["ENGINE"].fillna(Data_df["ENGINE"].median())

In [None]:
# Fill Mileage missing with median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas and does not update the frame under Copy-on-Write.
if "Mileage" in Data_df.columns:
    Data_df["Mileage"] = Data_df["Mileage"].fillna(Data_df["Mileage"].median())

In [None]:
# Remove, in place, every row whose Price is missing.
price_missing = Data_df["Price"].isna()
Data_df.drop(index=Data_df.index[price_missing], inplace=True)

In [None]:
# Show the ten columns that still have the most missing values.
missing_per_column = Data_df.isnull().sum()
worst_ten = missing_per_column.sort_values(ascending=False).head(10)
print("\n✅ Missing values handled successfully!")
print(worst_ten)

In [None]:
# Descriptive statistics for the cleaned frame.
summary_table = Data_df.describe()
print("\nDataset Summary:\n")
print(summary_table)

## <span style="color:#FF6B00; font-size:30px;">📈 Outlier Treatment</span>
<span style="color:#FFFFFF;">
    
Outliers were detected using boxplots and treated via the **IQR method**.  
    
Rows with extreme values in `Price`, `Mileage`, and `Top_Speed` (more than 1.5 × IQR beyond the quartiles) were removed to maintain data integrity.
</span>

In [None]:
# Write the cleaned frame to the working directory without the index column.
# NOTE: this file is written before the IQR outlier step below runs, so it
# still contains the rows that step later filters out of df_clean.
Data_df.to_csv("All_cars_dataset_cleaned.csv", index=False)
print("\n🎉 Cleaned dataset saved as 'All_cars_dataset_cleaned.csv'")
print("Final shape:", Data_df.shape)

In [None]:

# Load the cleaned dataset written by the previous cell
df = pd.read_csv("All_cars_dataset_cleaned.csv")

# Columns examined by the boxplots and the IQR outlier filter below
n_columns = ['Price', 'Mileage', 'Top_Speed']

print("✅ Numeric columns selected:", n_columns)


In [None]:
# Boxplots of the selected numeric columns as loaded from the cleaned CSV,
# i.e. before any IQR-based outlier rows are removed.
plt.figure(figsize=(12, 8))
for i, col in enumerate(n_columns, 1):
    plt.subplot(2, 2, i)  # 2x2 grid, one slot per column
    sns.boxplot(x=df[col], color='salmon')
    # The data is already cleaned here, so the stage is labelled as
    # pre-outlier-removal to pair with the "After Outlier Removal" plots.
    plt.title(f'Boxplot of {col} (Before Outlier Removal)', fontsize=11)
plt.tight_layout()
plt.show()


In [None]:
# Quartiles and interquartile range, one entry per selected column
numeric_view = df[n_columns]
Q1, Q3 = numeric_view.quantile(0.25), numeric_view.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Count, per column, the values lying strictly outside the fences
below_fence = numeric_view.lt(lower_bound)
above_fence = numeric_view.gt(upper_bound)
outlier_count = (below_fence | above_fence).sum()
print("🔍 Number of outliers in each column:\n", outlier_count)


In [None]:
# Function to remove outliers column-wise
def remove_outliers_iqr(data, col):
    """Return the rows of ``data`` whose ``col`` value is not an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of ``data[col]``. Rows where ``col``
    is missing are kept: a missing value is not an outlier, and dropping it
    here would count it among the removed outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        Subset of ``data`` with the original index labels preserved.
    """
    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # between() is inclusive on both ends, matching ">= lower and <= upper".
    keep = values.between(lower, upper) | values.isna()
    return data[keep]

# Filter each selected column in turn, starting from a copy so df is untouched.
# Later columns are filtered on the rows that survived the earlier ones.
df_clean = df.copy()
for col in n_columns:
    rows_before = len(df_clean)
    df_clean = remove_outliers_iqr(df_clean, col)
    rows_dropped = rows_before - len(df_clean)
    print(f"✅ {col}: removed {rows_dropped} outliers")

print("\n🎯 Final shape after removing outliers:", df_clean.shape)


In [None]:
# One boxplot per selected column of the outlier-filtered frame, on a 2x2 grid.
after_fig = plt.figure(figsize=(12, 8))
for slot, column in enumerate(n_columns, start=1):
    panel = after_fig.add_subplot(2, 2, slot)
    sns.boxplot(x=df_clean[column], color='lightgreen', ax=panel)
    panel.set_title(f'Boxplot of {column} (After Outlier Removal)', fontsize=11)
after_fig.tight_layout()
plt.show()



## ✅ Interpretation:

- The boxplots should now look more compact and smooth.

- Extreme data points (very high or low values) have been removed.

## <span style="color:#FF6B00; font-size:30px;">📊 Exploratory Data Analysis</span>

<span style="color:#FFFFFF;">
The dataset was visualized to understand patterns and relationships between features:
</span>

- <span style="color:#F1C40F;">**Price Distribution:**</span> Shows how car prices are spread across segments.  
- <span style="color:#F1C40F;">**Brand Analysis:**</span> Identifies top brands with high-value cars.  
- <span style="color:#F1C40F;">**Fuel Type Breakdown:**</span> Compares fuel type preferences (Petrol vs Diesel vs Electric).  
- <span style="color:#F1C40F;">**Mileage vs Price:**</span> Highlights efficiency trends.  


In [None]:
# Reload the cleaned CSV as the frame used by every EDA cell below.
# NOTE(review): this file was saved before the IQR outlier filter ran, so df
# still contains those rows and df_clean is not used from here on. Confirm
# that is intended.
df = pd.read_csv("All_cars_dataset_cleaned.csv")
print("✅ Data loaded successfully!")
print(df.shape)
# Last expression of the cell: rendered as the cell output.
df.head()

In [None]:
print("\n🔹 Basic Info:")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:

# Count of missing values in each column of the reloaded frame.
print("\n🔹 Missing Values:")
print(df.isnull().sum())


In [None]:
# Descriptive statistics produced by DataFrame.describe().
print("\n🔹 Statistical Summary:")
print(df.describe())

<span style="color:#F1C40F; font-size:28px;">→ Distribution of Numeric Columns</span>

In [None]:
# Columns to plot; any that are absent from the frame are skipped.
n_columns = ["Price", "Mileage", "ENGINE", "Top_Speed"]

for col in n_columns:
    if col not in df.columns:
        continue
    # One figure per column: 20-bin histogram with a KDE curve overlaid.
    hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[col], kde=True, bins=20, color="skyblue", ax=hist_ax)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Count")
    plt.show()



## <span style="color:#8E44AD; font-size:30px;">💡 Insights</span>

<span style="color:#F1C40F;">✔️ See how price, engine size, or mileage are distributed.</span>  
<span style="color:#F1C40F;">✔️ Skewed distributions hint at premium or budget car clusters.</span> 


<span style="color:#F1C40F; font-size:28px;">→ Correlation Heatmaps</span>

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=heat_ax)
heat_ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()



## <span style="color:#8E44AD; font-size:30px;">💡 Insights</span>

<span style="color:#F1C40F;">✔️ Helps identify relationships between variables — e.g., does higher engine size increase price?</span>  



<span style="color:#F1C40F; font-size:28px;">→ Price vs Fuel Types</span>

In [None]:
# Price spread for each fuel type; skipped when the column is absent.
if "FUEL TYPE" in df.columns:
    fuel_fig, fuel_ax = plt.subplots(figsize=(12, 5))
    sns.boxplot(data=df, x="FUEL TYPE", y="Price", palette="pastel", ax=fuel_ax)
    fuel_ax.set_title("Price Distribution by Fuel Type")
    plt.show()



## <span style="color:#8E44AD; font-size:30px;">💡 Insights</span>

<span style="color:#F1C40F;">✔️ Compare petrol vs diesel vs electric cars in terms of price.</span> 

<span style="color:#F1C40F; font-size:28px;">→ Transmission Type vs Price</span>

In [None]:
# Price spread for each transmission type; skipped when the column is absent.
if "TRANSMISSION" in df.columns:
    trans_fig, trans_ax = plt.subplots(figsize=(20, 5))
    sns.boxplot(data=df, x="TRANSMISSION", y="Price", palette="Set2", ax=trans_ax)
    trans_ax.set_title("Price by Transmission Type")
    plt.show()


<span style="color:#F1C40F; font-size:28px;">→ Engine vs Mileage Relationships</span>

In [None]:
# Engine value against mileage, with points coloured by fuel type.
engine_fig, engine_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=df,
    x="ENGINE",
    y="Mileage",
    hue="FUEL TYPE",
    alpha=0.7,
    ax=engine_ax,
)
engine_ax.set_title("ENGINE vs Mileage by Fuel Type")
plt.show()



## <span style="color:#8E44AD; font-size:30px;">💡 Insight</span>

<span style="color:#F1C40F;">✔️ See how bigger engines affect fuel efficiency.</span>

<span style="color:#F1C40F; font-size:28px;">→ Price vs Mileage</span>

In [None]:
# Mileage against price, with points coloured by fuel type.
mileage_fig, mileage_ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=df,
    x="Mileage",
    y="Price",
    hue="FUEL TYPE",
    alpha=0.7,
    ax=mileage_ax,
)
mileage_ax.set_title("Price vs Mileage")
plt.show()


<span style="color:#F1C40F; font-size:28px;">→ Top 10 Most Expensive Cars</span>

In [None]:
# Keep only the name and price columns, then take the ten highest-priced rows.
name_and_price = df[["Name", "Price"]]
top10 = name_and_price.nlargest(10, "Price")

# Horizontal bars: one per car, bar length equal to its price.
top_fig, top_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=top10, x="Price", y="Name", palette="viridis", ax=top_ax)
top_ax.set_title("Top 10 Most Expensive Cars")
plt.show()


## <span style="color:#FF6B00; font-size:30px;">💡 Summary Insights</span>

<span style="color:#F1C40F;">✔️ Petrol cars dominate the market.</span>  
<span style="color:#F1C40F;">✔️ SUV and Sedan categories have higher price variance.</span>  
<span style="color:#F1C40F;">✔️ Mileage shows a mild negative correlation with price.</span>  
<span style="color:#F1C40F;">✔️ Toyota and Maruti brands balance price and fuel efficiency effectively.</span>


## <span style="color:#FF6B00; font-size:30px;">📊 Final Visualization Highlights</span>

<div style="background-color:#8E44AD; border-radius:10px; padding:15px;">
<span style="color:#FFFFFF;">
    
- Boxplots for price and mileage distribution

- Bar chart of the top 10 most expensive cars
  
- Heatmap showing correlation between numeric features  
</span>
</div>

## <span style="color:#FF6B00; font-size:30px;">Recommendations</span>
### 🔧 For Manufacturers:

Focus R&D on fuel-efficient petrol and hybrid engines — aligns with demand for affordability and performance.

Introduce automatic variants in mid-range cars; consumer preference is shifting.

Invest in EV development and charging infrastructure to capture the emerging eco-segment.

### 🛒 For Consumers:

Petrol cars offer the best balance of price, maintenance, and mileage for everyday users.

Diesel cars remain ideal for long-distance travelers seeking mileage advantage.

EVs are future-proof choices with minimal running cost but require initial investment.

### 📉 For Data Analysts / Modelers:

Engine size, fuel type, and transmission are the most predictive features for car price.

The dataset is suitable for regression modeling (predicting price or mileage).

After feature engineering, models like Linear Regression, Random Forest, or XGBoost can perform well.

<HR/>


## <span style="color:#FF6B00; font-size:30px;">🏁 Conclusion</span>
<span style="color:#FFFFFF;">
The analysis highlights how fuel type, brand, and mileage shape the pricing trends in the Indian car market.  
These insights can guide both consumers and manufacturers in making informed decisions.
</span>


 ## <span style="color:#FF6B00; font-size:30px;">🏁 Final Summary</span>
<span style="color:#FFFFFF;">

| Aspect               | Key Takeaway                              |
| -------------------- | ----------------------------------------- |
| Market Segment       | Mostly budget & mid-range cars            |
| Key Drivers of Price | Engine, Transmission, Fuel Type           |
| Market Shift         | Toward automatic and EV vehicles          |
| Data Quality         | Cleaned and ready for modeling            |
| Project Outcome      | Complete EDA + insights + recommendations |

</span>



<div style="text-align:center; background:linear-gradient(90deg,#FF6B00,#8E44AD); padding:15px; border-radius:12px;">
<span style="color:#FFFFFF; font-size:20px;">Thank You ✨</span><br>
    <p style="color:#F1C40F; text-align:center; "> "We appreciate your time reviewing this analysis."</p>
<span style="color:#F1C40F;">“Data speaks louder than assumptions.”</span>
</div>
<p style="text-align:center; color:gray; font-size:14px; margin-top:10px;">
  Indian Car Dataset Analysis | 2025
</p>