In [1]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Cell 2: Read Data
df = pd.read_csv('used_car.csv')

In [3]:
# Cell 3: Split Car Names
def split_car_name(name):
    two_word_companies = ['Land Rover', 'Mercedes Benz', 'Maruti Suzuki', 
                         'Mercedes-Benz', 'Rolls Royce', 'Mini Cooper', 'MINI Cooper']
    name_parts = name.split()
    
    if ' '.join(name_parts[:2]) in two_word_companies:
        company = ' '.join(name_parts[:2])
        model = ' '.join(name_parts[2:])
    else:
        company = name_parts[0]
        model = ' '.join(name_parts[1:])
    
    return pd.Series([company, model], index=['company', 'model'])

df[['company', 'model']] = df['car_name'].apply(split_car_name)

In [4]:
df

Unnamed: 0,car_name,car_price_in_rupees,kms_driven,fuel_type,city,year_of_manufacture,company,model
0,Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...,₹ 4.45 Lakh,"22,402 km",Petrol,Mumbai,2016,Hyundai,Grand i10 Magna 1.2 Kappa VTVT [2017-2020]
1,Maruti Suzuki Alto 800 Lxi,₹ 2.93 Lakh,"10,344 km",Petrol,Kolkata,2019,Maruti Suzuki,Alto 800 Lxi
2,Tata Safari XZ Plus New,₹ 22.49 Lakh,"12,999 km",Diesel,Bangalore,2021,Tata,Safari XZ Plus New
3,Maruti Suzuki Ciaz ZXI+,₹ 6.95 Lakh,"45,000 km",Petrol,Thane,2016,Maruti Suzuki,Ciaz ZXI+
4,Jeep Compass Sport Plus 1.4 Petrol [2019-2020],₹ 12 Lakh,"11,193 km",Petrol,Kolkata,2019,Jeep,Compass Sport Plus 1.4 Petrol [2019-2020]
...,...,...,...,...,...,...,...,...
2100,Ford Figo Titanium1.5 TDCi,₹ 3.6 Lakh,"42,158 km",Diesel,Kolkata,2015,Ford,Figo Titanium1.5 TDCi
2101,MINI Cooper Countryman Cooper D,₹ 22 Lakh,"68,862 km",Diesel,Hyderabad,2013,MINI Cooper,Countryman Cooper D
2102,Hyundai Verna 1.6 VTVT SX,₹ 8.38 Lakh,"37,622 km",Petrol,Chennai,2018,Hyundai,Verna 1.6 VTVT SX
2103,Maruti Suzuki Ciaz VXi+ AT,₹ 6.75 Lakh,"64,726 km",Petrol,Mumbai,2017,Maruti Suzuki,Ciaz VXi+ AT


In [5]:
# Cell 4: Clean Price Data
def clean_price(price_str):
    if isinstance(price_str, str):
        if 'Lakh' in price_str:
            price_str = price_str.replace('₹', '').replace('Lakh', '').strip()
            try:
                return float(price_str) * 100000
            except ValueError:
                return None
        elif 'Crore' in price_str:
            price_str = price_str.replace('₹', '').replace('Crore', '').strip()
            try:
                return float(price_str) * 10000000
            except ValueError:
                return None
    return price_str

df['car_price_in_rupees'] = df['car_price_in_rupees'].apply(clean_price)

In [6]:
df

Unnamed: 0,car_name,car_price_in_rupees,kms_driven,fuel_type,city,year_of_manufacture,company,model
0,Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...,445000.0,"22,402 km",Petrol,Mumbai,2016,Hyundai,Grand i10 Magna 1.2 Kappa VTVT [2017-2020]
1,Maruti Suzuki Alto 800 Lxi,293000.0,"10,344 km",Petrol,Kolkata,2019,Maruti Suzuki,Alto 800 Lxi
2,Tata Safari XZ Plus New,2249000.0,"12,999 km",Diesel,Bangalore,2021,Tata,Safari XZ Plus New
3,Maruti Suzuki Ciaz ZXI+,695000.0,"45,000 km",Petrol,Thane,2016,Maruti Suzuki,Ciaz ZXI+
4,Jeep Compass Sport Plus 1.4 Petrol [2019-2020],1200000.0,"11,193 km",Petrol,Kolkata,2019,Jeep,Compass Sport Plus 1.4 Petrol [2019-2020]
...,...,...,...,...,...,...,...,...
2100,Ford Figo Titanium1.5 TDCi,360000.0,"42,158 km",Diesel,Kolkata,2015,Ford,Figo Titanium1.5 TDCi
2101,MINI Cooper Countryman Cooper D,2200000.0,"68,862 km",Diesel,Hyderabad,2013,MINI Cooper,Countryman Cooper D
2102,Hyundai Verna 1.6 VTVT SX,838000.0,"37,622 km",Petrol,Chennai,2018,Hyundai,Verna 1.6 VTVT SX
2103,Maruti Suzuki Ciaz VXi+ AT,675000.0,"64,726 km",Petrol,Mumbai,2017,Maruti Suzuki,Ciaz VXi+ AT


In [7]:
# Cell 5: Clean Kilometers Driven
def clean_kms_driven(kms_driven_str):
    if isinstance(kms_driven_str, str):
        kms_driven_str = kms_driven_str.replace('km', '').replace(',', '').strip()
        try:
            return int(kms_driven_str)
        except ValueError:
            return None
    return kms_driven_str

df['kms_driven'] = df['kms_driven'].apply(clean_kms_driven)

In [8]:
# Cell 6: Clean Fuel Type and Calculate Age
# Clean fuel type
df['fuel_type'] = df['fuel_type'].replace({
    'LPG': 'CNG',
    'Diesel + 1': 'Diesel',
    'Petrol + 1': 'Petrol'
})

# Calculate age
current_year = datetime.datetime.now().year
df['age'] = df['year_of_manufacture'].apply(lambda x: current_year - x)

In [9]:
# Cell 7: Arrange Columns and Create Clean Copy
columns_order = ['company', 'model', 'fuel_type', 'kms_driven', 
                'age', 'car_price_in_rupees', 'city']
df = df[columns_order]
df_clean = df.copy()

In [10]:
# Cell 8: Remove Low Frequency Companies and Hybrid Fuel Type
company_frequency = df_clean['company'].value_counts()
df_clean = df_clean[df_clean['company'].isin(company_frequency[company_frequency >= 15].index)]
df_clean = df_clean[df_clean['fuel_type'] != 'Hybrid']

In [11]:
# Cell 9: Create Dummy Variables
fuel_type_dummies = pd.get_dummies(df_clean['fuel_type'], prefix='fuel_type')
city_dummies = pd.get_dummies(df_clean['city'], prefix='city')

df_clean = pd.concat([df_clean, fuel_type_dummies, city_dummies], axis=1)
df_clean = df_clean.drop(['fuel_type', 'city'], axis=1)

In [12]:
# Cell 10: Enhanced Feature Engineering

# First ensure all numeric columns are properly converted and cleaned
df_clean['car_price_in_rupees'] = pd.to_numeric(df_clean['car_price_in_rupees'], errors='coerce')
df_clean['kms_driven'] = pd.to_numeric(df_clean['kms_driven'], errors='coerce')
df_clean['age'] = pd.to_numeric(df_clean['age'], errors='coerce')

# Drop any rows with null values in these columns
df_clean = df_clean.dropna(subset=['car_price_in_rupees', 'kms_driven', 'age'])

# Now create the categories
try:
    # Create price segments
    df_clean['price_segment'] = pd.qcut(df_clean['car_price_in_rupees'], 
                                      q=5, 
                                      labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    
    # Create age categories
    df_clean['age_category'] = pd.cut(df_clean['age'], 
                                     bins=[0, 3, 6, 9, 12, float('inf')],
                                     labels=['New', 'Slightly Used', 'Used', 'Old', 'Very Old'])
    
    # Create mileage categories
    df_clean['mileage_category'] = pd.qcut(df_clean['kms_driven'], 
                                          q=5, 
                                          labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    
    # Create interaction feature
    df_clean['age_kms'] = df_clean['age'] * df_clean['kms_driven']
    
except Exception as e:
    print(f"Error occurred: {str(e)}")
    print("\nDataframe info:")
    print(df_clean[['car_price_in_rupees', 'kms_driven', 'age']].info())

In [13]:
# Cell 11: Advanced Data Preprocessing
from sklearn.preprocessing import RobustScaler, LabelEncoder

# First ensure all features exist and have no nulls
required_features = ['age', 'kms_driven', 'age_kms', 'company', 'model', 
                    'price_segment', 'age_category', 'mileage_category']

# Check if all required columns exist
missing_columns = [col for col in required_features if col not in df_clean.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
    print("Current columns:", df_clean.columns.tolist())
else:
    # Use RobustScaler for numerical features
    numeric_features = ['age', 'kms_driven', 'age_kms']
    robust_scaler = RobustScaler()
    df_clean[numeric_features] = robust_scaler.fit_transform(df_clean[numeric_features])

    # Encode categorical variables
    categorical_features = ['company', 'model', 'price_segment', 'age_category', 'mileage_category']
    for feature in categorical_features:
        le = LabelEncoder()
        df_clean[feature] = le.fit_transform(df_clean[feature].astype(str))

print("\nPreprocessing completed successfully!")


Preprocessing completed successfully!


In [18]:
df

Unnamed: 0,company,model,fuel_type,kms_driven,age,car_price_in_rupees,city
0,Hyundai,Grand i10 Magna 1.2 Kappa VTVT [2017-2020],Petrol,22402,9,445000.0,Mumbai
1,Maruti Suzuki,Alto 800 Lxi,Petrol,10344,6,293000.0,Kolkata
2,Tata,Safari XZ Plus New,Diesel,12999,4,2249000.0,Bangalore
3,Maruti Suzuki,Ciaz ZXI+,Petrol,45000,9,695000.0,Thane
4,Jeep,Compass Sport Plus 1.4 Petrol [2019-2020],Petrol,11193,6,1200000.0,Kolkata
...,...,...,...,...,...,...,...
2100,Ford,Figo Titanium1.5 TDCi,Diesel,42158,10,360000.0,Kolkata
2101,MINI Cooper,Countryman Cooper D,Diesel,68862,12,2200000.0,Hyderabad
2102,Hyundai,Verna 1.6 VTVT SX,Petrol,37622,7,838000.0,Chennai
2103,Maruti Suzuki,Ciaz VXi+ AT,Petrol,64726,8,675000.0,Mumbai


In [14]:
# Cell 12: Feature Selection and Data Split
# Prepare features and target
X = df_clean.drop(['car_price_in_rupees'], axis=1)
y = df_clean['car_price_in_rupees']

# Split the data with a larger training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [15]:
# Cell 13: Model Training and Evaluation
# Initialize models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    }

In [16]:
# Cell 14: Hyperparameter Tuning for Best Model
# Example for XGBoost (usually performs best)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBRegressor(random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                          cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_

In [17]:
# Cell 15: Final Evaluation
# Make predictions with best model
final_predictions = best_model.predict(X_test)

# Calculate final metrics
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test, final_predictions)
final_mae = mean_absolute_error(y_test, final_predictions)

print("\nFinal Model Metrics:")
print(f"Root Mean Squared Error: ₹{final_rmse:,.2f}")
print(f"Mean Absolute Error: ₹{final_mae:,.2f}")
print(f"R-squared Score: {final_r2:.4f}")


Final Model Metrics:
Root Mean Squared Error: ₹1,080,484.16
Mean Absolute Error: ₹267,875.48
R-squared Score: 0.6171
