1. Data Loading and Initial Exploration

In [31]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle
from pathlib import Path
import os
from pathlib import Path

# Define paths using your project structure
DATA_DIR = Path("data")
RAW_DATA_PATH = DATA_DIR / "raw/quikr_car.csv"


# 1. Load raw data with error handling
try:
    car = pd.read_csv(RAW_DATA_PATH)
    print(f"✅ Successfully loaded raw data from {RAW_DATA_PATH}")
    print(f"Shape: {car.shape}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at {RAW_DATA_PATH}")
    print("Please verify:")
    print(f"- The 'data/raw' directory exists")
    print(f"- The 'quikr_car.csv' file exists in that directory")
    raise
    
except Exception as e:
    print(f"❌ Error loading CSV file: {str(e)}")
    raise


# Initial exploration
print("First 10 rows:")
print(car.head(10))
print("\nDataset shape:", car.shape)
print("\nData types and missing values:")
print(car.info())

✅ Successfully loaded raw data from data\raw\quikr_car.csv
Shape: (892, 6)
First 10 rows:
                                     name   company  year          Price  \
0    Hyundai Santro Xing XO eRLX Euro III   Hyundai  2007         80,000   
1                 Mahindra Jeep CL550 MDI  Mahindra  2006       4,25,000   
2              Maruti Suzuki Alto 800 Vxi    Maruti  2018  Ask For Price   
3  Hyundai Grand i10 Magna 1.2 Kappa VTVT   Hyundai  2014       3,25,000   
4        Ford EcoSport Titanium 1.5L TDCi      Ford  2014       5,75,000   
5        Ford EcoSport Titanium 1.5L TDCi      Ford  2015  Ask For Price   
6                               Ford Figo      Ford  2012       1,75,000   
7                             Hyundai Eon   Hyundai  2013       1,90,000   
8        Ford EcoSport Ambiente 1.5L TDCi      Ford  2016       8,30,000   
9          Maruti Suzuki Alto K10 VXi AMT    Maruti  2015       2,50,000   

   kms_driven fuel_type  
0  45,000 kms    Petrol  
1      40 kms    Dies

2. Data Cleaning and Preprocessing

In [32]:
# Create backup
backup = car.copy()

# Clean year column
car = car[car['year'].str.isnumeric()]
car['year'] = car['year'].astype(int)

# Clean price column
car = car[car['Price'] != 'Ask For Price']
car['Price'] = car['Price'].str.replace(',', '').astype(int)

# Clean kms_driven column
car['kms_driven'] = car['kms_driven'].str.split().str.get(0).str.replace(',', '')
car = car[car['kms_driven'].str.isnumeric()]
car['kms_driven'] = car['kms_driven'].astype(int)

# Handle missing values in fuel_type
car = car[~car['fuel_type'].isna()]

# Simplify car names
car['name'] = car['name'].str.split().str.slice(start=0, stop=3).str.join(' ')

# Remove outliers
car = car[car['Price'] < 6000000]

# Reset index
car = car.reset_index(drop=True)

# Save cleaned data
PROCESSED_DIR = DATA_DIR / "processed"
CLEANED_DATA_PATH = PROCESSED_DIR / "Cleaned_Car_data.csv"
# Ensure the processed directory exists
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
# Save the cleaned data with error handling
try:
    car.to_csv(CLEANED_DATA_PATH, index=False)
    print(f"✅ Successfully saved cleaned data to {CLEANED_DATA_PATH}")
    print(f"File size: {os.path.getsize(CLEANED_DATA_PATH)/1024:.2f} KB")
    
except PermissionError:
    print(f"❌ Permission denied: Could not save to {CLEANED_DATA_PATH}")
    print("Please check your write permissions for this location")
    
except Exception as e:
    print(f"❌ Error saving cleaned data: {str(e)}")
    print("Please verify:")
    print(f"- The directory {PROCESSED_DIR} exists")
    print(f"- You have sufficient disk space")


# Final dataset info
print("\nCleaned dataset info:")
print(car.info())
print("\nDescriptive statistics:")
print(car.describe(include='all'))

✅ Successfully saved cleaned data to data\processed\Cleaned_Car_data.csv
File size: 40.80 KB

Cleaned dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815 entries, 0 to 814
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        815 non-null    object
 1   company     815 non-null    object
 2   year        815 non-null    int64 
 3   Price       815 non-null    int64 
 4   kms_driven  815 non-null    int64 
 5   fuel_type   815 non-null    object
dtypes: int64(3), object(3)
memory usage: 38.3+ KB
None

Descriptive statistics:
                       name company         year         Price     kms_driven  \
count                   815     815   815.000000  8.150000e+02     815.000000   
unique                  254      25          NaN           NaN            NaN   
top     Maruti Suzuki Swift  Maruti          NaN           NaN            NaN   
freq                     51     221          NaN       

3. Exploratory Data Analysis (EDA)

In [33]:
# Set style for plots
sns.set_style("whitegrid")

# Price distribution by company
plt.figure(figsize=(15, 7))
ax = sns.boxplot(x='company', y='Price', data=car)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.title('Price Distribution by Company')
plt.tight_layout()
plt.savefig('plots/price_by_company.png')
plt.close()

# Price vs Year
plt.figure(figsize=(20, 10))
ax = sns.swarmplot(x='year', y='Price', data=car, size=3)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
plt.title('Price vs Manufacturing Year')
plt.tight_layout()
plt.savefig('plots/price_vs_year.png')
plt.close()

# Price vs Kilometers Driven
plt.figure(figsize=(12, 6))
sns.relplot(x='kms_driven', y='Price', data=car, height=6, aspect=1.5)
plt.title('Price vs Kilometers Driven')
plt.tight_layout()
plt.savefig('plots/price_vs_kms.png')
plt.close()

# Price distribution by fuel type
plt.figure(figsize=(14, 7))
sns.boxplot(x='fuel_type', y='Price', data=car)
plt.title('Price Distribution by Fuel Type')
plt.tight_layout()
plt.savefig('plots/price_by_fuel_type.png')
plt.close()

# Multivariate analysis
plt.figure(figsize=(15, 8))
ax = sns.relplot(x='company', y='Price', data=car, hue='fuel_type', size='year', 
                height=7, aspect=2, sizes=(40, 200), alpha=0.7)
ax.set_xticklabels(rotation=40, ha='right')
plt.title('Price by Company, Fuel Type, and Year')
plt.tight_layout()
plt.savefig('plots/multivariate_analysis.png')
plt.close()

  ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')
  ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha='right')


<Figure size 1200x600 with 0 Axes>

<Figure size 1500x800 with 0 Axes>

4. Feature Engineering and Model Preparation

In [34]:
# Prepare features and target
X = car[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = car['Price']

# Initialize OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(X[['name', 'company', 'fuel_type']])

# Create column transformer
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['name', 'company', 'fuel_type']),
    remainder='passthrough',
    force_int_remainder_cols=False
)

# Create pipeline
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)

5. Model Training and Evaluation

In [35]:
# Find best random state for train-test split
best_score = -1
best_random_state = 0
scores = []

for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    current_score = r2_score(y_test, y_pred)
    scores.append(current_score)
    
    if current_score > best_score:
        best_score = current_score
        best_random_state = i

print(f"\nBest R2 score: {best_score:.4f} at random state: {best_random_state}")

# Train final model with best random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=best_random_state)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nFinal Model Metrics:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")


# 1. Define paths (using pathlib for cross-platform compatibility)
MODELS_DIR = Path("models")
MODEL_PATH = MODELS_DIR / "LinearRegressionModel.pkl"

# 2. Create models directory if it doesn't exist
MODELS_DIR.mkdir(exist_ok=True)

# 3. Save the model safely using context manager
try:
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(pipe, f)
    print(f"✅ Model successfully saved to {MODEL_PATH}")
except Exception as e:
    print(f"❌ Failed to save model: {str(e)}")


Best R2 score: 0.8991 at random state: 302

Final Model Metrics:
MAE: 98320.65
MSE: 19368279318.63
RMSE: 139169.97
R² Score: 0.8991
✅ Model successfully saved to models\LinearRegressionModel.pkl


6. Model Interpretation and Analysis

In [36]:
# Feature Importance Analysis
def get_feature_importance(pipe, X_train):
    # Get feature names after one-hot encoding
    ohe = pipe.named_steps['columntransformer'].named_transformers_['onehotencoder']
    feature_names = ohe.get_feature_names_out(['name', 'company', 'fuel_type'])
    feature_names = np.append(feature_names, ['year', 'kms_driven'])
    
    # Get coefficients
    coefficients = pipe.named_steps['linearregression'].coef_
    
    # Create DataFrame for visualization
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': coefficients
    }).sort_values('Importance', key=abs, ascending=False)
    
    return importance_df

# Get and display feature importance
importance_df = get_feature_importance(pipe, X_train)
print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.savefig('plots/feature_importance.png')
plt.close()

# Residual Analysis
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.savefig('plots/residual_plot.png')
plt.close()


Top 10 Most Important Features:
                          Feature     Importance
254                  company_Audi  934605.465994
172            name_Mini Cooper S  644333.935200
270                  company_Mini  644333.935200
264                company_Jaguar  547290.473510
256             company_Chevrolet -488781.705456
255                   company_BMW  454792.570260
269              company_Mercedes  454511.502420
174  name_Mitsubishi Pajero Sport  435683.732532
275                  company_Tata -415586.932574
271            company_Mitsubishi  384565.814932


7. Model Validation 

In [37]:
from sklearn.model_selection import cross_val_score

# Cross-validation
cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='r2')
print("\nCross-Validation R2 Scores:", cv_scores)
print(f"Mean R2: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Learning Curve (New)
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    pipe, X, y, cv=5, scoring='r2', 
    train_sizes=np.linspace(0.1, 1.0, 10))

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
plt.xlabel('Training set size')
plt.ylabel('R2 Score')
plt.title('Learning Curve')
plt.legend()
plt.tight_layout()
plt.savefig('plots/learning_curve.png')
plt.close()


Cross-Validation R2 Scores: [0.38436886 0.75135427 0.57419953 0.68409945 0.52757092]
Mean R2: 0.5843 (±0.1274)


8. Model Deployment Preparation

In [38]:
# Create example predictions with proper DataFrame construction
examples = [
    {'name': 'Maruti Suzuki Swift', 'company': 'Maruti', 'year': 2019, 'kms_driven': 100, 'fuel_type': 'Petrol'},
    {'name': 'Hyundai Creta', 'company': 'Hyundai', 'year': 2020, 'kms_driven': 5000, 'fuel_type': 'Diesel'},

]

print("\nExample Predictions:")
for example in examples:
    # Create DataFrame maintaining original column order
    input_df = pd.DataFrame([example], columns=X.columns)
    
    # Predict
    try:
        pred = pipe.predict(input_df)
        print(f"{list(example.values())} → Predicted Price: ₹{pred[0]:,.2f}")
    except Exception as e:
        print(f"Prediction failed for {example}: {str(e)}")


import json
# Save model metadata
model_metadata = {
    'features': list(X.columns),
    'target': 'Price',
    'metrics': {
        'r2_score': r2,
        'mae': mae,
        'rmse': rmse
    },
    'cv_scores': {
        'mean': np.mean(cv_scores),
        'std': np.std(cv_scores)
    }
}

# Save to model folder
with open('models/model_metadata.json', 'w') as f:
    json.dump(model_metadata, f, indent=4)


Example Predictions:
['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol'] → Predicted Price: ₹456,670.33
['Hyundai Creta', 'Hyundai', 2020, 5000, 'Diesel'] → Predicted Price: ₹678,732.56
