<a href="https://colab.research.google.com/github/anujott-codes/ford-car-price-prediction/blob/main/Car_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Ford car price dataset from Kaggle. The returned value is stored for later use;
# presumably it is the local directory holding the dataset files (confirm against kagglehub docs).
adhurimquku_ford_car_price_prediction_path = kagglehub.dataset_download('adhurimquku/ford-car-price-prediction')

# Simple progress message so the reader knows the download step finished.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Locate the dataset directory: prefer the path returned by kagglehub in the first cell
# (valid wherever the download actually landed, e.g. Colab or a local run) and fall back
# to Kaggle's fixed input mount when that cell has been removed.
try:
    data_dir = adhurimquku_ford_car_price_prediction_path
except NameError:
    data_dir = "/kaggle/input/ford-car-price-prediction"

df = pd.read_csv(f"{data_dir}/ford.csv")
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Basic info about the data: column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) used to spot implausible values
df.describe()

# *Problems:*
* `year` contains a future value (2060)
* `mpg` has an unrealistically high maximum (201.8)
* `engineSize` has a minimum of 0, which is not a valid engine size

In [None]:
# Count missing values per column
df.isna().sum()

# EDA

In [None]:
# Show records whose registration year lies in the future
df.loc[df['year'].gt(2025)]

In [None]:
# Only a single record has a future year, so it is dropped rather than imputed.
# .copy() makes the result an independent frame: the later in-place fixes
# (df.loc[...] = value, df['col'] = ...) would otherwise write to a slice of the
# original frame and raise SettingWithCopyWarning (hidden by the global warnings filter).
df = df[df['year'] <= 2025].copy()

In [None]:
# Show records with an unrealistically high mpg
df.loc[df['mpg'].gt(100)]

In [None]:
# Replacement value: realistic MPG for the Ford Kuga 2020 Hybrid 2.5L
# Source: https://www.ford.co.uk/cars/kuga/specs
unrealistic_mpg = df['mpg'] > 100
df.loc[unrealistic_mpg, 'mpg'] = 49.7

In [None]:
# Re-check: this should now be empty if the imputation above worked
df.loc[df['mpg'].gt(100)]

In [None]:
# Show records whose engineSize is recorded as 0
df.loc[df['engineSize'].eq(0)]

In [None]:
# Too many zero-engineSize records to drop, so they are imputed later.
# First mark every 0 as missing (NaN) so the imputation step can find them.
zero_engine = df['engineSize'] == 0
df['engineSize'] = df['engineSize'].mask(zero_engine)
df.isnull().sum()

In [None]:
# Median engineSize for each (model, fuelType) combination, as a flat lookup table
engine_by_group = df.groupby(['model', 'fuelType'])['engineSize']
median_engineSizes = engine_by_group.median().reset_index()
median_engineSizes

In [None]:
# Give the lookup table's value column a distinct name. The rename is not done in place,
# so re-running this cell is safe.
median_engineSizes = median_engineSizes.rename(columns={'engineSize': 'median_engineSize'})

# Median engineSize of each row's (model, fuelType) group, aligned row-by-row with df.
# transform() replaces the previous merge + helper-column + drop round trip.
group_median = df.groupby(['model', 'fuelType'])['engineSize'].transform('median')

# Fill only the missing values. Groups with no known engineSize stay NaN and are handled later.
# reset_index keeps the clean 0..n-1 index that the previous merge-based version produced.
df = df.assign(engineSize=df['engineSize'].fillna(group_median)).reset_index(drop=True)

df.head()

In [None]:
# Check for any remaining null values or inconsistencies in engineSize
df.isna().sum()

In [None]:
# Rows whose engineSize is still missing after the group-median imputation
df.loc[df['engineSize'].isna()]

In [None]:
# Remaining gap is the 2019 Ford Puma Hybrid: fill it with a realistic engine size
known_engine = df['engineSize'].notna()
df['engineSize'] = df['engineSize'].where(known_engine, 1.0)

In [None]:
# Final check: no row should have a missing engineSize any more
df.loc[df['engineSize'].isna()]

In [None]:
# Final check: missing-value count per column across the whole frame
df.isna().sum()

In [None]:
def distplot(col, num, data=None, rows=3, cols=2):
    """Draw a histogram with a KDE overlay for one column inside a subplot grid.

    Parameters
    ----------
    col : str
        Name of the column plotted on the x-axis; also used as the subplot title.
    num : int
        1-based position of the subplot within the grid.
    data : DataFrame, optional
        Frame to plot from. Defaults to the notebook-level ``df``.
    rows, cols : int, optional
        Shape of the subplot grid. Defaults to the original 3 x 2 layout.
    """
    if data is None:
        # Looked up at call time so the most recent cleaned frame is used.
        data = df
    plt.subplot(rows, cols, num)
    sns.histplot(data=data, x=col, kde=True)
    plt.title(f'{col}')
    # Re-applied on every call so titles and axes do not overlap as panels are added.
    plt.tight_layout()

In [None]:
# Distribution of the target variable
plt.figure(figsize=(14, 14))
distplot(col='price', num=1)
plt.show()

The `price` distribution is right-skewed.

In [None]:
# Heatmap of pairwise correlations between the numeric columns (including price)
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Numeric predictor columns: every numeric column except the target
numeric_columns = df.select_dtypes(include='number').columns
numeric_features = list(numeric_columns)
numeric_features.remove('price')
numeric_features

In [None]:
# One histogram per numeric feature, laid out on a shared figure
plt.figure(figsize=(14, 14))
for i, col in enumerate(numeric_features, start=1):
    distplot(col, i)

plt.show()

In [None]:
#function for plotting countplot
def countplot(col, num, data=None, rows=3, cols=2):
    """Draw a bar chart of category counts for one column inside a subplot grid.

    Parameters
    ----------
    col : str
        Name of the categorical column to count; also used as the subplot title.
    num : int
        1-based position of the subplot within the grid.
    data : DataFrame, optional
        Frame to plot from. Defaults to the notebook-level ``df``.
    rows, cols : int, optional
        Shape of the subplot grid. Defaults to the original 3 x 2 layout.
    """
    if data is None:
        # Looked up at call time so the most recent cleaned frame is used.
        data = df
    plt.subplot(rows, cols, num)
    sns.countplot(data=data, x=col)
    plt.title(f'{col}')
    # Category labels can be long (e.g. model names), so rotate them to avoid overlap.
    plt.xticks(rotation=90)
    plt.tight_layout()

In [None]:
# Categorical columns of the dataset
cat_features = ['model', 'transmission', 'fuelType']

# One count plot per categorical feature, laid out on a shared figure
plt.figure(figsize=(14, 14))
for i, col in enumerate(cat_features, start=1):
    countplot(col, i)

plt.show()

In [None]:
# Share of records per transmission type, to check for class imbalance
transmission_share = df['transmission'].value_counts(normalize=True)
transmission_share


In [None]:
# Compare the price distribution across transmission types
plt.figure(figsize=(8, 5))
ax = sns.boxplot(x='transmission', y='price', data=df)
ax.set_title('Price Distribution by Transmission Type')
plt.show()

In [None]:
# One-way ANOVA to check whether transmission type is significant for price
# Null hypothesis (H₀): all transmission types have the same average car price
# Alternative hypothesis (H₁): at least one transmission type has a different average car price

from scipy.stats import f_oneway

# Price samples, one per transmission type
manual_prices = df.loc[df['transmission'] == 'Manual', 'price']
auto_prices = df.loc[df['transmission'] == 'Automatic', 'price']
semi_auto_prices = df.loc[df['transmission'] == 'Semi-Auto', 'price']

f_stat, p_val = f_oneway(manual_prices, auto_prices, semi_auto_prices)
# ".4g" keeps very small p-values readable (e.g. 1.2e-45); ".4f" would print them as 0.0000.
print(f"ANOVA F-statistic: {f_stat:.4f}, p-value: {p_val:.4g}")

Transmission type does affect car price significantly

In [None]:
# Price distribution for each registration year
sns.boxplot(x='year', y='price', data=df)
plt.xticks(rotation=90)  # year labels are dense, so rotate them for readability
plt.show()

Newer cars tend to sell for higher prices.

In [None]:
# Scatter of price against mileage
sns.scatterplot(x='mileage', y='price', data=df)
plt.show()

Price and mileage show a loose negative relationship: price tends to fall as mileage increases.

In [None]:
# Price distribution for each engine size
sns.boxplot(x='engineSize', y='price', data=df)
plt.xticks(rotation=90)  # rotate the engine-size labels for readability
plt.show()

In [None]:
# Price distribution for each fuel type
sns.boxplot(x='fuelType', y='price', data=df)
plt.show()

In [None]:
#check relation between mpg and price
