# Car Price Predictions

In [93]:
# Import libraries 
import pandas as pd
import numpy as np 
import seaborn as sns

# simple imputer
from sklearn.impute import SimpleImputer

#label encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [79]:
# Read the training, test, and sample-submission CSV files into DataFrames.
tr, ts, sb = map(
    pd.read_csv,
    ("Data/train.csv", "Data/test.csv", "Data/sample_submission.csv"),
)



In [80]:
# Preview the first rows of the raw training frame.
tr.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [81]:
class DataPreprocessor:
    """Read-only sanity checks (shape, duplicates, nulls) for a DataFrame."""

    def __init__(self, data: pd.DataFrame):
        """Keep a reference to the frame that the checks will inspect."""
        self.data = data

    def check_shape(self):
        """Return the frame's ``(rows, columns)`` tuple."""
        return self.data.shape

    def check_duplicates(self):
        """Return a message stating how many rows are exact duplicates."""
        n_duplicated = self.data.duplicated().sum()
        return f"Number of duplicate rows: {n_duplicated}"

    def check_nulls(self):
        """Return null counts for columns that have any, else a message.

        Returns:
            A Series of per-column null counts restricted to columns with at
            least one null, or the string "No null values found." when every
            column is complete.
        """
        per_column = self.data.isna().sum()
        with_missing = per_column[per_column > 0]
        if with_missing.empty:
            return "No null values found."
        return with_missing

    

# Example run of the checks against the training data.

# Read the training file and wrap it in a DataPreprocessor.
tr = pd.read_csv('Data/train.csv')
preprocessor = DataPreprocessor(tr)

# Report the frame's dimensions.
print(f"Shape of the dataset: {preprocessor.check_shape()}")

# Report duplicate rows, then per-column null counts, each followed by a rule.
print(preprocessor.check_duplicates())
print("_______________________________")
print(f"Null values is: \n{preprocessor.check_nulls()}")
print("_______________________________")


Shape of the dataset: (188533, 13)
Number of duplicate rows: 0
_______________________________
Null values is: 
fuel_type       5083
accident        2452
clean_title    21419
dtype: int64
_______________________________


In [88]:
# Drop the `id` column from the training data.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second in-place drop raises KeyError once `id` is gone.
tr = tr.drop(columns=['id'], errors='ignore')

In [82]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output below still lists `id`, so it appears to have
# been produced before the drop cell above was run — re-run to refresh.
tr.describe()

Unnamed: 0,id,model_year,milage,price
count,188533.0,188533.0,188533.0,188533.0
mean,94266.0,2015.829998,65705.295174,43878.02
std,54424.933488,5.660967,49798.158076,78819.52
min,0.0,1974.0,100.0,2000.0
25%,47133.0,2013.0,24115.0,17000.0
50%,94266.0,2017.0,57785.0,30825.0
75%,141399.0,2020.0,95400.0,49900.0
max,188532.0,2024.0,405000.0,2954083.0


In [98]:
pip install optuna


Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading sqlalchemy-2.0.33.tar.gz (9.6 MB)
     ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.6 MB 1.3 MB/s eta 0:00:08
     ---------------------------------------- 0.0/9.6 MB 1.3 MB/s eta 0:00:08
     ---------------------------------------- 0.1/9.6 MB 656.4 kB/s eta 0:00:15
     ---------------------------------------- 0.1/9.6 MB 656.4 kB/s eta 0:00:15
      --------------------------------------- 0.2/9.6 MB 614.4 kB/s eta 0:00:16
      --------------------------------------- 0.2/9.6 MB 724.0 kB/s eta 0:00:13
     - ----



In [83]:
# Inspect column dtypes, non-null counts, and memory usage.
tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB




### Dataset Overview

The dataset contains **188,533 records** with **13 columns** related to used cars: an `id`, 11 descriptive features, and the target `price`. Below is a detailed description of each column:

1. **id**: A unique identifier for each car entry. This column contains 188,533 non-null integer values.

2. **brand**: The brand or manufacturer of the car. This column is categorical and contains 188,533 non-null string values.

3. **model**: The specific model of the car. This is also a categorical column with 188,533 non-null string values.

4. **model_year**: The manufacturing year of the car. This is a numerical column with 188,533 non-null integer values.

5. **milage**: The total distance the car has traveled, measured in miles. This column is numerical with 188,533 non-null integer values.

6. **fuel_type**: The type of fuel the car uses (e.g., Gasoline, E85 Flex Fuel). This is a categorical column but has **183,450 non-null** string values, meaning **5,083 values are missing**.

7. **engine**: Information about the car's engine, typically including engine size or type. This column contains 188,533 non-null string values.

8. **transmission**: The type of transmission the car has (e.g., manual, automatic). This column contains 188,533 non-null string values.

9. **ext_col**: The exterior color of the car. This is a categorical column with 188,533 non-null string values.

10. **int_col**: The interior color of the car. This column is categorical with 188,533 non-null string values.

11. **accident**: Indicates whether the car has been in an accident. This is a categorical column with **186,081 non-null** string values, meaning **2,452 values are missing**.

12. **clean_title**: Indicates whether the car has a clean title (no major accidents, salvage, etc.). This column has **167,114 non-null** string values, meaning **21,419 values are missing**.

13. **price**: The target variable representing the price of the car. This is a numerical column with 188,533 non-null integer values.

### Missing Data Summary

- **fuel_type**: 5,083 missing values.
- **accident**: 2,452 missing values.
- **clean_title**: 21,419 missing values.



In [73]:
# Collect the non-numeric columns to treat as categorical features.
# Excluding every numeric dtype (not only int64) keeps float or
# narrower-integer columns from being misclassified as categorical.
categorical = tr.select_dtypes(exclude="number").columns.tolist()
categorical

['brand',
 'model',
 'fuel_type',
 'engine',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'clean_title']

### Impute Missing Values


In [84]:
# Fill missing values in the categorical columns with each column's most
# frequent value.
# NOTE(review): the imputer is fit on the whole frame before the train/test
# split performed later in this notebook — confirm that is intended.
imputer = SimpleImputer(strategy='most_frequent')
tr[categorical] = imputer.fit_transform(tr[categorical])

# Verify that no nulls remain in those columns.
print(tr[categorical].isna().sum())

brand           0
model           0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
dtype: int64


In [89]:
# Integer-encode every categorical column, keeping the fitted encoder for each
# column in `label_encoders` so the same mapping can be reused.
label_encoders = {column: LabelEncoder().fit(tr[column]) for column in categorical}
for column, le in label_encoders.items():
    tr[column] = le.transform(tr[column])

In [90]:
# Preview the frame again now that the categorical columns are integer-encoded.
tr.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,31,495,2007,213000,2,116,38,312,71,1,0,4200
1,28,930,2002,143250,2,366,38,263,10,0,0,4999
2,9,1575,2002,136731,1,640,38,38,71,1,0,13900
3,16,758,2017,19500,2,863,49,29,14,1,0,45000
4,36,1077,2021,7388,2,259,23,29,10,1,0,97500


In [94]:

# Separate the target (`price`) from the feature columns.
y = tr['price']
X = tr.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [95]:
# Check the training split's dimensions (rows, feature columns).
X_train.shape

(150826, 11)