<a href="https://www.kaggle.com/code/ahmed3okka/pg-s4-e9-automl-submission?scriptVersionId=216348880" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Regression of Used Car Prices - Kaggle Playground Series 2024

## Overview

This notebook is part of the 2024 Kaggle Playground Series. The goal of the competition is to predict the price of used cars based on several features extracted from cars.com. The dataset contains 4,009 vehicle listings, with features like brand, model, mileage, fuel type, and accident history, among others. This project walks through a comprehensive approach using different machine learning techniques to predict car prices.


## 1. Import Libraries

In this section, all the essential Python libraries required for data analysis, visualization, and machine learning are imported.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")  # Choose the style you prefer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## 2. Data Preparation and Cleaning

Here, the dataset is loaded and prepared for analysis. This involves cleaning, handling missing data, and performing any necessary transformations.

In [None]:
# Load the training set, the test set and the sample submission in one pass.
df, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e9/train.csv',
        '/kaggle/input/playground-series-s4e9/test.csv',
        '/kaggle/input/playground-series-s4e9/sample_submission.csv',
    )
)
df.head()

In [None]:
# Set the target column and the test ids aside, then remove the `id` column
# from both frames so it is not used as a feature.
target = df['price']
IDs = df_test['id'].copy()
df = df.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

In [None]:
def columns_info(data):
    """Build a per-column summary of ``data``.

    The result has one row per column of ``data`` with:

    * ``cols``       - column name
    * ``dtypes``     - column dtype
    * ``unique_v``   - array of the column's unique values
    * ``n_unique_v`` - number of distinct values as reported by ``nunique()``
    * ``sum_null``   - number of null entries
    * ``%_null``     - null entries divided by the row count, rounded to two
      decimals (a fraction, not a 0-100 percentage)
    """
    n_rows = data.shape[0]
    summary = {
        'cols': [],
        'dtypes': [],
        'unique_v': [],
        'n_unique_v': [],
        'sum_null': [],
        '%_null': [],
    }

    for name in data.columns:
        series = data[name]
        n_missing = series.isnull().sum()

        summary['cols'].append(name)
        summary['dtypes'].append(series.dtype)
        summary['unique_v'].append(series.unique())
        summary['n_unique_v'].append(series.nunique())
        summary['sum_null'].append(n_missing)
        summary['%_null'].append(round(n_missing / n_rows, 2))

    return pd.DataFrame(summary)

In [None]:
# Per-column summary (dtype, unique values, null counts) of the training frame.
columns_info(df)

In [None]:
# Per-column summary (dtype, unique values, null counts) of the test frame.
columns_info(df_test)

In [None]:
# Count how often each transmission value occurs in the training frame.
df['transmission'].value_counts()

In [None]:
# Text columns are identified on the training frame; the same columns are
# then normalised in both frames.
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for frame in (df, df_test):
    # Lower-case and replace spaces with underscores so that equal categories
    # compare equal. `regex=False` makes the literal match explicit, because
    # the default of `str.replace` differs between pandas versions.
    for col in string_columns:
        frame[col] = frame[col].str.lower().str.replace(' ', '_', regex=False)

    # Spell out the transmission abbreviations.
    frame['transmission'] = (
        frame['transmission']
        .str.replace('a/t', 'automatic', regex=False)
        .str.replace('m/t', 'manual', regex=False)
    )

# Fill values are learned from the training frame only and applied to both
# frames, so train and test are imputed identically and no statistic is
# derived from the test set.
fill_values = {
    'clean_title': 'No',
    'fuel_type': df['fuel_type'].mode()[0],
    'accident': df['accident'].mode()[0],
}

for frame in (df, df_test):
    for col, value in fill_values.items():
        frame[col] = frame[col].fillna(value)

## 3. Exploratory Data Analysis (EDA)


In [None]:
# Use seaborn's 'darkgrid' theme for the plots below (the same theme is
# already applied right after the imports).
sns.set_theme(style='darkgrid')

In [None]:
# Histogram of the target (30 bins) with a KDE curve on top.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['price'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot of price against the `milage` column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='milage', y='price', ax=ax)
ax.set_title('Mileage vs Price')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Price distribution per brand; x tick labels are rotated to stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='brand', y='price', ax=ax)
ax.set_title('Car Brand vs Price')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Price distribution per fuel type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='fuel_type', y='price', ax=ax)
ax.set_title('Fuel Type vs Price')
ax.set_xlabel('Fuel Type')
ax.set_ylabel('Price')
plt.show()


In [None]:
# Pairwise correlations between the non-object columns, drawn as an
# annotated heatmap.
correlation_matrix = df.select_dtypes(exclude='object').corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Number of listings per brand, ordered by `value_counts()` (most frequent first).
brand_order = df['brand'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='brand', order=brand_order, ax=ax)
ax.set_title('Distribution of Car Brands')
plt.show()

In [None]:
# Number of listings per fuel type.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='fuel_type', ax=ax)
ax.set_title('Distribution of Fuel Type')
plt.show()

## 4. Feature Engineering


In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Cast `model_year` to object dtype in both frames so that it is selected
# together with the other object-dtype columns.
for frame in (df, df_test):
    frame['model_year'] = frame['model_year'].astype(object)

In [None]:
# Names of all object-dtype columns of the training frame.
cat_cols = list(df.select_dtypes('object').columns)

In [None]:
# Learn the category -> integer mapping on the training frame only; categories
# that appear only in the test frame are encoded as -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df[cat_cols])
df[cat_cols] = encoder.transform(df[cat_cols])
df_test[cat_cols] = encoder.transform(df_test[cat_cols])

In [None]:
# Feature matrix: every column of the training frame except the target.
x = df.drop(columns='price')

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    target,
    test_size=0.2,
    random_state=42,
)

## 5. Model Building


In [None]:
# Example of training a Linear Regression model
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
rmse = mean_squared_error(y_test, y_pred,squared=False)
print(f'RootMean Squared Error: {rmse}')

In [None]:
# Predict with the fitted linear model on the competition test frame.
# NOTE(review): assumes df_test has the same feature columns, in the same
# order, as the frame the model was fitted on - confirm.
pred_sub = lr.predict(df_test)

In [None]:
# Read the submission template from the same absolute Kaggle input path used
# when loading the data; the relative '../input' form only resolves when the
# working directory sits next to the input directory.
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
# Overwrite the template's price column with the linear-regression predictions
# and write the submission file without the index.
sample_submission['price'] = pred_sub
sample_submission.to_csv('lr_submission.csv', index=False)

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Report the installed H2O version, then initialise H2O with a 16G memory cap.
print(h2o.__version__)
h2o.init(max_mem_size='16G')

In [None]:
# %%time
# train = h2o.import_file("../input/playground-series-s4e9/train.csv")
# test = h2o.import_file("../input/playground-series-s4e9/test.csv")

In [None]:
# Convert the prepared pandas frames to H2OFrames.
train = h2o.H2OFrame(df)
test = h2o.H2OFrame(df_test)

# Target name and predictor names (every training column except the target).
y = 'price'
x = [name for name in train.columns if name != y]

In [None]:
# Run H2O AutoML with a 3500-second time budget and a fixed seed under the
# "cars_price" project.
aml = H2OAutoML(
    max_runtime_secs=3500,
    seed=42,
    project_name="cars_price",
)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Fetch the AutoML leaderboard and show its first rows.
lb = aml.leaderboard
lb.head()

In [None]:
# `aml.leader` holds the leader model of the AutoML run; as the last
# expression of the cell it is displayed in the notebook output.
aml.leader

In [None]:
# Predict on the test H2OFrame through the AutoML object and preview the
# first rows of the prediction frame.
pred = aml.predict(test)
pred.head()

In [None]:
# Save the leader model under the given path.
# NOTE(review): the name "product_backorders_model_bin" does not match this
# project (used car prices); it looks copied from an unrelated example -
# consider renaming.
h2o.save_model(aml.leader, path = "./product_backorders_model_bin")

In [None]:
# Reload the submission template from the same absolute Kaggle input path used
# when loading the data (the relative '../input' form depends on the working
# directory), and check its shape.
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
sample_submission.shape

In [None]:
# Bring the H2O predictions back to pandas, flatten them to one value per row
# and write the AutoML submission file without the index.
predictions = pred.as_data_frame()
sample_submission['price'] = predictions.values.ravel()
sample_submission.to_csv('h2o_automl_submission.csv', index=False)

In [None]:
# `path` is the directory the model binary is written into, not a file name.
# The previous value "submission1.csv" therefore created a model directory
# that looked like a CSV submission. Use a directory named after the AutoML
# project instead.
h2o.save_model(aml.leader, path = "./cars_price_model_bin")

In [None]:
# Preview the first rows of the submission frame.
sample_submission.head()