This is a simple XGBoost regression model that predicts the prices of used cars using the used_cars dataset.

# Loading the libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder #this is not necessary as xgboost does not need it dataset scaled
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import  train_test_split
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

# importing the dataset

In [6]:
# Load the used-cars dataset. pd.read_csv already returns a DataFrame, so no
# additional pd.DataFrame(...) wrapping is required.
data = pd.read_csv("/used_cars_data.csv")  # specify the path to your dataset

# Specify the features and the target variable

In [7]:
# Separate the target column ("price") from the predictor columns.
y = data["price"]
X = data.drop("price", axis=1)

# Splitting into training and test sets

In [8]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Encoding the categorical features

In [9]:
# One-hot encode the columns at the listed positions; every other column is
# passed through unchanged. Categories that were not seen while fitting are
# ignored rather than raising an error.
categorical_columns = [0, 1, 4, 5, 6, 7, 8, 9, 10]
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ct = ColumnTransformer(
    transformers=[("encoder", one_hot, categorical_columns)],
    remainder="passthrough",
)

# Fit the transformer on the training split only, then reuse it on the test split.
x_train = np.array(ct.fit_transform(X_train))
x_test = np.array(ct.transform(X_test))

# Standardizing the dataset

In [10]:
# Standardize features, if needed.
# The scaling statistics are learned from the training split only and then
# applied to both the training and the test split.
scaler = StandardScaler()
scaler.fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Training the XGBoost Regression Model

In [11]:
# Hyperparameters for the regressor; vary them to improve performance.
xgb_params = {
    "n_estimators": 5,
    "learning_rate": 0.1,
    "max_depth": 7,
    "random_state": 42,
}
boost = XGBRegressor(**xgb_params)
boost.fit(X_train_scaled, y_train)

In [12]:
# Predict on the held-out test set.
y_pred = boost.predict(X_test_scaled)

# Mean squared error between actual and predicted prices, and its square root (RMSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 19288662799.46564
Root Mean Squared Error: 138883.6304229755


In [13]:
# Persist both metrics to a text file, one metric per line.
metric_lines = [
    f"Mean Squared Error: {mse}\n",
    f"Root Mean Squared Error: {rmse}\n",
]
with open("metrics.txt", "w") as file:
    file.writelines(metric_lines)

In [14]:
# Save the predictions, if needed

In [15]:
# Pair each actual price with its prediction, rounding the predictions to
# 2 decimal places.
Prediction_File = pd.DataFrame(
    {'real values': y_test, 'predicted price': y_pred}
).round({'predicted price': 2})

# Write the comparison table to a CSV file without the index column.
Prediction_File.to_csv('Prediction.csv', index=False)