In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Location of the Toyota Corolla CSV, given relative to the working directory.
toyota_file_path = r"ToyotaCorolla.csv"

# Read the raw listings into a DataFrame.
cars = pd.read_csv(toyota_file_path)

# Echo the column index so the labels used below can be checked against the file.
print("Column names in dataset:", cars.columns)

# Predictor columns. The order is preserved because it determines the column
# order of X (and therefore the order of the fitted coefficients).
features = [
    "Age_08_04",
    "KM",
    "Fuel_Type",
    "HP",
    "Automatic",
    "Doors",
    "Quarterly_Tax",
    "Mfr_Guarantee",
    "Guarantee_Period",
    "Airco",
    "Automatic_airco",
    "CD_Player",
    "Powered_Windows",
    "Sport_Model",
    "Tow_Bar",
]

# Target variable.
y = cars["Price"]

# Predictors: pick the feature columns, then one-hot encode 'Fuel_Type'.
# drop_first=True leaves out one indicator column so the dummies are not
# perfectly collinear with the intercept.
X = cars[features].pipe(pd.get_dummies, columns=["Fuel_Type"], drop_first=True)

# Two-stage split with a fixed seed:
#   stage 1 keeps 50% for training and holds out the other 50%;
#   stage 2 sends 40% of the held-out half to test (20% overall) and the
#   remaining 60% to validation (30% overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.5, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.4, random_state=42
)

# Fit an ordinary least-squares model on the training portion.
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = LinearRegression().fit(X_train, y_train)

# Score the validation portion only; the test portion is not touched here.
y_pred = model.predict(X_val)

# Validation metrics: MAE, RMSE (square root of the MSE) and R^2.
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Report the metrics, one per line, rounded to two decimals.
print(
    "\nModel Performance Metrics:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)

Column names in dataset: Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'cc', 'Doors', 'Cylinders',
       'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee',
       'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2', 'Airco',
       'Automatic_airco', 'Boardcomputer', 'CD_Player', 'Central_Lock',
       'Powered_Windows', 'Power_Steering', 'Radio', 'Mistlamps',
       'Sport_Model', 'Backseat_Divider', 'Metallic_Rim', 'Radio_cassette',
       'Tow_Bar'],
      dtype='object')

Model Performance Metrics:
Mean Absolute Error (MAE): 911.88
Root Mean Squared Error (RMSE): 1271.21
R-squared Score: 0.89
