## Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Loading and exploring the dataset

In [None]:
# Location of the Ford used-car CSV -- edit this one constant to point at your copy.
# Kaggle dataset credit > https://www.kaggle.com/datasets/adityadesai13/used-car-dataset-ford-and-mercedes?select=ford.csv
DATA_PATH = "/content/drive/MyDrive/DL projects/Aeman's project/ford.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [None]:
# Summarize column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17965 non-null  object 
 1   year          17965 non-null  int64  
 2   price         17965 non-null  int64  
 3   transmission  17965 non-null  object 
 4   mileage       17965 non-null  int64  
 5   fuelType      17965 non-null  object 
 6   tax           17965 non-null  int64  
 7   mpg           17965 non-null  float64
 8   engineSize    17965 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


## Preprocessing

In [None]:
# Convert fuel economy from miles per gallon to litres per 100 km and rename the
# column to match. The 'mpg' guard keeps this cell idempotent: re-running it after
# the rename is a no-op instead of raising KeyError.
# NOTE(review): 235 is the US-gallon conversion factor (235.215). If this dataset
# reports imperial (UK) mpg the factor should be ~282.48 -- confirm the unit.
if 'mpg' in df.columns:
    # assign() overwrites 'mpg' at its existing position and returns a new frame,
    # so the column order is unchanged and no inplace mutation is needed.
    df = df.assign(mpg=235 / df['mpg']).rename(columns={'mpg': 'L/100km'})

In [None]:
# Strip surrounding whitespace from the model names (the isin() filter used later
# matches on exact strings), then list every distinct model in the dataset.
df['model'] = df['model'].str.strip()
print(df['model'].unique())

['Fiesta' 'Focus' 'Puma' 'Kuga' 'EcoSport' 'C-MAX' 'Mondeo' 'Ka+'
 'Tourneo Custom' 'S-MAX' 'B-MAX' 'Edge' 'Tourneo Connect' 'Grand C-MAX'
 'KA' 'Galaxy' 'Mustang' 'Grand Tourneo Connect' 'Fusion' 'Ranger'
 'Streetka' 'Escort' 'Transit Tourneo']


In [None]:
# Keep only the Focus and Fiesta rows -- these two models alone make up about 62%
# of the dataset.
is_focus_or_fiesta = df['model'].isin(['Focus', 'Fiesta'])
df = df.loc[is_focus_or_fiesta]
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,L/100km,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,4.07279,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,4.07279,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,4.07279,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,5.831266,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,4.825462,1.0


In [None]:
# One-hot encode the categorical attributes: every category of each listed column
# becomes its own indicator column (drop_first=False keeps all categories).
# The dummies are appended in the order the columns are listed.
df = pd.get_dummies(
    df,
    columns=['model', 'transmission', 'fuelType'],
    drop_first=False,
)
df.head()

Unnamed: 0,year,price,mileage,tax,L/100km,engineSize,model_Fiesta,model_Focus,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Petrol
0,2017,12000,15944,150,4.07279,1.0,True,False,True,False,False,False,True
1,2018,14000,9083,150,4.07279,1.0,False,True,False,True,False,False,True
2,2017,13000,12456,150,4.07279,1.0,False,True,False,True,False,False,True
3,2019,17500,10460,145,5.831266,1.5,True,False,False,True,False,False,True
4,2019,16500,1482,145,4.825462,1.0,True,False,True,False,False,False,True


In [None]:
# Cast the True/False indicator columns produced by one-hot encoding to 0/1 integers.
indicator_columns = [
    'model_Fiesta', 'model_Focus',
    'transmission_Automatic', 'transmission_Manual', 'transmission_Semi-Auto',
    'fuelType_Diesel', 'fuelType_Petrol',
]
df[indicator_columns] = df[indicator_columns].astype(int)

In [None]:
# Confirm that every column is now numeric after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11145 entries, 0 to 17963
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    11145 non-null  int64  
 1   price                   11145 non-null  int64  
 2   mileage                 11145 non-null  int64  
 3   tax                     11145 non-null  int64  
 4   L/100km                 11145 non-null  float64
 5   engineSize              11145 non-null  float64
 6   model_Fiesta            11145 non-null  int64  
 7   model_Focus             11145 non-null  int64  
 8   transmission_Automatic  11145 non-null  int64  
 9   transmission_Manual     11145 non-null  int64  
 10  transmission_Semi-Auto  11145 non-null  int64  
 11  fuelType_Diesel         11145 non-null  int64  
 12  fuelType_Petrol         11145 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 1.2 MB


In [None]:
# Count missing values per column to confirm the frame contains no nulls.
Na_vals = df.isnull().sum()
print("Number of Na values in the dataset: ", Na_vals, sep="\n")

Number of Na values in the dataset: 
year                      0
price                     0
mileage                   0
tax                       0
L/100km                   0
engineSize                0
model_Fiesta              0
model_Focus               0
transmission_Automatic    0
transmission_Manual       0
transmission_Semi-Auto    0
fuelType_Diesel           0
fuelType_Petrol           0
dtype: int64


In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each
# (81 rows are removed here: 11145 -> 11064 entries).
df = df.loc[~df.duplicated(keep='first')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11064 entries, 0 to 17963
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    11064 non-null  int64  
 1   price                   11064 non-null  int64  
 2   mileage                 11064 non-null  int64  
 3   tax                     11064 non-null  int64  
 4   L/100km                 11064 non-null  float64
 5   engineSize              11064 non-null  float64
 6   model_Fiesta            11064 non-null  int64  
 7   model_Focus             11064 non-null  int64  
 8   transmission_Automatic  11064 non-null  int64  
 9   transmission_Manual     11064 non-null  int64  
 10  transmission_Semi-Auto  11064 non-null  int64  
 11  fuelType_Diesel         11064 non-null  int64  
 12  fuelType_Petrol         11064 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 1.2 MB


## Train Test Split

In [None]:
# Separate the regression target (price) from the predictor columns.
Y = df['price']
X = df.drop(columns='price')

In [None]:
# Hold out 10% of the rows as the test set; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

## Normalizing

In [None]:
def normalize(X, mean=None, std=None):
    """Standardize the columns of X (z-score: subtract the mean, divide by the std).

    Parameters
    ----------
    X : array-like or DataFrame
        Data with samples along axis 0.
    mean, std : array-like, optional
        Per-column statistics to scale with. When omitted they are computed
        from X itself, which is the original behaviour. Pass the training
        set's statistics when scaling validation/test data so that every
        split shares one scale.

    Returns
    -------
    The scaled data, in whatever container ``X - mean`` produces. Columns
    whose standard deviation is 0 come back as zeros instead of NaN/inf.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)
    # A constant column has std == 0. Adding the boolean mask turns those
    # divisors into 1 and leaves all others unchanged; using ufuncs (rather
    # than np.where) keeps pandas labels intact so alignment still works.
    is_constant = np.equal(std, 0)
    safe_std = np.add(std, is_constant)
    X_normalized = (X - mean) / safe_std
    return X_normalized

In [None]:
# Scale both splits with statistics learned from the TRAINING split only.
# Normalizing the test set with its own mean/std would put it on a different
# scale from the data the models are fitted on, and would use test-set
# information during preprocessing.
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0)
# The test split must be scaled before x_train is overwritten below.
x_test = (x_test - train_mean) / train_std
x_train = normalize(x_train)

## Training & Testing

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Fit a Random Forest regressor (fixed seed for reproducibility) and predict the test prices.
print("Training Random Forest~")
rf = RandomForestRegressor(random_state=42).fit(x_train, y_train)
rf_predictions = rf.predict(x_test)

Training Random Forest~


In [None]:
# Score the Random Forest predictions against the held-out prices.
rf_mse, rf_mae, rf_r2 = (
    score(y_test, rf_predictions)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "Results for Random Forest:",
    f"Mean Squared Error: {rf_mse:.4f}",
    f"Mean Absolute Error: {rf_mae:.4f}",
    f"R^2 Score: {rf_r2:.4f}",
    sep="\n",
)

Results for Random Forest:
Mean Squared Error: 1129008.1893
Mean Absolute Error: 774.3102
R^2 Score: 0.9250


Support Vector Machine

In [None]:
from sklearn.svm import SVR

In [None]:
# Fit a support vector regressor with default hyper-parameters and predict the test prices.
# NOTE(review): the recorded run scores R^2 of only ~0.17 for this model; the default
# C/epsilon are presumably too small for a target on the scale of car prices --
# consider tuning them or scaling the target.
print("Training SVM~")
svm = SVR().fit(x_train, y_train)
svm_predictions = svm.predict(x_test)

Training SVM~


In [None]:
# Score the SVM predictions against the held-out prices.
svm_mse, svm_mae, svm_r2 = (
    score(y_test, svm_predictions)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "Results for SVM:",
    f"Mean Squared Error: {svm_mse:.4f}",
    f"Mean Absolute Error: {svm_mae:.4f}",
    f"R^2 Score: {svm_r2:.4f}",
    sep="\n",
)

Results for SVM:
Mean Squared Error: 12549104.9135
Mean Absolute Error: 2596.7347
R^2 Score: 0.1662


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit a single decision tree regressor (fixed seed) and predict the test prices.
print("Training Decision Tree~")
dt = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
dt_predictions = dt.predict(x_test)

Training Decision Tree~


In [None]:
# Score the Decision Tree predictions against the held-out prices.
dt_mse, dt_mae, dt_r2 = (
    score(y_test, dt_predictions)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "Results for Decision Tree:",
    f"Mean Squared Error: {dt_mse:.4f}",
    f"Mean Absolute Error: {dt_mae:.4f}",
    f"R^2 Score: {dt_r2:.4f}",
    sep="\n",
)

Results for Decision Tree:
Mean Squared Error: 1903246.0138
Mean Absolute Error: 991.9860
R^2 Score: 0.8735


Bayesian Ridge

In [None]:
from sklearn.linear_model import BayesianRidge

In [None]:
# Fit a Bayesian ridge regression with default settings and predict the test prices.
print("Training Bayesian Ridge~")
br = BayesianRidge().fit(x_train, y_train)
br_predictions = br.predict(x_test)

Training Bayesian Ridge~


In [None]:
# Score the Bayesian Ridge predictions against the held-out prices.
br_mse, br_mae, br_r2 = (
    score(y_test, br_predictions)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "\nResults for Bayesian Ridge:",
    f"Mean Squared Error: {br_mse:.4f}",
    f"Mean Absolute Error: {br_mae:.4f}",
    f"R^2 Score: {br_r2:.4f}",
    sep="\n",
)


Results for Bayesian Ridge:
Mean Squared Error: 2710572.6046
Mean Absolute Error: 1290.5906
R^2 Score: 0.8199


In [None]:
# Collect every model's test-set metrics into one table.
# Each entry maps a model name to its MSE, MAE and R^2 on the test split.
results = {
    "Random Forest": {
        "Mean Squared Error": rf_mse,
        "Mean Absolute Error": rf_mae,
        "R^2 Score": rf_r2,
    },
    "SVM": {
        "Mean Squared Error": svm_mse,
        "Mean Absolute Error": svm_mae,
        "R^2 Score": svm_r2,
    },
    "Decision Tree": {
        "Mean Squared Error": dt_mse,
        # Fixed: this row previously reported svm_mae (the SVM's error) as the
        # Decision Tree's MAE because of a copy-paste slip.
        "Mean Absolute Error": dt_mae,
        "R^2 Score": dt_r2,
    },
    "Bayesian Ridge": {
        "Mean Squared Error": br_mse,
        "Mean Absolute Error": br_mae,
        "R^2 Score": br_r2,
    },
}

# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).T

In [None]:
# Show the side-by-side metric table for all models.
print("Model Comparison:", results_df, sep="\n")

Model Comparison:
                Mean Squared Error  Mean Absolute Error  R^2 Score
Random Forest         1.129008e+06           774.310155   0.924982
SVM                   1.254910e+07          2596.734731   0.166158
Decision Tree         1.903246e+06          2596.734731   0.873536
Bayesian Ridge        2.710573e+06          1290.590580   0.819892


In [None]:
# Select the model with the highest R^2 score in the comparison table.
best_model = results_df["R^2 Score"].idxmax()
print(f"The best model is: {best_model}")

The best model is: Random Forest
