In [38]:
import pandas as pd 

In [39]:
# Load carvana.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("carvana.csv")

In [40]:
# Preview the first rows to check which columns were parsed.
df.head()

Unnamed: 0,Name,Year,Miles,Price
0,Chevrolet Trax,2018,41946,16990
1,GMC Terrain,2020,45328,23990
2,Jeep Wrangler,2012,81068,21590
3,Jeep Renegade,2019,35372,21590
4,BMW X,20173,68992,22990


In [41]:
# Select every row whose Year is greater than 2025 so those values can be inspected.
df_filtered = df.loc[df["Year"].gt(2025)]

In [42]:
# Show the rows selected by the Year > 2025 filter.
df_filtered

Unnamed: 0,Name,Year,Miles,Price
4,BMW X,20173,68992,22990
31,Mitsubishi Mirage G,20194,24065,15990
34,MAZDA CX-,20165,66644,18590
39,MAZDA MAZDA,20133,100316,12990
46,FIAT,2013500,44087,15990
...,...,...,...,...
21952,BMW X,20151,35706,22990
21963,MAZDA MX- Miata,20165,42003,22590
21970,MAZDA MAZDA,20123,89624,13990
21975,BMW X,20151,35706,22990


In [43]:
# Smallest value in the Year column (lower end of the range check).
df["Year"].min() 

2009

In [44]:
# Largest value in the Year column (upper end of the range check).
df["Year"].max() 

20223500

In [45]:
# Keep only the first four characters of each Year value and convert back to int
# (e.g. 20173 -> 2017); values that are already four digits long are unchanged.
year_text = df["Year"].astype(str)
df["Year"] = year_text.str.slice(0, 4).astype(int)

In [46]:
# Preview the frame again after the Year column was truncated to four digits.
df.head() 

Unnamed: 0,Name,Year,Miles,Price
0,Chevrolet Trax,2018,41946,16990
1,GMC Terrain,2020,45328,23990
2,Jeep Wrangler,2012,81068,21590
3,Jeep Renegade,2019,35372,21590
4,BMW X,2017,68992,22990


In [47]:
# Re-check the largest Year now that the column has been truncated to four digits.
df["Year"].max() 

2023

In [48]:
# Smallest value in the Miles column.
df["Miles"].min() 

53

In [49]:
# Largest value in the Miles column.
df["Miles"].max() 

120167

In [50]:
# standardize Miles column 
from sklearn.preprocessing import StandardScaler 

In [51]:
# Create the StandardScaler instance; it is fitted on the Miles column in a later cell.
scaler = StandardScaler() 

In [52]:
# Fit the scaler on the Miles column and overwrite that column with the scaled values.
# NOTE: the raw mileage is replaced in place, so running this cell a second time would
# refit the scaler on values that have already been standardized.
miles_scaled = scaler.fit_transform(df[["Miles"]])
df["Miles"] = miles_scaled[:, 0]

In [53]:
# Preview the frame after the Miles column was replaced with its scaled values.
df.head() 

Unnamed: 0,Name,Year,Miles,Price
0,Chevrolet Trax,2018,-0.486647,16990
1,GMC Terrain,2020,-0.354977,23990
2,Jeep Wrangler,2012,1.036478,21590
3,Jeep Renegade,2019,-0.742591,21590
4,BMW X,2017,0.566327,22990


In [54]:
# Read back the statistics the scaler learned; index 0 is the only fitted column.
mean, std_dev = scaler.mean_[0], scaler.scale_[0]

In [55]:
# Print each fitted statistic on its own line, label first.
for label, value in (("Mean:", mean), ("Standard Deviation:", std_dev)):
    print(label, value)

Mean: 54445.69731818182
Standard Deviation: 25685.350148367597


In [56]:
from sklearn.preprocessing import LabelEncoder 

In [57]:
# Create the LabelEncoder instance; it is fitted on the Name column in a later cell.
le = LabelEncoder() 

In [58]:
# Learn the distinct car names, then store each row's integer code in a new column.
le.fit(df["Name"])
df["Name_encoded"] = le.transform(df["Name"])

In [59]:
# Preview the frame with the newly added Name_encoded column.
df.head() 

Unnamed: 0,Name,Year,Miles,Price,Name_encoded
0,Chevrolet Trax,2018,-0.486647,16990,71
1,GMC Terrain,2020,-0.354977,23990,127
2,Jeep Wrangler,2012,1.036478,21590,195
3,Jeep Renegade,2019,-0.742591,21590,193
4,BMW X,2017,0.566327,22990,20


In [62]:
# Model inputs (X) are the three numeric columns; the target (y) is the Price column.
feature_columns = ["Year", "Miles", "Name_encoded"]
X = df[feature_columns]
y = df["Price"]

In [63]:
# Display the feature frame.
X

Unnamed: 0,Year,Miles,Name_encoded
0,2018,-0.486647,71
1,2020,-0.354977,127
2,2012,1.036478,195
3,2019,-0.742591,193
4,2017,0.566327,20
...,...,...,...
21995,2014,1.698996,132
21996,2016,0.318092,80
21997,2013,-0.159651,116
21998,2020,-1.031744,359


In [64]:
# Display the target series.
y

0        16990
1        23990
2        21590
3        21590
4        22990
         ...  
21995    17590
21996    23990
21997    16990
21998    23990
21999    14990
Name: Price, Length: 22000, dtype: int64

In [65]:
from sklearn.ensemble import RandomForestRegressor 

In [66]:
from sklearn.model_selection import train_test_split 

In [67]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [68]:
# Number of rows in the training split.
len(X_train)

17600

In [69]:
# Number of rows in the test split.
len(X_test) 

4400

In [70]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [71]:
# predict and evaluate 
from sklearn.metrics import mean_absolute_error, r2_score 

In [73]:
# Predict prices for the held-out rows and score them against the true prices.
y_pred = model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

# MAE is printed with a dollar sign and two decimals, R² with three decimals.
print(f"Mean Absolute Error: ${mae:.2f}")
print(f"R² Score: {r2:.3f}")

Mean Absolute Error: $913.11
R² Score: 0.858


In [74]:
import coremltools as ct 

In [75]:
# Convert the fitted forest to Core ML; the input names are listed in the same order
# as the columns of X, and the single output is named "Price".
coreml_model = ct.converters.sklearn.convert(
    model,
    input_features=["Year", "Miles", "Name_encoded"],
    output_feature_names="Price",
)

In [76]:
# Attach descriptive metadata to the converted model, one attribute per entry.
model_metadata = {
    "author": "Mohammad Azam",
    "license": "MIT",
    "short_description": "Predicts used car prices based on year, mileage, and car name.",
    "version": "1.0",
}
for field, text in model_metadata.items():
    setattr(coreml_model, field, text)

In [77]:
# Write the converted model to Carvana.mlmodel (path is relative to the working directory).
coreml_model.save("Carvana.mlmodel")

In [78]:
# Pair every class label known to the encoder with the integer code it is assigned.
name_codes = le.transform(le.classes_)
car_name_mapping = dict(zip(le.classes_, name_codes))

In [79]:
# Display the name -> code mapping as produced by the encoder.
car_name_mapping

{' Acura ILX': 0,
 ' Acura MDX': 1,
 ' Acura MDX Sport Hybrid': 2,
 ' Acura RDX': 3,
 ' Acura RLX': 4,
 ' Acura TL': 5,
 ' Acura TLX': 6,
 ' Acura TSX': 7,
 ' Alfa Romeo C Spider': 8,
 ' Alfa Romeo Giulia': 9,
 ' Alfa Romeo Stelvio': 10,
 ' Audi A': 11,
 ' Audi A Sportback e-tron': 12,
 ' Audi Q': 13,
 ' Audi S': 14,
 ' Audi TT': 15,
 ' Audi allroad': 16,
 ' Audi e-tron': 17,
 ' BMW  Series': 18,
 ' BMW M': 19,
 ' BMW X': 20,
 ' BMW Z': 21,
 ' BMW i': 22,
 ' Buick Cascada': 23,
 ' Buick Enclave': 24,
 ' Buick Encore': 25,
 ' Buick Encore GX': 26,
 ' Buick Envision': 27,
 ' Buick LaCrosse': 28,
 ' Buick Lucerne': 29,
 ' Buick Regal': 30,
 ' Buick Regal Sportback': 31,
 ' Buick Verano': 32,
 ' Cadillac ATS': 33,
 ' Cadillac CT': 34,
 ' Cadillac CTS': 35,
 ' Cadillac DTS': 36,
 ' Cadillac ELR': 37,
 ' Cadillac Escalade ESV': 38,
 ' Cadillac SRX': 39,
 ' Cadillac XT': 40,
 ' Cadillac XTS': 41,
 ' Chevrolet Blazer': 42,
 ' Chevrolet Bolt EV': 43,
 ' Chevrolet Camaro': 44,
 ' Chevrolet Capti

In [82]:
# Remove the leading whitespace from every car name (lstrip leaves trailing whitespace as is).
# If two names become equal after stripping, the later entry's code replaces the earlier one.
stripped_mapping = {}
for raw_name, code in car_name_mapping.items():
    stripped_mapping[raw_name.lstrip()] = code

In [83]:
# Display the mapping after leading whitespace was removed from the names.
stripped_mapping

{'Acura ILX': 0,
 'Acura MDX': 1,
 'Acura MDX Sport Hybrid': 2,
 'Acura RDX': 3,
 'Acura RLX': 4,
 'Acura TL': 5,
 'Acura TLX': 6,
 'Acura TSX': 7,
 'Alfa Romeo C Spider': 8,
 'Alfa Romeo Giulia': 9,
 'Alfa Romeo Stelvio': 10,
 'Audi A': 11,
 'Audi A Sportback e-tron': 12,
 'Audi Q': 13,
 'Audi S': 14,
 'Audi TT': 15,
 'Audi allroad': 16,
 'Audi e-tron': 17,
 'BMW  Series': 18,
 'BMW M': 19,
 'BMW X': 20,
 'BMW Z': 21,
 'BMW i': 22,
 'Buick Cascada': 23,
 'Buick Enclave': 24,
 'Buick Encore': 25,
 'Buick Encore GX': 26,
 'Buick Envision': 27,
 'Buick LaCrosse': 28,
 'Buick Lucerne': 29,
 'Buick Regal': 30,
 'Buick Regal Sportback': 31,
 'Buick Verano': 32,
 'Cadillac ATS': 33,
 'Cadillac CT': 34,
 'Cadillac CTS': 35,
 'Cadillac DTS': 36,
 'Cadillac ELR': 37,
 'Cadillac Escalade ESV': 38,
 'Cadillac SRX': 39,
 'Cadillac XT': 40,
 'Cadillac XTS': 41,
 'Chevrolet Blazer': 42,
 'Chevrolet Bolt EV': 43,
 'Chevrolet Camaro': 44,
 'Chevrolet Captiva Sport': 45,
 'Chevrolet City Express': 46,


In [87]:
# Coerce every key to a built-in str and every code to a built-in int.
clean_mapping = dict(
    (str(name), int(code)) for name, code in stripped_mapping.items()
)

In [88]:
# Serialize the cleaned mapping with 4-space indentation and write it to disk.
import json

with open("car_name_mapping.json", "w") as file_object:
    file_object.write(json.dumps(clean_mapping, indent=4))