In [1]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../datasets/car_prices.csv")
df = df.dropna(subset=['sellingprice'])

In [3]:
df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')

df['saledate'] = df['saledate'].apply(lambda x: x.timestamp() // 3600 if pd.notnull(x) else None)

mean_time_in_hours = df['saledate'].mean()

df['saledate'].fillna(mean_time_in_hours, inplace=True)


  df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')
  df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['saledate'].fillna(mean_time_in_hours, inplace=True)


In [4]:
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,394084.0
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,394084.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,394796.0
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,395132.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,394132.0


In [5]:
for col in df.columns:
    print(col, len(df[col].value_counts()))

year 34
make 96
model 973
trim 1963
body 87
transmission 4
vin 550285
state 64
condition 41
odometer 172278
color 46
interior 17
seller 14262
mmr 1101
sellingprice 1887
saledate 1256


In [6]:
cat_features=['make', 'body', 'transmission', 'state', 'color', 'interior']
drop_features=['trim', 'model', 'vin', 'seller', 'sellingprice']
numeric_features=['year', 'condition', 'odometer', 'mmr', 'saledate']

In [7]:
for feature in cat_features:
    df[feature].fillna('no', inplace=True)


for feature in numeric_features:
    mean_value = df[feature].mean()
    df[feature].fillna(mean_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna('no', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feature].fillna(mean_value, inplace=True)


In [8]:
print(df.dtypes)

year              int64
make             object
model            object
trim             object
body             object
transmission     object
vin              object
state            object
condition       float64
odometer        float64
color            object
interior         object
seller           object
mmr             float64
sellingprice    float64
saledate        float64
dtype: object


In [9]:
X, y = df.drop(columns=drop_features), df['sellingprice']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
X_train

Unnamed: 0,year,make,body,transmission,state,condition,odometer,color,interior,mmr,saledate
9649,2011,Ford,Sedan,automatic,ny,33.0,218513.0,red,gray,8775.0,394129.0
199175,2012,Dodge,Minivan,automatic,tn,32.0,99199.0,—,black,8000.0,395082.0
439374,2013,Nissan,SUV,automatic,fl,44.0,41509.0,silver,black,16950.0,397843.0
370929,2011,Mazda,SUV,automatic,wa,41.0,48876.0,green,tan,16500.0,395948.0
177927,2006,Acura,SUV,automatic,nv,23.0,115639.0,white,brown,8750.0,394987.0
...,...,...,...,...,...,...,...,...,...,...,...
110268,2014,Nissan,Sedan,automatic,ca,1.0,25056.0,gray,black,14400.0,395084.0
259178,2008,Kia,Sedan,automatic,oh,25.0,218859.0,white,tan,600.0,395420.0
365839,2011,Maserati,Sedan,automatic,ca,27.0,28668.0,black,red,43800.0,395804.0
131932,2014,GMC,Van,automatic,pa,42.0,6049.0,white,gray,23000.0,394817.0


In [12]:
cat = CatBoostRegressor()

cat.fit(X_train, y_train, cat_features=cat_features)

Learning rate set to 0.106294
0:	learn: 8810.6741729	total: 700ms	remaining: 11m 39s
1:	learn: 7975.4095771	total: 1.38s	remaining: 11m 30s
2:	learn: 7235.2967381	total: 1.84s	remaining: 10m 10s
3:	learn: 6571.1394226	total: 2.21s	remaining: 9m 9s
4:	learn: 5982.1417530	total: 2.7s	remaining: 8m 56s
5:	learn: 5457.0266896	total: 3.14s	remaining: 8m 40s
6:	learn: 4992.7303032	total: 3.52s	remaining: 8m 19s
7:	learn: 4583.1010337	total: 3.87s	remaining: 7m 59s
8:	learn: 4228.7381469	total: 4.36s	remaining: 7m 59s
9:	learn: 3913.8651672	total: 4.7s	remaining: 7m 45s
10:	learn: 3632.3348388	total: 5.06s	remaining: 7m 35s
11:	learn: 3388.2224979	total: 5.5s	remaining: 7m 33s
12:	learn: 3177.9092723	total: 5.91s	remaining: 7m 28s
13:	learn: 2992.3885427	total: 6.27s	remaining: 7m 21s
14:	learn: 2835.8310045	total: 6.67s	remaining: 7m 17s
15:	learn: 2698.6124340	total: 7s	remaining: 7m 10s
16:	learn: 2581.3616538	total: 7.39s	remaining: 7m 7s
17:	learn: 2482.6609878	total: 7.87s	remaining: 7m

<catboost.core.CatBoostRegressor at 0x7efedfbb5a50>

In [13]:
cat.save_model('../../models/model.cb')

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = cat.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error (MSE): 3096935.6878377753
Mean Absolute Error (MAE): 926.1691450353563
R^2 Score: 0.9674797008331115


In [15]:
X_train.columns

Index(['year', 'make', 'body', 'transmission', 'state', 'condition',
       'odometer', 'color', 'interior', 'mmr', 'saledate'],
      dtype='object')

In [16]:
for col in cat_features:
    print(col, list(set(X_train[col])))

make ['no', 'volkswagen', 'Mitsubishi', 'Audi', 'ford', 'Geo', 'Hyundai', 'jeep', 'FIAT', 'Acura', 'dodge', 'hyundai', 'oldsmobile', 'acura', 'hyundai tk', 'Porsche', 'gmc truck', 'subaru', 'mercury', 'Mercury', 'Ram', 'MINI', 'Plymouth', 'cadillac', 'dodge tk', 'Saab', 'Maserati', 'maserati', 'porsche', 'Lamborghini', 'Dodge', 'Jeep', 'airstream', 'mitsubishi', 'ford truck', 'Saturn', 'buick', 'Mazda', 'Oldsmobile', 'suzuki', 'chevrolet', 'mercedes', 'chev truck', 'audi', 'Volvo', 'HUMMER', 'gmc', 'Lexus', 'BMW', 'Tesla', 'Kia', 'Rolls-Royce', 'Lincoln', 'Daewoo', 'Land Rover', 'Honda', 'lincoln', 'Fisker', 'nissan', 'Ferrari', 'Suzuki', 'Infiniti', 'plymouth', 'Bentley', 'Ford', 'Pontiac', 'Cadillac', 'Mercedes-Benz', 'landrover', 'GMC', 'Volkswagen', 'pontiac', 'Nissan', 'kia', 'toyota', 'land rover', 'vw', 'honda', 'chrysler', 'mazda', 'Jaguar', 'mercedes-b', 'bmw', 'Isuzu', 'Aston Martin', 'Subaru', 'Toyota', 'Buick', 'dot', 'Chrysler', 'smart', 'Scion', 'lexus', 'Chevrolet']
body