# Price Prediction, Anomaly Detection & Clustering

Dataset: kolesa_ml_ready.csv

In [2]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.cluster import KMeans

from catboost import CatBoostRegressor
import shap
import matplotlib.pyplot as plt


In [13]:

df = pd.read_csv('../data/processed/kolesa_cleaned.csv')
df.head()


Unnamed: 0,title,price,description,city,views,url,year,mileage,engine_l,fuel,gearbox,car_age,price_per_year,price_per_km,price_z
0,ВАЗ (Lada) Lada 2121,2000000.0,"2008 г., Б/у внедорожник, 1.7 л, бензин, КПП м...",Алматы,0,https://kolesa.kz/a/show/207117446?search_id=9...,2008.0,160000.0,1.7,petrol,manual,17.0,117647.058824,12.5,-0.596179
1,Mitsubishi Delica,8500000.0,"1995 г., Б/у минивэн, 3 л, бензин, Правый руль...",Алматы,0,https://kolesa.kz/a/show/205868127?search_id=9...,1995.0,220000.0,3.0,petrol,automatic,30.0,283333.333333,38.636364,-0.185337
2,BMW 528,6500000.0,"2012 г., Б/у седан, 2 л, бензин, КПП автомат, ...",Караганда,0,https://kolesa.kz/a/show/206593170?search_id=9...,2012.0,273000.0,2.0,petrol,automatic,13.0,500000.0,23.809524,-0.31175
3,Toyota Camry,6050000.0,"2011 г., Б/у седан, 2.5 л, бензин, КПП автомат...",Актау,0,https://kolesa.kz/a/show/207108864?search_id=9...,2011.0,174000.0,2.5,petrol,automatic,14.0,432142.857143,34.770115,-0.340192
4,Mazda 626,1800000.0,"1998 г., Б/у универсал, 1.8 л, бензин, КПП мех...",Шымкент,0,https://kolesa.kz/a/show/207105826?search_id=9...,1998.0,444444.0,1.8,petrol,manual,27.0,66666.666667,4.050004,-0.60882


In [12]:

# Check data types and non-numeric columns
print("Data shape:", df.shape)
print("\nData types:")
print(df.dtypes)
print("\nNon-numeric columns:")
non_numeric = df.select_dtypes(include=['object']).columns.tolist()
print(non_numeric)


Data shape: (2020, 15)

Data types:
title              object
price             float64
description        object
city               object
views               int64
url                object
year              float64
mileage           float64
engine_l          float64
fuel               object
gearbox            object
car_age           float64
price_per_year    float64
price_per_km      float64
price_z           float64
dtype: object

Non-numeric columns:
['title', 'description', 'city', 'url', 'fuel', 'gearbox']


## Feature Engineering

In [14]:

# Drop columns that shouldn't be used for modeling
df = df.drop(['title', 'description', 'url'], axis=1)

df['mileage'] = df['mileage'].fillna(df['mileage'].median())
df['engine_l'] = df['engine_l'].fillna(df['engine_l'].median())

df['mileage_per_year'] = df['mileage'] / df['car_age'].replace(0,1)

city_price = df.groupby('city')['price'].mean()
df['city_price_index'] = df['city'].map(city_price)

df['price_per_km'] = df['price'] / df['mileage'].replace(0,1)


In [21]:
df['price_per_year'] = df['price'] / df['car_age'].replace(0, 1)


## Encoding

In [22]:

for col in ['fuel','gearbox','city']:
    df[col] = LabelEncoder().fit_transform(df[col])


## Price Prediction

In [23]:

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [24]:

# Check for infinity and NaN values
print("Infinity values:")
print((np.isinf(df)).sum())
print("\nNaN values:")
print(df.isna().sum())

# Replace infinity with NaN and then fill with median
df = df.replace([np.inf, -np.inf], np.nan)
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isna().any():
        df[col].fillna(df[col].median(), inplace=True)


Infinity values:
price               0
city                0
views               0
year                0
mileage             0
engine_l            0
fuel                0
gearbox             0
car_age             0
price_per_year      0
price_per_km        0
price_z             0
mileage_per_year    0
city_price_index    0
dtype: int64

NaN values:
price               0
city                0
views               0
year                0
mileage             0
engine_l            0
fuel                0
gearbox             0
car_age             0
price_per_year      0
price_per_km        0
price_z             0
mileage_per_year    0
city_price_index    0
dtype: int64


In [None]:

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE:", rmse)
print("RF R2:", r2_score(y_test, pred))


TypeError: got an unexpected keyword argument 'squared'

In [None]:

cat = CatBoostRegressor(iterations=300, learning_rate=0.1, depth=8, verbose=False)
cat.fit(X_train, y_train)

pred = cat.predict(X_test)
print("CatBoost RMSE:", mean_squared_error(y_test, pred, squared=False))
print("CatBoost R2:", r2_score(y_test, pred))


## SHAP Interpretation

In [None]:

explainer = shap.TreeExplainer(cat)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test)


## Anomaly Detection

In [None]:

iso = IsolationForest(contamination=0.05, random_state=42)
df['anomaly'] = iso.fit_predict(X)

df['z_price'] = (df['price'] - df['price'].mean()) / df['price'].std()
df[df['anomaly'] == -1].head()


## Clustering

In [None]:

cluster_features = df[['price','car_age','mileage','engine_l']]

kmeans = KMeans(n_clusters=4, random_state=42)
df['cluster'] = kmeans.fit_predict(cluster_features)

df.groupby('cluster')[['price','car_age','mileage']].mean()
