In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from tensorflow import keras


In [40]:

np.random.seed(42)
n_samples = 1000
df = pd.read_csv('sredjeni.csv')
X = df.drop(columns=["Cena"])
y = df["Cena"]  

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


def evaluate(model, X_val, y_val, name):
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    log_rmse = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds)))
    mape = mean_absolute_percentage_error(y_val, preds)
    print(f"{name} -> RMSE: {rmse:.2f}, log-RMSE: {log_rmse:.4f}, MAPE: {mape:.4f}")



In [41]:
#najobicniji
rf = RandomForestRegressor(n_estimators=200, random_state=42) 
rf.fit(X_train, y_train)
evaluate(rf, X_val, y_val, "Random Forest")

Random Forest -> RMSE: 2637.95, log-RMSE: 0.2452, MAPE: 0.1947


In [42]:
#probamo da vidimo importance svakog parametra >>>>> da optimizujemo 
importances = rf.feature_importances_
feature_names = X_train.columns
sorted_indices = np.argsort(importances)[::-1]
for i in sorted_indices:
    print(f"Feature: {feature_names[i]}, Importance: {importances[i]}")

Feature: Godište, Importance: 0.46994420198274856
Feature: Kubikaža, Importance: 0.2649555383360164
Feature: Pogon, Importance: 0.08509442331761072
Feature: Kilometraža, Importance: 0.06090477056753219
Feature: Equipment_Feature_Count, Importance: 0.03940889015404933
Feature: Marka, Importance: 0.025827457734116363
Feature: Menjač_Manuelni, Importance: 0.019052202525923108
Feature: Boja, Importance: 0.009910515431194745
Feature: Materijal enterijera, Importance: 0.009651951585517382
Feature: Safety_Feature_Count, Importance: 0.00848083128911713
Feature: Automatska_Klima, Importance: 0.00374080318335944
Feature: Gorivo, Importance: 0.0023724344803150456
Feature: Cetiri_Vrata, Importance: 0.0006559794124994931


In [43]:
#probavamo da izbacimo parametre koji su slab importance, poboljsanje bude minimalno. Rezultati su sa i bez korekcije slicni
#rezultatima u radovima.

np.random.seed(42)
n_samples = 1000
df = pd.read_csv('sredjeni.csv')
X_rf = df.drop(columns=["Cena","Cetiri_Vrata", "Gorivo","Automatska_Klima","Safety_Feature_Count","Materijal enterijera","Boja"])
y_rf = df["Cena"]  

X_train_rf, X_temp_rf, y_train_rf, y_temp_rf = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)
X_val_rf, X_test_rf, y_val_rf, y_test_rf = train_test_split(X_temp_rf, y_temp_rf, test_size=0.5, random_state=42)

In [44]:
#najobicniji ali pokrenut nad manje parametara
rf = RandomForestRegressor(n_estimators=200, random_state=42) 
rf.fit(X_train_rf, y_train_rf)
evaluate(rf, X_val_rf, y_val_rf, "Random Forest")

Random Forest -> RMSE: 2650.82, log-RMSE: 0.2416, MAPE: 0.1905


In [51]:

#unapredjeni rf koji trazi najbolji rezultat sa kombinacijamam parametara. koristimo sada sve parametre
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_percentage_error', # Use MAPE as the scoring metric
    verbose=2,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_
evaluate(best_rf, X_val, y_val, "Tuned Random Forest")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Tuned Random Forest -> RMSE: 2567.90, log-RMSE: 0.2414, MAPE: 0.1908


In [52]:

#unapredjeni rf koji trazi najbolji rezultat sa kombinacijamam parametara. koristimo samo odre]ene parametre -- ovdje smo dobili najbolji rezultat za rf

random_search.fit(X_train_rf, y_train_rf)

best_rf = random_search.best_estimator_
evaluate(best_rf, X_val_rf, y_val_rf, "Tuned Random Forest")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Tuned Random Forest -> RMSE: 2632.33, log-RMSE: 0.2405, MAPE: 0.1884


In [53]:
#najobicniji xgboost
#radimo sa cijelim skupom kolona jer on ima parametar colsample
xgb_model = xgb.XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
evaluate(xgb_model, X_val, y_val, "XGBoost")



XGBoost -> RMSE: 2607.09, log-RMSE: 0.2605, MAPE: 0.1871


In [54]:
# optimiyacija  hiperparametara MAPE 0.17
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3, 4]
}

xgb_model = xgb.XGBRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

best_xgb_model = random_search.best_estimator_

evaluate(best_xgb_model, X_val, y_val, "Optimized XGBoost")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Optimized XGBoost -> RMSE: 2529.35, log-RMSE: 0.2256, MAPE: 0.1768


In [55]:
#ocekivali smo da ann ima gore rezultate, ali smo se prijatno iznenadili. Krenuli smo sa greskom MAPE 0.6, i dosli do 0.2
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=1)

preds = model.predict(X_val).flatten()
rmse = np.sqrt(mean_squared_error(y_val, preds))
log_rmse = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds)))
mape = mean_absolute_percentage_error(y_val, preds)
print(f"Feed-forward ANN -> RMSE: {rmse:.2f}, log-RMSE: {log_rmse:.4f}, MAPE: {mape:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 149039488.0000 - val_loss: 136170576.0000
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 128123616.0000 - val_loss: 60235112.0000
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 53830944.0000 - val_loss: 44024504.0000
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 44359080.0000 - val_loss: 35767912.0000
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 32615912.0000 - val_loss: 29585968.0000
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 30360646.0000 - val_loss: 24718084.0000
Epoch 7/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 24019928.0000 - val_loss: 21652396.0000
Epoch 8/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 