In [13]:
import pandas as pd

# Load the model-ready training table that the rest of the notebook works on.
df = pd.read_csv("data/processed/train_ready.csv")

# Preview five random rows. The sample is seeded so that the displayed rows
# are the same after every Restart Kernel -> Run All.
df.sample(5, random_state=42)


Unnamed: 0,capacitate motor,putere,rulaj,an fabricatie,age_years,marca,brand_category,model_simplified,caroserie,combustibil,...,color_maro_/_bej,color_alta_culoare,color_verde,power_to_displacement,power_per_liter,mileage_per_year,model_frequency,log_mileage,log_engine_size,pret_log
2995,1499,75,203000,2013,11,ford,standard,Fiesta,Hatchback,Diesel,...,0,0,0,50.033356,50.033356,16916.666667,51,12.220966,7.31322,8.409831
1336,2000,150,300000,2008,16,chevrolet,budget,Captiva,Off-road,Diesel,...,0,0,0,75.0,75.0,17647.058824,74,12.611541,7.601402,8.575651
2654,1598,77,146945,2016,8,fiat,budget,UNKNOWN,Hatchback,Diesel,...,0,0,0,48.185232,48.185232,16327.222222,30,11.89782,7.377134,8.412055
286,2989,272,180000,2016,8,audi,premium,A6,Berlina,Diesel,...,0,0,0,91.000335,91.000335,20000.0,118,12.100718,8.003029,9.952325
1878,1600,105,230580,2011,13,dacia,budget,Duster,SUV,Benzina,...,0,0,0,65.625,65.625,16470.0,166,12.348357,7.378384,8.566174


In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -----------------------------
# 1. Split X / y (log target)
# -----------------------------
# Target: the log-scale price column.
y_log = df["pret_log"]

# Features: every column except the target.
# "Unnamed: 0" is the name pandas assigns to a header-less first CSV column
# (typically a row index that was written out with the data). It identifies
# rows rather than describing the car, so it must not reach the regressor as
# a numeric feature. `errors="ignore"` makes this a no-op when the column is
# absent.
# NOTE(review): part of the column list is elided in the preview -- confirm no
# untransformed price column remains in X (target leakage).
X = df.drop(columns=["pret_log"]).drop(columns=["Unnamed: 0"], errors="ignore")

# Train / test split: 80 % train, 20 % held out, fixed seed for repeatability.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# -----------------------------
# 2. Preprocessing (OneHot for categoricals)
# -----------------------------
# Partition the feature columns by dtype: object columns are one-hot encoded,
# every other column is handed to the model unchanged.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numeric_cols = list(X.select_dtypes(exclude=["object"]).columns)

# handle_unknown="ignore": categories that were not present at fit time do not
# raise at transform time.
one_hot = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

# -----------------------------
# 3. Pipeline: preproc + Linear Regression
# -----------------------------
# Two stages: column preprocessing, then an ordinary least-squares regressor
# fitted on the log-scale target.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

# Fit on the training split only; the test split stays untouched until scoring.
pipe.fit(X_train, y_train_log)

# -----------------------------
# 4. Predictions (log and EUR)
# -----------------------------
# The model predicts on the same log scale as `pret_log`.
y_pred_log = pipe.predict(X_test)

# back-transform to price: apply expm1 to both the true and predicted values
# NOTE(review): expm1 is only the exact inverse if pret_log was built with
# log1p(price) -- confirm against the preprocessing step.
y_test_price, y_pred_price = (
    np.expm1(log_values) for log_values in (y_test_log, y_pred_log)
)

# -----------------------------
# 5. Metrics
# -----------------------------
# mean_squared_error, mean_absolute_error and r2_score are already imported
# with the other sklearn imports at the top of this cell; they are not
# re-imported here.

# log-space: error measured on the scale the model was trained on
mse_log = mean_squared_error(y_test_log, y_pred_log)
rmse_log = np.sqrt(mse_log)
r2_log = r2_score(y_test_log, y_pred_log)

# price-space (EUR): error after back-transforming truth and predictions
mse_price = mean_squared_error(y_test_price, y_pred_price)
rmse_price = np.sqrt(mse_price)
mae_price = mean_absolute_error(y_test_price, y_pred_price)
r2_price = r2_score(y_test_price, y_pred_price)

print("=== Metrics in LOG space (pret_log) ===")
print(f"RMSE_log: {rmse_log:.4f}")
print(f"R2_log:   {r2_log:.4f}")
print()
print("=== Metrics in PRICE space (EUR) ===")
print(f"RMSE_price: {rmse_price:,.2f} EUR")
print(f"MAE_price:  {mae_price:,.2f} EUR")
print(f"R2_price:   {r2_price:.4f}")

# -----------------------------
# 6. Sample predictions vs true
# -----------------------------
# Side-by-side table of true and predicted prices for the test split.
results = pd.DataFrame(
    {"true_price": y_test_price, "pred_price": y_pred_price}
)

# Show the first 20 rows, rounded to whole units for readability.
preview = results.head(20).round(0)
print("\n=== Sample predictions (EUR) ===")
print(preview)

# -----------------------------
# 7. Coefficients (top positive / negative)
# -----------------------------
fitted_preprocessor = pipe.named_steps["preprocessor"]
fitted_regressor = pipe.named_steps["regressor"]

# Rebuild the feature names in the order used here: the expanded one-hot
# names from the "cat" transformer first, then the numeric column names.
ohe = fitted_preprocessor.named_transformers_["cat"]
ohe_features = ohe.get_feature_names_out(categorical_cols)
all_features = np.concatenate((ohe_features, np.asarray(numeric_cols)))

# One coefficient per feature name, on the log-price scale.
coefs = fitted_regressor.coef_

coef_df = pd.DataFrame({"feature": all_features, "coef": coefs})

# Largest coefficient first, so head() is the positive end and tail() the
# negative end of the ranking.
coef_df_sorted = coef_df.sort_values(by="coef", ascending=False)

top_n = 15

print("\n=== Top 15 positive coefficients (increase log-price) ===")
print(coef_df_sorted.head(top_n))

print("\n=== Top 15 negative coefficients (decrease log-price) ===")
print(coef_df_sorted.tail(top_n))



=== Metrics in LOG space (pret_log) ===
RMSE_log: 0.3494
R2_log:   0.7740

=== Metrics in PRICE space (EUR) ===
RMSE_price: 3,930.28 EUR
MAE_price:  1,875.71 EUR
R2_price:   0.7439

=== Sample predictions (EUR) ===
      true_price  pred_price
3540     16099.0     11878.0
4140     11300.0     11844.0
4231      8150.0      7315.0
3282     38000.0      6303.0
5601      2999.0      3125.0
3413      6300.0      4485.0
6147      4900.0      4518.0
6851      5300.0      4776.0
8199      6600.0      5032.0
9273     12000.0     13202.0
6584     24400.0     19562.0
1018     11800.0     11748.0
7615      2990.0      3264.0
2691      5800.0      4443.0
9140      4150.0      4406.0
7071      5290.0      5198.0
5169      6999.0      6729.0
6126      3500.0      2530.0
6069      3980.0      4223.0
3060      2400.0      2738.0

=== Top 15 positive coefficients (increase log-price) ===
                              feature      coef
133                       low_hp_flag  0.451184
57              model