In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

In [20]:
# Name of the CSV file (relative path) that is loaded with pd.read_csv.
data_file_name = "output_file.csv"

In [21]:
# Load the dataset from disk into a DataFrame and print its schema summary
# (column names, non-null counts and dtypes).
base_df = pd.read_csv(filepath_or_buffer=data_file_name, encoding="utf-8")
base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46234 entries, 0 to 46233
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Yıl                46234 non-null  int64  
 1   Model              46234 non-null  int64  
 2   Renk               46234 non-null  int64  
 3   Boya-değişen       46234 non-null  float64
 4   Çekiş              46234 non-null  float64
 5   Arka Tampon        46234 non-null  float64
 6   Sağ Ön Çamurluk    46234 non-null  float64
 7   Sol Ön Çamurluk    46234 non-null  float64
 8   Kilometre          46234 non-null  int64  
 9   Fiyat              46234 non-null  int64  
 10  Marka              46234 non-null  int64  
 11  Yakıt Tipi         46234 non-null  float64
 12  Vites Tipi         46234 non-null  float64
 13  Kasa Tipi          46234 non-null  float64
 14  Ön Tampon          46234 non-null  float64
 15  Arka Kaput         46234 non-null  float64
 16  Sol Arka Kapı      462

In [22]:
# Work on a copy of the raw frame without the "_id" column.
useful_df = base_df.drop(columns=["_id"])

In [23]:
# Stage 1 - balance brands: oversample (with replacement) every "Marka" group
# up to the size of the largest group.
# NOTE(review): the duplicated rows created here are still present when the
# data is split into train/test afterwards, so copies of the same row can land
# on both sides of the split - consider oversampling only the training part.
groups = useful_df.groupby("Marka")
max_group_size = groups.size().max()

balanced_data = []
for _, brand_rows in groups:
    shortfall = max_group_size - len(brand_rows)
    if shortfall > 0:
        # Fixed seed so a Restart & Run All reproduces the same resampled rows.
        extra_rows = brand_rows.sample(n=shortfall, replace=True, random_state=42)
        brand_rows = pd.concat([brand_rows, extra_rows], axis=0)
    balanced_data.append(brand_rows)

balanced_df = pd.concat(balanced_data)

# Stage 2 - per-brand outlier removal: keep only rows whose "Fiyat" lies inside
# the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] computed within the brand.
cleaned_data = []
for _, brand_rows in balanced_df.groupby("Marka"):
    prices = brand_rows["Fiyat"]
    q1 = prices.quantile(0.25)
    q3 = prices.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Row-wise boolean mask instead of dropping by index label: the oversampled
    # rows share index labels with their originals, so labels are not unique.
    is_outlier = (prices < lower_bound) | (prices > upper_bound)
    cleaned_data.append(brand_rows[~is_outlier])

ready_to_train = pd.concat(cleaned_data)


In [24]:
# Separate the features from the target column ("Fiyat").
X = ready_to_train.drop("Fiyat", axis=1)
y = ready_to_train["Fiyat"]

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

# Min-max normalised copies of the features, kept only for the neural-network
# training; the tree-based models use the unscaled frames.
# The scaler is fitted on the training split ONLY and then applied to the test
# split, so both splits share one feature scale and no test-set statistics
# influence preprocessing.
scaler_X_train = MinMaxScaler()
scaled_X_train = pd.DataFrame(scaler_X_train.fit_transform(X_train), columns=X.columns)
scaled_X_test = pd.DataFrame(scaler_X_train.transform(X_test), columns=X.columns)

# Kept as an alias so any code that still refers to `scaler_X_test` keeps
# working; it is the same train-fitted scaler.
scaler_X_test = scaler_X_train

In [25]:
# Baseline gradient-boosted regressor with library-default hyperparameters,
# trained on the unscaled training split.
model_xgb = XGBRegressor()
model_xgb.fit(X=X_train, y=y_train)

In [26]:
# Score the XGBoost model on both splits.
train_predictions_xgb = model_xgb.predict(X_train)
test_predictions_xgb = model_xgb.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
train_rmse_xgb = mean_squared_error(y_train, train_predictions_xgb) ** 0.5
test_rmse_xgb = mean_squared_error(y_test, test_predictions_xgb) ** 0.5

print("Eğitim RMSE:", train_rmse_xgb)
print("Test RMSE:", test_rmse_xgb)

Eğitim RMSE: 82911.34933535417
Test RMSE: 101936.05690042331




In [27]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Configure the K-fold cross-validation (run on the training split only).
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

train_rmse_scores = []
test_rmse_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fold-local names on purpose: rebinding `model_xgb` or the
    # `*_predictions_xgb` variables here would silently replace the model
    # fitted on the full training split (and its predictions) with the
    # last fold's, breaking any cell that uses them afterwards.
    fold_model = XGBRegressor()
    fold_model.fit(X_train_kf, y_train_kf)

    fold_train_predictions = fold_model.predict(X_train_kf)
    fold_val_predictions = fold_model.predict(X_val_kf)

    # RMSE as sqrt(MSE); `squared=False` is no longer accepted by recent
    # scikit-learn releases.
    train_rmse = mean_squared_error(y_train_kf, fold_train_predictions) ** 0.5
    test_rmse = mean_squared_error(y_val_kf, fold_val_predictions) ** 0.5

    print(f"Train : {train_rmse}")
    print(f"Test : {test_rmse}")

    train_rmse_scores.append(train_rmse)
    test_rmse_scores.append(test_rmse)

# Average the per-fold RMSE values
avg_train_rmse = np.mean(train_rmse_scores)
avg_test_rmse = np.mean(test_rmse_scores)

print(f"Ortalama Eğitim RMSE: {avg_train_rmse}")
print(f"Ortalama Test RMSE: {avg_test_rmse}")




Train : 90806.85149715249
Test : 106006.46536745034




Train : 85718.98167996315
Test : 97304.81698970642




Train : 79895.19910243641
Test : 87545.68475590005




Train : 81065.46695699508
Test : 227007.29350779072
Train : 87686.75173441564
Test : 100927.57295407927
Ortalama Eğitim RMSE: 85034.65019419257
Ortalama Test RMSE: 123758.36671498537




In [28]:
# RandomForestRegressor with default hyperparameters, trained on the unscaled
# training split. The forest is built from random bootstrap samples and random
# feature subsets, so the seed is pinned to make the reported scores
# reproducible across kernel restarts.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)

In [29]:
# Score the random-forest model on both splits.
train_predictions_rf = model_rf.predict(X_train)
test_predictions_rf = model_rf.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
train_rmse_rf = mean_squared_error(y_train, train_predictions_rf) ** 0.5
test_rmse_rf = mean_squared_error(y_test, test_predictions_rf) ** 0.5

print("Eğitim RMSE:", train_rmse_rf)
print("Test RMSE:", test_rmse_rf)

Eğitim RMSE: 34067.8834687848
Test RMSE: 76850.8054800351


