In [11]:
# Kütüphaneler eklendi
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [12]:
# Load the cleaned salary dataset (path is relative to the notebook's folder)
dataset_path = '../data/processed/main_salary_dataset.csv'
df = pd.read_csv(dataset_path)
print("Veri seti yüklendi")


Veri seti yüklendi


In [13]:
# Ordinal encoding of the categorical columns
education_map = {
    "unknown": 0,
    "high_school": 1,
    "bachelor": 2,
    "master": 3,
    "phd": 4,
}
seniority_map = {
    'junior': 0,
    'senior': 1,
}

# Series.map yields NaN for every label that is not a key of the dict and does
# so without any warning. Each mapping is therefore followed by a report of the
# labels it could not encode, so typos or new categories are visible here
# instead of surfacing later as NaN features.
for source_col, label_map in (
    ('education_level', education_map),
    ('seniority_level', seniority_map),
):
    encoded_col = f'{source_col}_encoded'
    df[encoded_col] = df[source_col].map(label_map)

    unmapped_rows = df[encoded_col].isna()
    if unmapped_rows.any():
        unmapped_labels = df.loc[unmapped_rows, source_col].unique().tolist()
        print(
            f"UYARI: '{source_col}' sütununda {unmapped_rows.sum()} satır "
            f"eşlenemedi ve NaN olarak kodlandı: {unmapped_labels}"
        )

In [14]:
# y: the prediction target (salary)
# X: every remaining column, i.e. the information available for predicting it
y = df['salary']
X = df.drop('salary', axis=1)

In [15]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Target-encoding statistics, computed from the training split only:
#   global_mean     - mean salary over all training rows
#   job_title_means - mean training salary per job title
# NOTE(review): each training row's own salary is part of its job-title mean,
# so titles with very few rows are encoded close to the target itself.
# Consider smoothing or out-of-fold encoding if overfitting shows up.
global_mean = y_train.mean()
job_title_means = y_train.groupby(X_train['job_title']).mean()

In [17]:
def encode_job_title(title):
    """Return the target-encoded value for a job title.

    Looks ``title`` up in the module-level ``job_title_means`` and returns the
    stored value; when the lookup finds nothing, the module-level
    ``global_mean`` is returned instead.
    """
    encoded_value = job_title_means.get(title, default=global_mean)
    return encoded_value

In [18]:
# Add the target-encoded job title to both splits, using the statistics learned
# from the training split.
# DataFrame.assign returns a new frame, so the column is not written in place
# into the objects handed back by train_test_split. Depending on the pandas
# version, in-place assignment on those frames may be flagged with
# SettingWithCopyWarning. Re-running the cell simply overwrites the column.
X_train = X_train.assign(
    job_title_encoded=X_train['job_title'].apply(encode_job_title)
)
X_test = X_test.assign(
    job_title_encoded=X_test['job_title'].apply(encode_job_title)
)

In [19]:
# Feature columns fed to the model, in the order the model will see them
cols_to_use = [
    'age',
    'years_experience',
    'education_level_encoded',
    'seniority_level_encoded',
    'job_title_encoded',
]

# Restrict both splits to exactly those columns
X_train_final, X_test_final = X_train[cols_to_use], X_test[cols_to_use]

In [20]:
# Train the random forest regressor (100 trees, fixed seed)
print("Model eğitimine geçildi")
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_final,
    y_train,
)
print("Eğitim tamamlandı")

Model eğitimine geçildi
Eğitim tamamlandı


In [22]:
# Score the trained model on the held-out test split
y_pred = model.predict(X_test_final)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [23]:
# Print the test-set metrics (MAE with 2 decimals, R2 with 4 decimals)
print(
    "\n--- SONUÇLAR ---",
    f"Ortalama Hata Payı (MAE): {mae:.2f}",
    f"Başarı Skoru (R2): {r2:.4f}",
    sep="\n",
)


--- SONUÇLAR ---
Ortalama Hata Payı (MAE): 11546.42
Başarı Skoru (R2): 0.8904


In [24]:
# How much does each feature contribute? Pair every feature name with the
# importance reported by the fitted forest and list the largest first.
importance_table = pd.DataFrame({
    'feature': cols_to_use,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\n--- ÖNEM DÜZEYLERİ ---")
print(feature_importance)


--- ÖNEM DÜZEYLERİ ---
                   feature  importance
1         years_experience    0.740641
4        job_title_encoded    0.185260
0                      age    0.048360
2  education_level_encoded    0.017661
3  seniority_level_encoded    0.008077


In [25]:
# Persist the fitted model together with the target-encoding statistics
# (per-title means and the global fallback mean).
# NOTE(review): education_map / seniority_map are not saved here; confirm the
# inference code defines the same mappings.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'salary_model_target_encoded.pkl')
job_title_means_path = os.path.join(models_dir, 'job_title_means.pkl')
global_mean_path = os.path.join(models_dir, 'global_mean_salary.pkl')

joblib.dump(model, model_path)
joblib.dump(job_title_means, job_title_means_path)
joblib.dump(global_mean, global_mean_path)

['../models\\global_mean_salary.pkl']