In [25]:
# Standard Libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Modules
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.exceptions import ConvergenceWarning

# Linear Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Tree-based Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Neighbors and SVM
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Boosting Libraries
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Pandas display settings: lift the column/row/width limits and render
# floats with three digits after the decimal point.
def format_float_3dp(value):
    """Return ``value`` formatted with exactly three decimal places."""
    return '%.3f' % value


display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': None,
    'display.float_format': format_float_3dp,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Warning configuration: install global "ignore" filters for FutureWarning
# and ConvergenceWarning, in that order.
for ignored_category in (FutureWarning, ConvergenceWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

In [26]:
# Load cleaned_hitters.csv (path is relative to the working directory).
hitters_csv_path = "cleaned_hitters.csv"
df = pd.read_csv(hitters_csv_path)

In [27]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293.0,66.0,1.0,30.0,29.0,14.0,1.0,293.0,66.0,1.0,30.0,29.0,14.0,A,E,446.0,33.0,20.0,152.838,A
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,N,W,632.0,43.0,10.0,475.0,N
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,A,W,880.0,82.0,14.0,480.0,A
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,N,E,200.0,11.0,3.0,500.0,N
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,N,E,805.0,40.0,4.0,91.5,N


In [28]:
# Show the column labels of the loaded frame.
df.columns

Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',
       'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],
      dtype='object')

In [29]:
# Print a per-column summary (non-null counts and dtypes) of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      322 non-null    float64
 1   Hits       322 non-null    float64
 2   HmRun      322 non-null    float64
 3   Runs       322 non-null    float64
 4   RBI        322 non-null    float64
 5   Walks      322 non-null    float64
 6   Years      322 non-null    float64
 7   CAtBat     322 non-null    float64
 8   CHits      322 non-null    float64
 9   CHmRun     322 non-null    float64
 10  CRuns      322 non-null    float64
 11  CRBI       322 non-null    float64
 12  CWalks     322 non-null    float64
 13  League     322 non-null    object 
 14  Division   322 non-null    object 
 15  PutOuts    322 non-null    float64
 16  Assists    322 non-null    float64
 17  Errors     322 non-null    float64
 18  Salary     322 non-null    float64
 19  NewLeague  322 non-null    object 
dtypes: float64

In [30]:
# Separate the target (Salary) from the predictors (every other column).
target_col = 'Salary'
y = df[target_col]
X = df.drop(columns=target_col)

In [31]:
# Categorical predictors are listed explicitly; every remaining column of X
# is treated as numeric, keeping the frame's column order.
cat_cols = ['League', 'Division', 'NewLeague']
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

In [32]:
# Convert every categorical predictor to pandas' 'category' dtype in one call.
X = X.astype({cat_name: 'category' for cat_name in cat_cols})

In [33]:
# Shared preprocessing: standardize the numeric columns and one-hot encode
# the categorical ones. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ])

# Single seed shared by every estimator that accepts random_state.
RANDOM_STATE = 17

# Models: (display name, unfitted estimator) pairs to be compared.
models = [
    ('LR', LinearRegression()),
    ("Ridge", Ridge(random_state=RANDOM_STATE)),
    ("Lasso", Lasso(random_state=RANDOM_STATE)),
    ("ElasticNet", ElasticNet(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('RF', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('GBM', GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ("XGBoost", XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)),
    # verbose=-1 silences LightGBM's per-fit [Info] log lines, which would
    # otherwise be dumped into the notebook output on every fit.
    ("LightGBM", LGBMRegressor(random_state=RANDOM_STATE, verbose=-1))
]


In [34]:
# Score every candidate model with cross-validation (cv=5), report the mean
# and standard deviation of the per-fold RMSE, then refit the best model on
# the full data set.
results = []
for name, model in models:
    try:
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # The scorer returns the negated RMSE of each fold (scikit-learn
        # scorers follow a "higher is better" convention).
        cv_scores = cross_val_score(pipeline, X, y,
                                    cv=5,
                                    scoring="neg_root_mean_squared_error",
                                    n_jobs=-1)
    except Exception as e:
        # Best-effort comparison: report the failure and move on. NaN (rather
        # than None) keeps the result columns numeric, so sorting and idxmin
        # below simply skip the failed model.
        print(f"{name} modelinde hata: {str(e)}")
        results.append((name, np.nan, np.nan))
    else:
        rmse_scores = -cv_scores
        mean_rmse = np.mean(rmse_scores)
        std_rmse = np.std(rmse_scores)

        results.append((name, mean_rmse, std_rmse))
        print(f"{name:10} | RMSE: {mean_rmse:.2f} ± {std_rmse:.2f}")

# Collect the results into a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'RMSE', 'Std'])
print("\nModel Performans Karşılaştırması:")
print(results_df.sort_values('RMSE'))

# Without at least one successful evaluation there is nothing to select;
# fail with a clear message instead of an opaque idxmin() error.
if results_df['RMSE'].isna().all():
    raise RuntimeError("No model could be evaluated; cannot select a best model.")

# Pick the model with the lowest mean RMSE and fit it on all of the data
best_model_name = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
best_model = next(
    candidate for candidate_name, candidate in models
    if candidate_name == best_model_name
)

final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', best_model)
])

final_pipeline.fit(X, y)
print(f"\nEn iyi model: {best_model_name} - Tüm veriyle eğitildi.")

LR         | RMSE: 296.63 ± 49.16
Ridge      | RMSE: 295.29 ± 51.88
Lasso      | RMSE: 295.33 ± 52.14
ElasticNet | RMSE: 305.85 ± 63.98
KNN        | RMSE: 303.94 ± 50.01
CART       | RMSE: 385.63 ± 39.40
RF         | RMSE: 276.54 ± 50.76
GBM        | RMSE: 275.76 ± 53.02
XGBoost    | RMSE: 305.79 ± 46.56
LightGBM   | RMSE: 264.06 ± 58.14

Model Performans Karşılaştırması:
        Model    RMSE    Std
9    LightGBM 264.059 58.140
7         GBM 275.762 53.025
6          RF 276.538 50.763
1       Ridge 295.292 51.881
2       Lasso 295.332 52.143
0          LR 296.630 49.157
4         KNN 303.937 50.010
8     XGBoost 305.786 46.559
3  ElasticNet 305.850 63.983
5        CART 385.627 39.403
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1236
[LightGBM] [Info] Number of data points in the train set: 322, number of used features: 22
[LightGBM] [Info] S

In [35]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Build the pipeline: same preprocessing as the model comparison
# (scaled numerics + dense one-hot categoricals) feeding a LinearRegression.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = pipeline.predict(X_test)

# 1. Basic metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Render the metrics as a boxed table. Cell widths are derived from the
# formatted values (with the original 12/9 minimums) so the right border
# lines up no matter how many digits each metric has.
metric_rows = [
    ("RMSE", f"{rmse:.2f}"),
    ("MSE", f"{mse:.2f}"),
    ("MAE", f"{mae:.2f}"),
    ("R² Score", f"{r2:.4f}"),
]
label_width = max(12, max(len(label) for label, _ in metric_rows))
value_width = max(9, max(len(value) for _, value in metric_rows))
# Two cells, each padded by one space per side, plus the middle separator.
inner_width = label_width + value_width + 5

print("╔" + "═" * inner_width + "╗")
print("║" + "   MODEL PERFORMANSI".ljust(inner_width) + "║")
print("╠" + "═" * (label_width + 2) + "╦" + "═" * (value_width + 2) + "╣")
for label, value in metric_rows:
    print(f"║ {label:<{label_width}} ║ {value:<{value_width}} ║")
print("╚" + "═" * (label_width + 2) + "╩" + "═" * (value_width + 2) + "╝")

╔══════════════════════════╗
║   MODEL PERFORMANSI      ║
╠══════════════╦═══════════╣
║ RMSE         ║ 362.53  ║
║ MSE          ║ 131429.45 ║
║ MAE          ║ 273.01  ║
║ R² Score     ║ 0.5490  ║
╚══════════════╩═══════════╝


In [36]:
# 2. Coefficient analysis
# Pull the fitted steps out of the pipeline once.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_regressor = pipeline.named_steps['model']

# Feature names after one-hot encoding, appended to the numeric column names
# in the same order the ColumnTransformer lists its transformers.
fitted_encoder = fitted_preprocessor.named_transformers_['cat']
cat_features = fitted_encoder.get_feature_names_out(cat_cols)
all_features = np.concatenate([num_cols, cat_features])

# One row per feature, ordered from the largest to the smallest coefficient.
coef_df = pd.DataFrame({
    'Feature': all_features,
    'Coefficient': fitted_regressor.coef_
})
coef_df = coef_df.sort_values('Coefficient', ascending=False)

print("\nEn Önemli 5 Pozitif Etki:")
print(coef_df.iloc[:5])
print("\nEn Önemli 5 Negatif Etki:")
print(coef_df.iloc[-5:])


En Önemli 5 Pozitif Etki:
    Feature  Coefficient
8     CHits      768.041
1      Hits      216.140
9    CHmRun      214.576
5     Walks      152.458
14  Assists       71.176

En Önemli 5 Negatif Etki:
   Feature  Coefficient
3     Runs      -74.306
11    CRBI      -91.877
12  CWalks     -167.277
0    AtBat     -194.351
7   CAtBat     -525.645
