In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

##### Ridge Regresyon:
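##### • L2 cezası (katsayıların kareleri toplamı) kullanarak katsayıları sıfıra yaklaştırır, ancak tam olarak sıfırlamaz. Amaç fonksiyonu: hata kareleri toplamı + alpha * betaların kareleri toplamı, yani $\lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_2^2$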
##### • Aşırı öğrenmeye karşı dirençli
##### • En küçük kareler (OLS) modeline kıyasla yanlılığı daha yüksek, varyansı daha düşüktür
##### • Çok parametreli (çok değişkenli) veri setlerinde iyi performans gösterir

In [2]:
# Load the Hitters dataset from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Hitters.csv")
df.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [3]:
# Count the missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64

In [4]:
# Discard every row that contains a missing value, rebinding df instead of mutating in place.
df = df.dropna()

In [5]:
# One-hot encode the three categorical columns; each is replaced by one
# indicator column per category (e.g. League -> League_A, League_N).
df = pd.get_dummies(
    data=df,
    columns=["League", "Division", "NewLeague"],
)

In [6]:
# Keep a single indicator per categorical variable: drop the complementary
# columns so only League_A, Division_E and NewLeague_A remain.
df = df.drop(columns=["League_N", "Division_W", "NewLeague_N"])

In [7]:
# Convert the boolean indicator columns to 0/1 integers with an explicit cast.
# A value-based replace(False -> 0, True -> 1) scans every column of the frame
# and depends on pandas implicitly downcasting the resulting object columns,
# a behaviour pandas 2.2 deprecates (the FutureWarning is hidden by the global
# warnings filter). Casting only the bool columns is explicit and version-stable.
bool_cols = df.select_dtypes(include="bool").columns
df = df.astype({col: "int64" for col in bool_cols})

In [8]:
# Preview the first five rows after cleaning and encoding.
df.head(n=5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_A,Division_E,NewLeague_A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,0,0,0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,1,0,1
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,0,1,0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,0,1,0
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,1,0,1


In [9]:
# Separate the target from the features.
# The double brackets keep y as a one-column DataFrame (2-D), not a Series.
y = df[["Salary"]]
X = df.drop(columns="Salary")
(X.shape, y.shape)

((263, 19), (263, 1))

In [10]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=16,
)

In [11]:
ridge = Ridge(alpha=0.1) # alpha=0.1 is the multiplier of the L2 penalty term

In [12]:
# Fit the ridge model on the training split.
ridge.fit(X=X_train, y=y_train)

In [13]:
ridge.intercept_ # intercept term (b0) of the fitted model

array([151.29063953])

In [14]:
ridge.coef_ # fitted coefficients, one per feature column (19 columns in X)

array([[-1.82990573e+00,  8.89148160e+00,  4.47203442e+00,
        -7.05213350e+00, -1.36014517e+00,  7.05845213e+00,
        -1.26359400e+01, -3.73020190e-02, -5.95076815e-01,
        -1.37640119e+00,  1.97216619e+00,  1.37441058e+00,
        -1.10570924e+00,  2.08721586e-01,  3.13307455e-01,
        -3.81258218e+00, -5.40994584e+01,  1.17678674e+02,
         2.24291432e+01]])

In [15]:
# Predictions on the training rows ("tahmin" = prediction).
y_tahmin = ridge.predict(X=X_train)

In [16]:
# Training error: RMSE between the true training targets and the predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric, so
# the value is unchanged, but the documented order matters as soon as an
# asymmetric metric such as r2_score is used instead.
rmse = np.sqrt(mean_squared_error(y_train, y_tahmin))
rmse

312.68181963533493

In [49]:
# Cross-validated training error: 10-fold CV returns negated MSE per fold;
# negate the fold average and take the square root to report an RMSE.
np.sqrt(
    -cross_val_score(
        ridge,
        X_train,
        y_train,
        cv=10,
        scoring="neg_mean_squared_error",
    ).mean()
)

367.54653270211446

In [18]:
# Predictions on the held-out test rows ("tahmin" = prediction).
y_test_tahmin = ridge.predict(X=X_test)

In [19]:
# Test error: RMSE on the held-out split. Arguments are passed as
# (y_true, y_pred), the order scikit-learn documents; MSE is symmetric,
# so the reported value is the same as with the arguments reversed.
np.sqrt(mean_squared_error(y_test, y_test_tahmin))

296.8886545341516

### Model Tuning (Hiperparametre Ayarlama)

In [79]:
# Candidate regularization strengths: 100 log-spaced values running from
# 0.5 * 10**5 down to 0.5 * 10**-2.
lam_vals = 0.5 * np.logspace(5, -2, num=100)

In [80]:
# Search the candidate alphas with 10-fold cross-validation, scored by negated MSE.
ridgecv = RidgeCV(
    alphas=lam_vals,
    cv=10,
    scoring="neg_mean_squared_error",
)
ridgecv.fit(X=X_train, y=y_train)

In [81]:
# Regularization strength chosen by the cross-validated search over lam_vals.
ridgecv.alpha_

7087.370814634024

In [82]:
# Build the final model using the best alpha value found

In [83]:
# Refit a plain Ridge model on the training split with the alpha chosen by RidgeCV.
ridge_tuned = Ridge(alpha=ridgecv.alpha_)
ridge_tuned = ridge_tuned.fit(X_train, y_train)

In [84]:
# Test RMSE of the tuned model. Metric arguments are given as (y_true, y_pred),
# the order scikit-learn documents; MSE is symmetric, so the value is unchanged,
# but the same call shape stays correct for asymmetric metrics.
y_pred = ridge_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

291.55460060762766