In [147]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

Bir masadaki kişi sayısı, ödenen ücret, masadaki kişilerin sigara içme durumu, öğle veya akşam yemeği olma durumlarına bakarak yemek sonunda verilen bahşişi tahmin etmeye çalışacağız.

total_bill: Ödenen ücret | 
tip: Bahşiş | 
sex: Cinsiyet | 
smoker: Sigara içme durumu | 
day: Günler | 
time: Öğle Yemeği, Akşam Yemeği | 
size: Masadaki kişi sayısı

In [113]:
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [114]:
le = LabelEncoder() # kategorik değişkenleri sayısal değişkenlere dönüştürme
df["smoker_le"] = le.fit_transform(df["smoker"])
df["time_le"] = le.fit_transform(df["time"])
df["sex_le"] = le.fit_transform(df["sex"])

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   smoker_le   244 non-null    int32   
 8   time_le     244 non-null    int32   
 9   sex_le      244 non-null    int32   
dtypes: category(4), float64(2), int32(3), int64(1)
memory usage: 10.2 KB


In [116]:
df = df.select_dtypes(include=["float64", "int64", "int32"])

In [117]:
X = df.drop("tip", axis=1)
y = df[["tip"]]
X.shape, y.shape

((244, 5), (244, 1))

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=24)

In [119]:
from statsmodels.api import OLS

In [120]:
lm = OLS(y, X)

In [121]:
model = lm.fit()

In [122]:
model.summary() # çoklu doğrusal model ile ilgili istatiksel açıklamalar

0,1,2,3
Dep. Variable:,tip,R-squared (uncentered):,0.903
Model:,OLS,Adj. R-squared (uncentered):,0.901
Method:,Least Squares,F-statistic:,446.2
Date:,"Thu, 23 Nov 2023",Prob (F-statistic):,5.28e-119
Time:,13:08:22,Log-Likelihood:,-352.67
No. Observations:,244,AIC:,715.3
Df Residuals:,239,BIC:,732.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
total_bill,0.0987,0.010,10.328,0.000,0.080,0.118
size,0.3278,0.076,4.327,0.000,0.179,0.477
smoker_le,0.0477,0.135,0.355,0.723,-0.217,0.313
time_le,0.1652,0.143,1.155,0.249,-0.117,0.447
sex_le,0.1268,0.133,0.951,0.343,-0.136,0.389

0,1,2,3
Omnibus:,13.777,Durbin-Watson:,2.077
Prob(Omnibus):,0.001,Jarque-Bera (JB):,26.785
Skew:,0.26,Prob(JB):,1.53e-06
Kurtosis:,4.538,Cond. No.,49.7


In [81]:
lr = LinearRegression()

In [123]:
model = lr.fit(X_train, y_train) # model eğitimi

In [124]:
model.intercept_ # b0 sabiti

array([0.48670116])

In [125]:
model.coef_ # b1, b2, b3, b4, b5 katsayıları

array([[ 0.10565954,  0.20586485, -0.19035164,  0.04839082,  0.02993823]])

In [126]:
masa1 = np.array([[30, 3, 0, 1, 1]])  # x1=total_bill, x2=size, x3=smoker_le, x4=time_le, x5=sex_le 

In [128]:
# 30tl ücret ödeyen[30], 3 kişi gelen[3], sigara içmeyen[0], akşam yemeğinde[1], bir kadın[1] müşterinin verebileceği bahşiş miktarı
tip1 = model.intercept_ + np.sum(model.coef_ * masa1) # y = b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4 + b5*x5
tip1 # y

array([4.35241081])

In [129]:
model.predict(masa1) # hazır tahmin fonskiyonu ile tahmin

array([[4.35241081]])

In [145]:
y_tahmin = model.predict(X_test)

In [146]:
# test hatası
mse = np.sqrt(mean_squared_error(y_test, y_tahmin))
mse

1.0014122994879189

In [154]:
# çapraz doğrulama ile ortalama hata
np.mean(-cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))

1.1406242756516165