In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
medical_df = pd.read_csv('data/medical.csv')

In [21]:
# Show the frame; pandas elides the middle rows of a long frame in the rendered output.
medical_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [None]:
# Column dtypes and non-null counts for every column.
medical_df.info()

Görünüşe göre "age" (yaş), "children" (çocuk sayısı), "bmi" (vücut kitle indeksi) ve "charges" (ücretler) sütunları sayısaldır; "sex" (cinsiyet), "smoker" (sigara içen) ve "region" (bölge) sütunları ise metindir (muhtemelen kategorik).

Sütunların hiçbiri eksik değer içermiyor, bu da bizi büyük bir iş yükünden kurtarıyor!

Sayısal sütunlara ilişkin bazı istatistikler şunlardır:

In [None]:
# Summary statistics for the numeric columns.
medical_df.describe()

In [None]:
!pip3 install plotly matplotlib seaborn --quiet

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Optional global plot styling. Every line below is commented out, so none of
# these settings is applied when the notebook runs.
# sns.set_style('darkgrid')
# matplotlib.rcParams['font.size'] = 14
# matplotlib.rcParams['figure.figsize'] = (10, 6)
# matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
medical_df.age.describe()

In [None]:
fig = px.histogram(medical_df, 
                   x='age', 
                   marginal='box', 
                   nbins=47, 
                   title='Yaş Dağılımı')
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# BMI histogram drawn in red, with a box plot in the margin.
fig = px.histogram(
    medical_df,
    x='bmi',
    marginal='box',
    color_discrete_sequence=['red'],
    title='Distribution of BMI (Body Mass Index)',
)
fig = fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# Charges per region, split by smoker status, with a marginal box plot.
# NOTE(review): with `y` given, plotly aggregates `charges` per bar (sum by default) — confirm this is the intended statistic.
fig = px.histogram(
    medical_df,
    x='region',
    y="charges",
    marginal='box',
    color='smoker',
    color_discrete_sequence=['green', 'grey', "blue", "black"],
    title='Yıllık Medikal Ücretler',
).update_layout(bargap=0.1)
fig.show()

In [None]:
medical_df.smoker.value_counts()

In [None]:
px.histogram(medical_df, x='smoker', color='sex', title='Sigara İçenler')

In [None]:
# Charges against age, coloured by smoker status; hovering also shows sex.
fig = px.scatter(
    medical_df,
    x='age',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='Yaş vs. Ücretler',
).update_traces(marker_size=5)
fig.show()

In [None]:
# Charges against BMI, coloured by smoker status; hovering also shows sex.
fig = px.scatter(
    medical_df,
    x='bmi',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='BMI vs. Ücretler',
)
fig = fig.update_traces(marker_size=5)
fig.show()

In [None]:
sns.violinplot(medical_df, x="age", y="region")

In [None]:
sns.violinplot(medical_df, x="children", y="sex")

In [None]:
sns.violinplot(medical_df, x="children", y="smoker", hue="region")

In [None]:
sns.barplot(medical_df, x="sex", y="children")

In [None]:
medical_df.charges.corr(medical_df.age)

In [None]:
medical_df.charges.corr(medical_df.bmi)

In [None]:
smoker_values = {'no': 0, 'yes': 1}
smoker_numeric = medical_df.smoker.map(smoker_values)
medical_df.charges.corr(smoker_numeric)

In [None]:
medical_df.corr(numeric_only = True)

In [None]:
sns.heatmap(medical_df.corr(numeric_only = True), cmap='Greens', annot=True)
plt.title('Correlation Matrix')

In [None]:
non_smoker_df = medical_df[medical_df.smoker == 'no']
non_smoker_df

In [None]:
plt.title("Yaş vs. Ücretler")
sns.scatterplot(data=non_smoker_df, x="age", y="charges", alpha=0.7, s=15)

In [None]:
!pip install scikit-learn --quiet

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

model = LinearRegression()

In [None]:
inputs, targets = non_smoker_df[["age", "bmi"]], non_smoker_df.charges
X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [None]:
linear_model = model.fit(X_train, y_train)
# Katsayıları ve sabit terimi al
coefficients = linear_model.coef_
intercept = linear_model.intercept_

In [None]:
# Fitted coefficients of the age + bmi model.
coefficients

In [None]:
# Fitted intercept of the age + bmi model.
intercept

In [None]:
predictions = linear_model.predict(X_test)

In [None]:
fig = px.scatter(x=X_test.age, y=y_test, title="BMI vs. Ücretler")
fig.show()

In [None]:
# 3-D view of non-smokers: age and bmi against charges, with small translucent markers.
fig = px.scatter_3d(
    non_smoker_df,
    x="age",
    y="bmi",
    z="charges",
).update_traces(marker_size=3, marker_opacity=.5)
fig.show()

In [None]:
inputs, targets = non_smoker_df[["bmi"]], non_smoker_df.charges
model = model.fit(inputs, targets)

In [None]:
model.coef_, model.intercept_

In [None]:
predictions = model.predict(inputs)

In [None]:
smoker_values = {'no': 0, 'yes': 1}
medical_df["smoker_code"] = medical_df.smoker.map(smoker_values)

In [None]:
medical_df.charges.corr(medical_df["smoker_code"])

In [None]:
sex_codes = {'female': 0, 'male': 1}
medical_df["sex_code"] = medical_df.sex.map(sex_codes)

In [None]:
medical_df.charges.corr(medical_df["sex_code"])

In [None]:
def rmse(targets, predictions):
    """Return the root-mean-squared error between two equally long sequences.

    Args:
        targets: observed values (array-like of numbers).
        predictions: predicted values, aligned position by position with `targets`.

    Returns:
        The square root of the mean of the squared differences.
    """
    # Convert to plain arrays so the subtraction is positional, not index-aligned.
    errors = np.asarray(targets, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))


# Linear model on the numeric columns plus the two binary codes, scored in-sample.
inputs, targets = medical_df[["age", "bmi","children", "smoker_code", "sex_code"]], medical_df["charges"]
model = LinearRegression().fit(inputs, targets)
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
loss

In [None]:
sns.barplot(medical_df, x="sex", y="charges")

In [None]:
sns.barplot(medical_df, x="region", y="charges")

In [None]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit(medical_df[["region"]])
enc.categories_

In [None]:
one_hot = enc.transform(medical_df[["region"]]).toarray()
one_hot

In [None]:
# Take the column labels from the fitted encoder instead of hard-coding them,
# so each one-hot column is guaranteed to carry the name of the category it encodes.
region_cols = enc.categories_[0].tolist()
medical_df[region_cols] = one_hot
medical_df

In [None]:
# Full feature set: numeric columns, the two binary codes and the one-hot region columns.
input_cols = [
    'age', 'bmi', 'children',
    'smoker_code', 'sex_code',
    'northeast', 'northwest', 'southeast', 'southwest',
]
inputs, targets = medical_df[input_cols], medical_df["charges"]
# Fit and score in-sample.
model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
loss

In [None]:
# One row per input column, followed by a final "Sabit" row holding the intercept.
feature_names = np.append(input_cols, "Sabit")
feature_weights = np.append(model.coef_, model.intercept_)
weights_df = pd.DataFrame({'feature': feature_names, 'weight': feature_weights})
weights_df

In [None]:
from sklearn.preprocessing import StandardScaler
# Columns that get standardised.
numeric_cols = ["age", "bmi", "children"]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row of medical_df, i.e. before any train/test split.
scaler.fit(medical_df[numeric_cols])

In [None]:
# Per-column means learned by the scaler.
scaler.mean_

In [None]:
# Per-column variances learned by the scaler.
scaler.var_

In [None]:
# Standardised version of the numeric columns, then displayed.
scaled_inputs = scaler.transform(medical_df[numeric_cols])
scaled_inputs

In [None]:
cat_cols = ["smoker_code", "sex_code", 'northeast', 'northwest', 'southeast', 'southwest']
categorical_data = medical_df[cat_cols].values

In [None]:
inputs = np.concatenate((scaled_inputs, categorical_data), axis=1)
targets = medical_df.charges

model = LinearRegression().fit(inputs, targets)

predictions = model.predict(inputs)

loss = rmse(targets, predictions)
loss

In [None]:
weights_df = pd.DataFrame({
  'feature': np.append(numeric_cols + cat_cols, "Sabit"),
  'weight': np.append(model.coef_, model.intercept_)
})
weights_df.sort_values("weight", ascending=False)

In [None]:
scaled_inputs, categorical_data

In [None]:
from sklearn.model_selection import train_test_split

# Split the rows first and fit the scaler on the training rows only, so the
# held-out rows never influence the scaling statistics (no test-set leakage).
# The same test_size / random_state on the same number of rows selects the same rows
# as splitting pre-built arrays would.
train_df, test_df = train_test_split(medical_df, test_size=0.1, random_state=42)
split_scaler = StandardScaler().fit(train_df[numeric_cols])


def _design_matrix(frame):
    """Return the scaled numeric columns of `frame` followed by its encoded categorical columns."""
    scaled_part = split_scaler.transform(frame[numeric_cols])
    return np.concatenate((scaled_part, frame[cat_cols].values), axis=1)


inputs_train, inputs_test = _design_matrix(train_df), _design_matrix(test_df)
targets_train, targets_test = train_df.charges, test_df.charges

In [None]:
# Train on the training split only, then score on the held-out split.
model = LinearRegression()
model.fit(inputs_train, targets_train)
predictions_test = model.predict(inputs_test)
loss = rmse(targets_test, predictions_test)
loss