# ridge regression 

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

data = pd.read_csv('healthcare-dataset-stroke-data.csv')

data = data.drop(columns=['id'])
data = pd.get_dummies(data, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], drop_first=True)
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

X = data.drop(columns=['stroke'])
y = data['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

alphas = [0.01, 0.1, 1, 10, 100, 1000]
coefficients = []
r2_scores = []

# train
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    
    coefficients.append(ridge.coef_)
    r2_scores.append(r2_score(y_test, ridge.predict(X_test)))
    
    print(f"Alpha: {alpha}")
    print("Coefficients:", ridge.coef_)
    print("R-squared:", r2_scores[-1])
    print("")

# show as a table
results = pd.DataFrame({
    'Alpha': alphas,
    'R-squared': r2_scores,
    'Coefficients': coefficients
})

print(results)

Alpha: 0.01
Coefficients: [ 0.00302191  0.0320341   0.04845694  0.00025417 -0.00044898 -0.00205889
  0.         -0.03638937  0.03611166  0.01588019 -0.0050418   0.06352301
  0.00333902 -0.00434043 -0.00569536  0.00132025]
R-squared: 0.0952680234658645

Alpha: 0.1
Coefficients: [ 0.00302165  0.03202767  0.04843737  0.0002542  -0.00044932 -0.0020563
  0.         -0.03638933  0.03591851  0.01585936 -0.00505614  0.06347435
  0.00333901 -0.00434399 -0.0056999   0.00131377]
R-squared: 0.09526722882446437

Alpha: 1
Coefficients: [ 0.00301915  0.03196313  0.04824203  0.00025453 -0.00045258 -0.00203082
  0.         -0.03638721  0.03408653  0.01565658 -0.0051951   0.06299871
  0.00333864 -0.0043786  -0.00574451  0.00125065]
R-squared: 0.09525905577054317

Alpha: 10
Coefficients: [ 0.00299783  0.0313096   0.0463431   0.00025756 -0.00048154 -0.00181078
  0.         -0.03624876  0.0222215   0.01401231 -0.0062681   0.0590134
  0.00331894 -0.00464524 -0.00612559  0.00074059]
R-squared: 0.095163164515

# quadratic reg

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

data = pd.read_csv('healthcare-dataset-stroke-data.csv')

X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

categorical_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status']
numerical_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())]), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ]
)

poly = PolynomialFeatures(degree=2)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('linear', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train
model.fit(X_train, y_train)

train_r_squared = model.score(X_train, y_train)
print(f'R-squared value on the training set: {train_r_squared:.4f}')

R-squared value on the training set: 0.1077
