In [41]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [42]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Load the raw stroke dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw_data/stroke_dataset.csv')

# Identifier columns are removed so they never reach the model as features.
df = df.drop(['Patient ID', 'Patient Name'], axis=1)

# "Cholesterol Levels" holds two readings in one string. Split on the comma into
# two fragments, then keep the text after the colon in each and convert it to a
# number.
# NOTE(review): assumes exactly two comma-separated fragments per row, with the
# HDL reading first and the LDL reading second — confirm against the CSV.
df[['HDL', 'LDL']] = df["Cholesterol Levels"].str.split(",", expand=True)
df['HDL'] = pd.to_numeric(df['HDL'].str.split(':', expand=True)[1])
df['LDL'] = pd.to_numeric(df['LDL'].str.split(':', expand=True)[1])

# "Blood Pressure Levels" is split once on '/' and both halves are reused:
# piece 0 becomes 'Upper BP', piece 1 becomes 'Lower BP'.
bp_parts = df['Blood Pressure Levels'].str.split('/', expand=True)
df['Upper BP'] = pd.to_numeric(bp_parts[0])
df['Lower BP'] = pd.to_numeric(bp_parts[1])


# The two compound source columns are now redundant; 'Symptoms' is dropped as
# well and is therefore not used as a feature.
df = df.drop(['Blood Pressure Levels', 'Cholesterol Levels','Symptoms'], axis=1)

# Columns replaced by integer codes. This list includes the 'Diagnosis' target.
# NOTE(review): the name implies two categories per column — confirm in the data.
binary_cols = ['Gender', 'Marital Status', 'Family History of Stroke','Residence Type','Diagnosis']
for col in binary_cols:
    # A fresh LabelEncoder per column; the fitted encoder is not kept, so the
    # code -> original label mapping cannot be recovered afterwards.
    df[col] = LabelEncoder().fit_transform(df[col])

# Columns expanded into indicator columns. drop='first' omits one category per
# feature, and sparse_output=False makes fit_transform return a dense array.
onehot_cols = ['Smoking Status', 'Alcohol Intake', 'Physical Activity','Dietary Habits','Work Type']
encoder = OneHotEncoder(drop='first', sparse_output=False)

encoded_data = encoder.fit_transform(df[onehot_cols])
# Carry over df's index: pd.concat(axis=1) aligns rows by index label, so a
# default RangeIndex here would misalign (and NaN-pad) rows whenever df's index
# is not exactly 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(onehot_cols),
    index=df.index,
)

# Swap the original categorical columns for their indicator columns.
df = df.drop(columns=onehot_cols)
df = pd.concat([df, encoded_df], axis=1)

# Report how many rows carry each encoded 'Diagnosis' value.
print(df['Diagnosis'].value_counts())

# Target is the 'Diagnosis' column; features are every remaining column.
y = df['Diagnosis']
X = df.drop(columns=['Diagnosis'])

# Hold out 25% of the rows for testing. The split is seeded and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression (seeded, otherwise default settings) on the scaled
# training features.
lr_classifier = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predicted labels for the held-out test rows.
y_pred = lr_classifier.predict(X_test_scaled)


# Score the predictions against the true test labels; each scorer is called
# with its default arguments and stored under its report key, in this order.
metrics = {
    label: scorer(y_test, y_pred)
    for label, scorer in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

print(metrics)

{'accuracy': 0.5021333333333333, 'precision': 0.5, 'recall': 0.4724156400642742, 'f1_score': 0.4858165794546957}
