## import libraries

In [2]:
import numpy as np 
import pandas as pd 

from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('dataset.csv')

### *some insight and details of database*

In [4]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [5]:
df.shape

(43400, 12)

### *we must check if there exist any missing values or duplicated records*

In [6]:
df.isnull().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [7]:
df.duplicated().sum()

0


- We drop unused column id
- We drop the feature  *'smoking_status'*   becuase most of the records values are null 
- We will check if the data is imbalanced and balance it later 
- We replace bmi missing value  with mean, mode, or predict it by a model based on the corr bitween  *'storke'*  and  *'bmi'* <b> (after encoding and scaling)

---


In [8]:
df = df.drop(columns=['id', 'smoking_status'])

In [9]:
print('The percentage of stroke records is', df[df['stroke'] == 1].shape[0] / df.shape[0] * 100)

The percentage of stroke records is 1.804147465437788


## *encode the Ordinal and Nominal  columns*
---
- Nominal columns via onehot encoding 

In [10]:
df['work_type'].unique().tolist()

['children', 'Private', 'Never_worked', 'Self-employed', 'Govt_job']

In [11]:
categorical_columns = ['gender','work_type','Residence_type']

one_hot_encoded = pd.get_dummies(df[categorical_columns])
df = pd.concat([df.iloc[:, :-1], one_hot_encoded, df.iloc[:, -1]], axis=1)
df = df.drop(columns=categorical_columns)

- Ordinal columns via label encoding

In [12]:
label_encoder = LabelEncoder()
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])

## *get profile report of processed dataset*

In [13]:
profile = ProfileReport(df, title='CVA detection')
profile.to_file('report.html')

Summarize dataset: 100%|████████████████████████████████████████████████████| 35/35 [00:04<00:00,  8.33it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.87s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.70it/s]
Export report to file: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 250.02it/s]


<b>According to the distribution of avg_glucose_level and bmi, we apply the standard scaler on these two features but for age we apply minmax scaler

In [14]:
std_scaler = StandardScaler()
df[['bmi', 'avg_glucose_level']] = std_scaler.fit_transform(df[['bmi', 'avg_glucose_level']])

In [15]:
mm_scaler = MinMaxScaler()
df['age'] = mm_scaler.fit_transform(df[['age']])

<b> fill the *'bmi'* missing values by linear regression model

In [16]:
target_column = 'bmi'

top_features = df.corr()[target_column].abs().nlargest(8).index[1:]

data_for_imputation = df[top_features].copy()
data_for_imputation[target_column] = df[target_column]

train_data = data_for_imputation.dropna(subset=[target_column])
test_data = data_for_imputation[data_for_imputation[target_column].isnull()]

model = LinearRegression().fit(train_data[top_features], train_data[target_column])
df.loc[df[target_column].isnull(), target_column] = model.predict(test_data[top_features])

## *drop columns has little corr with stroke*

In [17]:
cols = ['Residence_type_Rural', 'Residence_type_Urban', 'gender_Female','gender_Male', 'gender_Other','work_type_Govt_job' , 'work_type_Never_worked', 'work_type_Private']
df = df.drop(columns=cols)

In [18]:
df

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,work_type_Self-employed,work_type_children,stroke
0,0.035645,0,0,0,-0.217176,-1.364882,0,1,0
1,0.707031,1,0,1,-0.383258,1.363586,0,0,0
2,0.096680,0,0,0,0.148621,-1.416363,0,0,0
3,0.853516,0,0,1,-0.822123,0.938871,0,0,0
4,0.169922,0,0,0,1.317458,-1.223311,0,0,0
...,...,...,...,...,...,...,...,...,...
43395,0.121094,0,0,0,-1.063359,-1.055999,0,1,0
43396,0.682617,0,0,1,2.531294,3.448547,0,0,0
43397,1.000000,1,0,1,-0.290939,0.037962,0,0,0
43398,0.487305,0,0,1,-0.123465,0.591378,0,0,0


In [19]:
X = df.drop('stroke', axis=1)
y = df['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

balancing_methods = {
    'No Balancing': None,
    'Random OverSampling': RandomOverSampler(random_state=42),
    'Random UnderSampling': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'SMOTE-ENN': SMOTEENN(random_state=42)
    
}

for method_name, balancing_method in balancing_methods.items():
    print(f"Balancing Method: {method_name}")
    
    X_train_balanced, y_train_balanced = balancing_method.fit_resample(X_train, y_train) if balancing_method else (X_train, y_train)
    
    
    svm_model = SVC(kernel='rbf', random_state=42)
    svm_model.fit(X_train_balanced, y_train_balanced)
    y_pred = svm_model.predict(X_test)
    
    # Set zero_division to 'warn' or 'ignore' to handle the warning
    report = classification_report(y_test, y_pred, zero_division=1)
    print(report)
    print('------------------------------------------------------------')

Balancing Method: No Balancing
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4262
           1       1.00      0.00      0.00        78

    accuracy                           0.98      4340
   macro avg       0.99      0.50      0.50      4340
weighted avg       0.98      0.98      0.97      4340

------------------------------------------------------------
Balancing Method: Random OverSampling


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.svm import SVC

# Assuming df, train_test_split, and other necessary imports are done before this point

X = df.drop('stroke', axis=1)
y = df['stroke']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Define the balancing methods
balancing_methods = {
    'No Balancing': None,
    'Random OverSampling': RandomOverSampler(random_state=42),
    'Random UnderSampling': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'SMOTE-ENN': SMOTEENN(random_state=42)
}

# Apply k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for method_name, balancing_method in balancing_methods.items():
    print(f"Balancing Method: {method_name}")

    # Initialize an empty list to store classification reports from each fold
    reports = []

    # Perform k-fold cross-validation
    for train_index, val_index in kfold.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Apply balancing method
        X_train_fold_balanced, y_train_fold_balanced = (
            balancing_method.fit_resample(X_train_fold, y_train_fold)
            if balancing_method
            else (X_train_fold, y_train_fold)
        )

        # Initialize and train the SVM model
        svm_model = SVC(kernel='rbf', random_state=42)
        svm_model.fit(X_train_fold_balanced, y_train_fold_balanced)

        # Make predictions on the validation set
        y_val_pred = svm_model.predict(X_val_fold)

        # Calculate and store the classification report for this fold
        report_fold = classification_report(y_val_fold, y_val_pred, zero_division=1)
        reports.append(report_fold)

    # Display the average classification report across all folds
    avg_report = "\n\nAverage Classification Report Across Folds:\n\n" + "\n\n".join(reports)
    print(avg_report)
    print('------------------------------------------------------------')
