In [29]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())
])

# 3. Define parameter grid
param_grid = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# 4. GridSearchCV
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

# 5. Results
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


Check for 3 fold, 5 fold and 7 fold cross validation

Replace classifier, SVC with RandomForestClassifier and LogisticRegression, Perceptron, knn .

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace Gridsearch with randomnsearch function.

Relplace with with your own csv dataset using code below:

In [30]:
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter distribution
param_dist = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5]
}

# Try 3, 5, 7 fold CV
for cv in [3, 5, 7]:
    print(f"\n--- RandomForest (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- RandomForest (CV=3) ---
Best Params: {'classifier__n_estimators': 10, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}
Best CV Score: 0.91
Test Accuracy: 0.90

--- RandomForest (CV=5) ---
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}
Best CV Score: 0.91
Test Accuracy: 0.93

--- RandomForest (CV=7) ---
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}
Best CV Score: 0.90
Test Accuracy: 0.93


In [31]:
from sklearn.linear_model import LogisticRegression

# Reuse X_train, y_train from above
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

param_dist = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'saga']
}

for cv in [3, 5, 7]:
    print(f"\n--- LogisticRegression (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- LogisticRegression (CV=3) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 1}
Best CV Score: 0.91
Test Accuracy: 0.90

--- LogisticRegression (CV=5) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 1}
Best CV Score: 0.92
Test Accuracy: 0.90

--- LogisticRegression (CV=7) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 1}
Best CV Score: 0.92
Test Accuracy: 0.90


In [32]:
from sklearn.linear_model import Perceptron

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', Perceptron(random_state=42))
])

param_dist = {
    'classifier__penalty': ['l2', 'l1', None],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__max_iter': [500, 1000]
}

for cv in [3, 5, 7]:
    print(f"\n--- Perceptron (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- Perceptron (CV=3) ---
Best Params: {'classifier__penalty': 'l1', 'classifier__max_iter': 500, 'classifier__alpha': 0.0001}
Best CV Score: 0.83
Test Accuracy: 0.77

--- Perceptron (CV=5) ---
Best Params: {'classifier__penalty': 'l2', 'classifier__max_iter': 500, 'classifier__alpha': 0.0001}
Best CV Score: 0.83
Test Accuracy: 0.90

--- Perceptron (CV=7) ---
Best Params: {'classifier__penalty': 'l2', 'classifier__max_iter': 500, 'classifier__alpha': 0.0001}
Best CV Score: 0.86
Test Accuracy: 0.90


In [33]:
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', KNeighborsClassifier())
])

param_dist = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

for cv in [3, 5, 7]:
    print(f"\n--- KNN (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- KNN (CV=3) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 2, 'classifier__n_neighbors': 7}
Best CV Score: 0.93
Test Accuracy: 0.87

--- KNN (CV=5) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 2, 'classifier__n_neighbors': 7}
Best CV Score: 0.93
Test Accuracy: 0.87

--- KNN (CV=7) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 2, 'classifier__n_neighbors': 7}
Best CV Score: 0.93
Test Accuracy: 0.87


In [34]:
import pandas as pd
from google.colab import files
import io

# Step 1: Upload the file (you'll be prompted to choose a file)
uploaded = files.upload()

# Step 2: Extract the filename (this assumes you uploaded one file)
filename = list(uploaded.keys())[0]

# Step 3: Read the CSV into a DataFrame
data = pd.read_csv(io.BytesIO(uploaded[filename]))

# Step 4: Split into features (X) and target (y)
X = data.drop("stroke", axis=1)
y = data["stroke"]

Saving healthcare-dataset-stroke-data.csv to healthcare-dataset-stroke-data (4).csv


In [35]:
X

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked


In [36]:
y

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
...,...
5105,0
5106,0
5107,0
5108,0


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [38]:
data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [39]:
average_bmi_male = data[data['gender'] == 'Male']['bmi'].mean()
average_bmi_female = data[data['gender'] == 'Female']['bmi'].mean()

In [40]:
average_bmi_male

np.float64(28.64793635007459)

In [41]:
average_bmi_female

np.float64(29.065757680358992)

In [42]:
data['bmi'] = data.apply(
    lambda row: average_bmi_male if row['gender'] == 'Male' and pd.isna(row['bmi']) else row['bmi'], axis=1)

In [43]:
data['bmi'] = data.apply(
    lambda row: average_bmi_female if row['gender'] == 'Female' and pd.isna(row['bmi']) else row['bmi'], axis=1)

In [44]:
data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0


In [45]:
data['bmi']
X
print(X.shape,y.shape)

(5110, 11) (5110,)


In [46]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Load your dataset
#df = pd.read_csv("healthcare-dataset-stroke-data.csv")
#df = df.drop(columns=['id'])  # drop id

uploaded = files.upload()

# Step 2: Extract the filename (this assumes you uploaded one file)
filename = list(uploaded.keys())[0]

# Step 3: Read the CSV into a DataFrame
df = pd.read_csv(io.BytesIO(uploaded[filename]))
df = df.drop(columns=['id'])  # drop id

# Define feature groups
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_cols = ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease']

X = df.drop(columns=['stroke'])

# Preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Column transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Fit and transform
X_encoded = preprocessor.fit_transform(X)

# Get feature names
encoded_cat_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(encoded_cat_names)

# View transformed DataFrame
encoded_df = pd.DataFrame(X_encoded, columns=all_feature_names)
print(encoded_df.head())




Saving healthcare-dataset-stroke-data.csv to healthcare-dataset-stroke-data (5).csv
        age  avg_glucose_level           bmi  hypertension  heart_disease  \
0  1.051434           2.706375  1.001234e+00     -0.328602       4.185032   
1  0.786070           2.121559  4.615554e-16     -0.328602      -0.238947   
2  1.626390          -0.005028  4.685773e-01     -0.328602       4.185032   
3  0.255342           1.437358  7.154182e-01     -0.328602      -0.238947   
4  1.582163           1.501184 -6.357112e-01      3.043196      -0.238947   

   gender_Male  gender_Other  ever_married_Yes  work_type_Never_worked  \
0          1.0           0.0               1.0                     0.0   
1          0.0           0.0               1.0                     0.0   
2          1.0           0.0               1.0                     0.0   
3          0.0           0.0               1.0                     0.0   
4          0.0           0.0               1.0                     0.0   

   work_

In [20]:
X_encoded.shape

(5110, 16)

In [53]:

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter distribution
param_dist = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5]
}

# Try 3, 5, 7 fold CV
for cv in [3, 5, 7]:
    print(f"\n--- RandomForest (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- RandomForest (CV=3) ---
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': 5}
Best CV Score: 0.95
Test Accuracy: 0.94

--- RandomForest (CV=5) ---
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': 5}
Best CV Score: 0.95
Test Accuracy: 0.94

--- RandomForest (CV=7) ---
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': 5}
Best CV Score: 0.95
Test Accuracy: 0.94


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

param_dist = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'saga']
}

for cv in [3, 5, 7]:
    print(f"\n--- LogisticRegression (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- LogisticRegression (CV=3) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 0.01}
Best CV Score: 0.95
Test Accuracy: 0.94

--- LogisticRegression (CV=5) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 0.01}
Best CV Score: 0.95
Test Accuracy: 0.94

--- LogisticRegression (CV=7) ---
Best Params: {'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__C': 0.01}
Best CV Score: 0.95
Test Accuracy: 0.94


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', Perceptron(random_state=42))
])

param_dist = {
    'classifier__penalty': ['l2', 'l1', None],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__max_iter': [500, 1000]
}

for cv in [3, 5, 7]:
    print(f"\n--- Perceptron (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- Perceptron (CV=3) ---
Best Params: {'classifier__penalty': 'l1', 'classifier__max_iter': 500, 'classifier__alpha': 0.0001}
Best CV Score: 0.91
Test Accuracy: 0.94

--- Perceptron (CV=5) ---
Best Params: {'classifier__penalty': 'l1', 'classifier__max_iter': 500, 'classifier__alpha': 0.0001}
Best CV Score: 0.95
Test Accuracy: 0.94

--- Perceptron (CV=7) ---
Best Params: {'classifier__penalty': None, 'classifier__max_iter': 500, 'classifier__alpha': 0.001}
Best CV Score: 0.95
Test Accuracy: 0.94


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', KNeighborsClassifier())
])

param_dist = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

for cv in [3, 5, 7]:
    print(f"\n--- KNN (CV={cv}) ---")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=5, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    print("Best Params:", search.best_params_)
    print("Best CV Score: {:.2f}".format(search.best_score_))
    print("Test Accuracy: {:.2f}".format(search.score(X_test, y_test)))


--- KNN (CV=3) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 1, 'classifier__n_neighbors': 7}
Best CV Score: 0.95
Test Accuracy: 0.94

--- KNN (CV=5) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 1, 'classifier__n_neighbors': 7}
Best CV Score: 0.95
Test Accuracy: 0.94

--- KNN (CV=7) ---
Best Params: {'classifier__weights': 'uniform', 'classifier__p': 1, 'classifier__n_neighbors': 7}
Best CV Score: 0.95
Test Accuracy: 0.94
