In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [144]:
df = pd.read_csv('Preprocessing_V1.csv')

In [145]:
df.shape

(4000, 11)

What is the total number of missing or unknown values in the column Gender?

In [146]:
df['Gender'].isna().sum()


np.int64(0)

In [147]:
df['Gender'].value_counts(dropna=False)


Gender
Female     2366
Male       1627
Unknown       7
Name: count, dtype: int64

In [148]:
unknown_count = df['Gender'].isin(['Unknown', 'unknown', '?']).sum()
unknown_count


np.int64(7)

What is the total number of missing or unknown values in the column GlucoseLevel?

In [149]:
print(df['GlucoseLevel'].unique())

[ 54.6 108.8  64.1 ... 218.9  66.3 168.5]


In [150]:
df['GlucoseLevel'].isna().sum()


np.int64(0)

In [151]:
missing_or_unknown = df['GlucoseLevel'].replace(
    ['Unknown', 'unknown', '?', '', 'NA', 'N/A', 'na', 'null', 'NULL'], 
    np.nan
).isna().sum()

print("Total missing or unknown GlucoseLevel values:", missing_or_unknown)

Total missing or unknown GlucoseLevel values: 0


What is the total number of missing or unknown values in the column LivesIn?

In [152]:
print(df['LivesIn'].unique())

['City' 'Village' 'Unknown']


In [153]:
missing_or_unknown = df['LivesIn'].replace(
    ['Unknown'], 
    np.nan
).isna().sum()

print("Total missing or unknown ", missing_or_unknown)

Total missing or unknown  5


What is the total number of missing or unknown values in the column BMI?

In [154]:
print(df['BMI'].unique())

[35.1 26.7 23.4 27.4 41.6 29.3 37.1 16.1 40.5 15.8 29.9 29.5 40.7 36.6
 31.5  nan 12.1 25.5 22.3 27.1 44.7 26.1 18.8 18.7 24.8 17.  37.6 20.6
 29.  56.2 30.7 25.3 23.  27.2 19.2 31.6 24.6 27.  24.5 18.2 52.  32.3
 42.7 30.  24.3 24.2 25.7 36.7 46.4 48.3 20.9 24.7 23.6 26.5 39.4 18.4
 25.6 25.9 54.6 31.9 14.6 38.7 23.7 27.3 29.2 39.7 30.1 28.1 35.7 14.3
 30.4 22.2 35.  44.5 36.3 25.2 26.6 31.4 36.8 25.8 38.4 43.2 20.4 30.6
 33.8 34.  26.2 29.6 30.2 22.9 38.9 16.3 23.3 25.1 34.1 45.7 37.3 26.4
 40.9 31.1 17.7 27.5 19.9 32.  35.9 32.1 24.9 23.8 18.  20.7 27.7 22.6
 13.1 19.4 28.5 28.8 21.7 19.6 27.8 41.  41.8 35.2 44.4 42.6 15.7 52.8
 23.1 38.5 22.7 18.3 42.3 43.4 51.5 24.  28.7 23.9 37.9 32.6 35.6 34.7
 28.3 33.2 32.5 44.1 34.2 22.  33.3 16.8 35.4 20.1 26.3 37.5 33.1 21.2
 33.  33.6 30.3 26.  34.8 31.8 42.4 25.4 23.2 19.3 27.9 36.1 43.  16.9
 20.5 33.9 28.2 24.1 41.1 32.2 26.8 30.8 22.5 29.7 40.  34.5 28.  37.7
 19.8 28.4 34.3 20.8 16.2 17.5 36.9 19.5 31.7 34.4 29.1 39.3 35.5 21.
 31.  2

In [155]:
unknown = df['BMI'].replace(['nan'], np.nan).isna().sum()
print(unknown)

149


In [156]:
df['BMI'].isna().sum()

np.int64(149)

What is the total number of missing or unknown values in the column SmokingStatus?

In [157]:
print(df['SmokingStatus'].unique())

['never smoked' 'smokes' 'Unknown' 'formerly smoked']


In [158]:
value = df['SmokingStatus'].replace(['Unknown'], np.nan).isna().sum()
print(value)

1204


What is the mean value of the BMI in the dataset? Ignore the missing values if any.

In [159]:
df['BMI'].mean()

np.float64(28.857958971695663)

How many people live in city, smoked at least once in life and had a heartattack? Ignore records/rows with any missing values.

In [160]:
print(df['SmokingStatus'].unique())
print(df['HeartAttack'].unique())
print(df['LivesIn'].unique())

['never smoked' 'smokes' 'Unknown' 'formerly smoked']
['No' 'Yes']
['City' 'Village' 'Unknown']


In [161]:
cond = df[(df['LivesIn'] == 'City')&(df['HeartAttack'] == 'Yes')&(df['SmokingStatus'].isin(['smokes', 'formerly smoked']))]
count = cond.shape[0]
print(count)

52


In [162]:
df_clean = df.dropna()

# A: female, no tension, no heart disease, never married
A = df_clean[
    (df_clean["Gender"] == "Female") &
    (df_clean["HasTension"] == "No") &
    (df_clean["HeartAttack"] == "No") &
    (df_clean["NeverMarried"] == "Yes")
].shape[0]

# B: female, no tension, no heart disease, married or previously married
B = df_clean[
    (df_clean["Gender"] == "Female") &
    (df_clean["HasTension"] == "No") &
    (df_clean["HeartAttack"] == "No") &
    (df_clean["NeverMarried"] == "No")
].shape[0]

# C: male, no tension, no heart disease, never married
C = df_clean[
    (df_clean["Gender"] == "Male") &
    (df_clean["HasTension"] == "No") &
    (df_clean["HeartAttack"] == "No") &
    (df_clean["NeverMarried"] == "Yes")
].shape[0]

# D: male, tension = yes, heart attack = yes, never married
D = df_clean[
    (df_clean["Gender"] == "Male") &
    (df_clean["HasTension"] == "Yes") &
    (df_clean["HeartAttack"] == "Yes") &
    (df_clean["NeverMarried"] == "Yes")
].shape[0]

A, B, C, D


(1295, 732, 799, 18)

'HeartAttack' is the target column. What is the distribution count of "No" and "Yes" classes?


In [163]:
df.HeartAttack.value_counts()

HeartAttack
No     3806
Yes     194
Name: count, dtype: int64

Divide the data into training and test sets
Keep 30% of the data as test set.

Use random_state as 0

HeartAttack is the target, rest of the columns are the features.

For the label/target vector, replace "Yes" with 1 and "No" with 0.

Divide the dataset into training and test sets keeping target(y) in stratified manner.

In [164]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
df['HeartAttack'] = df.HeartAttack.map({'Yes': 1, 'No': 0})
X = df.drop('HeartAttack', axis=1)
y = df['HeartAttack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

In [165]:
X_train.shape

(2800, 10)

Prepare a data preprocessing pipeline to process features in following order:
Gender: Impute with most frequent then ordinally encode.
Age: Impute with mean then standard scale.
HasTension: Ordinally encode.
AnyHeartDisease:Ordinally encode.
NeverMarried:Ordinally encode.
Occupation: One hot encode.
LivesIn: Impute with most frequent then ordinally encode.
GlucoseLevel: Impute with mean, then min-max scaling.
BMI: Impute with mean, then standard scale.
SmokingStatus: Impute with most frequent, then one hot encode.
Hint: After transformation, your feature matrix must have columns in following order:

Gender
Age
HasTension
AnyHeartDisease
NeverMarried
Occupation_Govt_job
Occupation_Never_worked
Occupation_Private
Occupation_Self-employed
Occupation_children
LivesIn
GlucoseLevel
BMI
SmokingStatus_formerly smoked
SmokingStatus_never smoked
SmokingStatus_smokes
NOTE:

Make sure to preprocess the features in the above order exactly. Answer(s) of later question(s) depend(s) upon correct order of featuring processing.
You may have to use multiple instances of a trasnformer for this question.

In [166]:
X_train_modified = X_train.copy()
X_test_modified = X_test.copy()
X_train_modified['SmokingStatus'] = X_train_modified['SmokingStatus'].replace('Unknown', np.nan)
X_test_modified['SmokingStatus'] = X_test_modified['SmokingStatus'].replace('Unknown', np.nan)

In [167]:
# --- Define Individual Preprocessing Steps ---

# 1. Gender: Impute (most frequent) then Ordinal Encode
gender_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# 2. Age: Impute (mean) then Standard Scale
age_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# 3, 4, 5: HasTension, AnyHeartDisease, NeverMarried: Ordinal Encode
tension_pipe = OrdinalEncoder()
heart_pipe = OrdinalEncoder()
married_pipe = OrdinalEncoder()

# 6. Occupation: One hot encode (handle_unknown='ignore' to avoid errors on new categories)
occupation_pipe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# 7. LivesIn: Impute (most frequent) then Ordinal Encode
livesin_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# 8. GlucoseLevel: Impute (mean) then Min-Max Scale
glucose_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# 9. BMI: Impute (mean) then Standard Scale
bmi_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# 10. SmokingStatus: Impute (most frequent) then One hot encode
smoking_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- Define the ColumnTransformer ---
# The order of the features below dictates the order of columns in the final matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('gender_pipe', gender_pipe, ['Gender']),
        ('age_pipe', age_pipe, ['Age']),
        ('tension_pipe', tension_pipe, ['HasTension']),
        ('heart_pipe', heart_pipe, ['AnyHeartDisease']),
        ('married_pipe', married_pipe, ['NeverMarried']),
        ('occupation_pipe', occupation_pipe, ['Occupation']),
        ('livesin_pipe', livesin_pipe, ['LivesIn']),
        ('glucose_pipe', glucose_pipe, ['GlucoseLevel']),
        ('bmi_pipe', bmi_pipe, ['BMI']),
        ('smoking_pipe', smoking_pipe, ['SmokingStatus'])
    ],
    remainder='drop' # Ensures no unintended columns are passed through
)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train_modified)

# The shape of the processed training feature matrix is (2800, 16).

Calculate the shape of the feature matrix of training dataset.

In [168]:
X_train_processed.shape

(2800, 16)

What is the mean of the transformed test data (features only)?

In [169]:
# Transform the test set using the *already fitted* preprocessor
X_test_processed = preprocessor.transform(X_test)

# Compute the mean of ALL values in the transformed test matrix
mean_test = X_test_processed.mean()

print("Mean of transformed test data:", mean_test)


Mean of transformed test data: 0.26833482647759943


If you eliminate 1 feature with recursive feature elimination, which feature will be eliminated?
Type the index of the eliminated feature (index starts from 0).
Use LogisticRegression model with random state as 1729 and rest of the parameters with default values, as an estimator.
Use processed training data.

In [170]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE



In [171]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Logistic Regression model
model = LogisticRegression(random_state=1729)

# RFE: eliminate 1 feature
rfe = RFE(estimator=model, n_features_to_select=X_train.shape[1] - 1)

# Fit RFE
rfe.fit(X_train_processed, y_train)

# Get eliminated feature index (the one marked False)
eliminated_feature_index = list(rfe.support_)

print("Eliminated feature index:", eliminated_feature_index)


Eliminated feature index: [np.True_, np.True_, np.True_, np.True_, np.True_, np.True_, np.False_, np.True_, np.False_, np.False_, np.False_, np.True_, np.False_, np.False_, np.True_, np.False_]


In [172]:
df = pd.read_csv('ModelBuilding_V1.csv')

Load the dataset.

The last column is the target column.
Last 30% rows of the dataset constitute test set and remaining rows form the training set.
Do not shuffle the dataset while splitting
You must have to use only training set to train all the estimator in questions below.
First row of the file has column names/ids, and it has no index column.

Instantiate a perceptron classifier that with following parameters:

random_state = 1729
learning rate = 1
Train for appropriate number of iterations
Do not shuffle the dataset for each iteration.
Include the intercept (bias) term.
Use 10% of the data for validation fraction.
Do not apply regularization.
Set warm start to true.
Hint: one iteration of training indicates going over each sample exactly once.

Train the classifier on the training data.

Q.Train the perceptron classifier for 5 iterations. What is value of bias (intercept) after 5th iteration?

In [173]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
X = df.drop('HeartAttack', axis=1)
y = df['HeartAttack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [174]:
clf = Perceptron(random_state=1729, eta0=1.0, shuffle=False, fit_intercept=True,  validation_fraction=0.1, penalty=None, warm_start=True, max_iter=5)

In [175]:
clf.fit(X_train.values, y_train)



0,1,2
,penalty,
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,5
,tol,0.001
,shuffle,False
,verbose,0
,eta0,1.0
,n_jobs,


In [176]:
clf.intercept_[0]

np.float64(-3.0)

In continuation of the previous question, compute precision accurate upto 2 decimal places on training data for positive class (i.e. class value 1), after 5 iterations.

In [177]:
from sklearn.metrics import precision_score

In [178]:
y_pred=clf.predict(X_train.values)

In [179]:
precision_score(y_train, y_pred, pos_label=1)

0.16938775510204082

Train (on training data only) logistic regression using SGDClassifier. Use the following parameters:
Choose appropriate loss value to obtain logistic regression
penalty='l2',
eta0=0.001,
alpha=0,
learning_rate='constant'
random_state=1729.
warm_start = True
Train the classifier for 5 iterations and note the value of the loss in each iteration. What will be the loss value after second iteration? Answer upto three decimal places.

Note: Set the remaining parameters, if any, accordingly to be able to get the loss value after second iteration. Also note that the classifier has to be trained for 5 iterations.

In [180]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

In [181]:
clf = SGDClassifier(loss="log_loss", penalty='l2', eta0=0.001, alpha=0, learning_rate="constant", random_state=1729, warm_start=True)

In [184]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss
clf = SGDClassifier(loss="log_loss", max_iter=1, penalty='l2', eta0=0.001, alpha=0, learning_rate="constant", random_state=1729, warm_start=True)
losses = []
for i in range(5):  
    clf.fit(X_train, y_train)
    y_pred=clf.predict_proba(X_train)                                                                                                                                                                                       
    current_loss = log_loss(y_train, y_pred)
    losses.append(current_loss)



In [185]:
losses

[0.25872807031502587,
 0.21317531419837446,
 0.19828965864862694,
 0.190693740691189,
 0.18583079635693078]

Use GridSearchCV with SGDClassifier. Following are the classifier's parameters:
loss = 'log_loss'
learning_rate = 'constant'
random_state = 1729
Following are parameters to examine:

alpha = [0.0001, 0.0005, 0.001, 0.005]
eta0 = [0.01, 0.05, 0.1, 0.5]
What are the best values of alpha and eta0 respectively?

In [186]:
from sklearn.model_selection import GridSearchCV, train_test_split

In [197]:
sgd = SGDClassifier(loss="log_loss", learning_rate="constant", random_state=1729)
param_grid = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005],
    'eta0': [0.01, 0.05, 0.1, 0.5]
}
grid = GridSearchCV(sgd, param_grid,n_jobs=-1)
grid.fit(X_train, y_train)

0,1,2
,estimator,SGDClassifier...om_state=1729)
,param_grid,"{'alpha': [0.0001, 0.0005, ...], 'eta0': [0.01, 0.05, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [198]:
grid.best_params_

{'alpha': 0.0001, 'eta0': 0.05}

Create a new estimator using SGDClassifier that uses the best parameters obtained in Gridsearch earlier ((learning rate to be constant, random_state to be '1729' and use appropriate loss for logistic regression)) and set the weight of class 0 to be 0.1 and the weight of class 1 to be 2. How many samples of class 1 from the test set are correctly predicted by this estimator?

In [199]:
sgdnew = SGDClassifier(loss="log_loss", learning_rate="constant", random_state=1729, alpha=0.0001, eta0=0.01, class_weight={0:0.1, 1:2} )


In [200]:
sgdnew.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [202]:
y_pred=sgdnew.predict(X_test)

In [203]:
correct_class1 = np.sum((y_test == 1) & (y_pred == 1))
print(correct_class1)

36


Fit an SVM classifier with following parameters:
kernel='rbf'
decision_function_shape='ovr'
random_state=1729
C=1
Train the model on training data, and print the confusion matrix on test data.

In [204]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [205]:
svm = SVC(kernel='rbf', decision_function_shape='ovr', random_state=1729, C=1)
svm.fit(X_train, y_train)
y_pred=svm.predict(X_test)
cm=confusion_matrix(y_test, y_pred)
print(cm)

[[1149    0]
 [  51    0]]


Instructions for Q8-10
Train a Decision Tree Classifier with the following properties:

criterion = 'entropy',
splitter = 'random',
min_samples_split = 4,
min_impurity_decrease = 0.0001,
random_state = 1729

Q.8 [MCQ][5 Marks] What is the resultant depth of the tree?
Q.9 [MCQ][Marks 5] How many nodes are there in the tree?
Q.10 [NAT][5 Marks] What is the value of entropy at the left child of root?

In [206]:
from sklearn.tree import DecisionTreeClassifier


In [207]:
dt=DecisionTreeClassifier(criterion='entropy', splitter='random', min_samples_split=4, min_impurity_decrease=0.0001, random_state=1729)
dt.fit(X_train, y_train)


0,1,2
,criterion,'entropy'
,splitter,'random'
,max_depth,
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1729
,max_leaf_nodes,
,min_impurity_decrease,0.0001


In [208]:
depth=dt.get_depth()
print(depth)

19


In [209]:
node = dt.tree_.node_count
print(node)

537


Out of DecisionTreeClassifier, KNeighborsClassifier and LogisticRegression, which one performs the best when used as base estimator in BaggingClassifier on the test data in terms of accuracy score when 20 base estimators are used ?
(Use random state 1729 for BaggingClassifier, DecisionTreeClassifier and LogisticRegression)

The metric for best performance will be the lowest 'absolute' difference in the train and test score.

In [210]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [212]:
dt = DecisionTreeClassifier(random_state=1729)
bag1 = BaggingClassifier(dt, n_estimators=20, random_state=1729)
bag1.fit(X_train, y_train)
train_dt = bag1.score(X_train, y_train)
test_dt = bag1.score(X_test, y_test)
result = abs(train_dt-test_dt)
print(result)

0.042619047619047556


In [214]:
knn = KNeighborsClassifier()
bag2 = BaggingClassifier(knn, n_estimators=20, random_state=1729)
bag2.fit(X_train, y_train)
train_knn = bag2.score(X_train, y_train)
test_knn = bag2.score(X_test, y_test)
result2 = abs(train_knn-test_knn)
print(result2)

0.001547619047619131


In [215]:
lr = LogisticRegression(random_state=1729)
bag3 = BaggingClassifier(lr, n_estimators=20, random_state=1729)
bag3.fit(X_train, y_train)
train_lr = bag3.score(X_train, y_train)
test_lr = bag3.score(X_test, y_test)
result3 = abs(train_lr-test_lr)
print(result3)

0.009404761904761916


When the above three individual classifiers (with same settings) are used in VotingClassifier, how much absolute difference do we obtain in train and test scores? Enter your answer correct upto 4 decimal places.

In [216]:
from sklearn.ensemble import VotingClassifier

In [222]:
dt = DecisionTreeClassifier(random_state=1729)
knn = KNeighborsClassifier()
lr = LogisticRegression(random_state=1729, max_iter=5000)
vclf = VotingClassifier(
    estimators=[
        ('dt', dt),
        ('knn', knn),
        ('lr', lr)
    ]
)

# Train
vclf.fit(X_train, y_train)

# Scores
train_score = vclf.score(X_train, y_train)
test_score = vclf.score(X_test, y_test)

# Absolute difference
abs_diff = abs(train_score - test_score)
print(abs_diff)

0.002619047619047632
