# 1.Importing all the required Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# preprocessing
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# 2.Loading the Dataset

In [3]:
# NOTE(review): absolute, machine-specific path - the notebook cannot be re-run elsewhere
# without editing this line.
# NOTE(review): the saved preview of this cell shows an 'Unnamed: 0' column; presumably a
# stored row index - confirm whether it is meant to stay in the frame as a feature.
heart_csv_path = '/Users/ankitrajsingh/Desktop/Dissertation/heart.csv'
df = pd.read_csv(heart_csv_path)
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


# 3.Understanding the data

In [4]:
# Column labels of the frame; `columns_df` is reused by a later cell to re-inspect the
# same columns after encoding.
columns_df = df.columns.tolist()

# For every column, print its name, its distinct values and a separator line.
separator = "-----------------------"
for column in columns_df:
    distinct_values = df[column].unique()
    print("Column Name:", column)
    print("Unique Values:", distinct_values)
    print(separator)
    

Column Name: HeartDisease
Unique Values: ['No' 'Yes']
-----------------------
Column Name: BMI
Unique Values: [16.6  20.34 26.58 ... 62.42 51.46 46.56]
-----------------------
Column Name: Smoking
Unique Values: ['Yes' 'No']
-----------------------
Column Name: AlcoholDrinking
Unique Values: ['No' 'Yes']
-----------------------
Column Name: Stroke
Unique Values: ['No' 'Yes']
-----------------------
Column Name: PhysicalHealth
Unique Values: [ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.]
-----------------------
Column Name: MentalHealth
Unique Values: [30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.]
-----------------------
Column Name: DiffWalking
Unique Values: ['No' 'Yes']
-----------------------
Column Name: Sex
Unique Values: ['Female' 'Male']
-----------------------
Column Name: AgeCategory
Unique Values: ['55-59' '80 or older

In [5]:
duplicate_sum = df.duplicated().sum()
print(duplicate_sum)

18078


In [None]:
df.drop_duplicates(inplace=True)

In [None]:
# list of numerical features
numeric_features = df.select_dtypes(include=[np.number])
numeric_features.columns

In [None]:
# list of categorical features
categorical_features = df.select_dtypes(include=[object])
categorical_features.columns

## 3.1 Descriptive Statistic

In [None]:
# Summary statistics for the numeric columns, one row per feature.
# The first row of `describe()` (the count) is dropped before transposing.
summary_rows = df.describe().iloc[1:]
summary_rows[numeric_features.columns.tolist()].T.style.background_gradient(cmap='Blues')

Looking at the data, we see that the average BMI is about 28.33, but it can be as low as 12.02 or as high as 94.85. People's BMIs vary quite a bit. On average, the score for PhysicalHealth is about 3.37 out of 30, but this varies a lot too. MentalHealth scores, like PhysicalHealth, have a lot of variation with an average score close to 3.90 out of 30. Lastly, people usually sleep about 7.1 hours, but this can be as short as 1 hour or as long as a full day.

# 4.EDA

## 4.1 Univariate Analysis

The purpose of univariate analysis is to examine the details of each feature in the dataset and find patterns, which will be valuable for further analysis of the dataset.


In [None]:
import plotly.express as px

# Frequency of each HeartDisease value.
heart_disease_counts = df['HeartDisease'].value_counts()

# Slice labels and colours are matched to the counts by position.
# NOTE(review): this assumes the first count belongs to the "No" class - confirm.
slice_labels = ['No HeartDisease', 'HeartDisease']
slice_colors = ['green', 'red']

# Interactive pie chart of the class distribution.
fig = px.pie(
    heart_disease_counts,
    values=heart_disease_counts.values,
    names=slice_labels,
    title='Distribution of HeartDisease',
    color_discrete_sequence=slice_colors,
)

# Show the share, the label and the absolute count on every slice.
fig.update_traces(textinfo='percent+label+value')
fig.show()


We can see that the number of people with and without "HeartDisease" isn't even.

In [None]:
# Numeric variables whose distribution is compared between the HeartDisease classes.
variables = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

# One panel per variable, stacked vertically.
fig, axes = plt.subplots(len(variables), 1, figsize=(10, 5 * len(variables)))

# Overlay one density curve per HeartDisease value on each panel.
for var, ax in zip(variables, axes):
    for label in df['HeartDisease'].unique():
        # `fill=True` is the supported spelling of the deprecated `shade=True`.
        sns.kdeplot(df.loc[df['HeartDisease'] == label, var], ax=ax, fill=True, label=label)
    ax.set_title(f'KDE for {var}')
    # Limit the x-axis to the observed range of the variable.
    ax.set_xlim(df[var].min(), df[var].max())
    ax.legend(title='HeartDisease')

plt.tight_layout()
plt.show()


From the KDE plots, we can infer that people with heart disease tend to have higher BMI values, suggesting that being heavier might increase heart disease risk. For PhysicalHealth and MentalHealth, it's hard to tell any difference between people with and without heart disease, meaning these factors might not tell us much about heart disease risk. However, when it comes to sleep, people with heart disease seem to sleep less, hinting that not getting enough sleep might be linked to a higher chance of having heart disease.

In [None]:
# Correlation matrix to see relationships between the numeric variables.
# The frame still holds string columns at this point (they are label-encoded later), and
# `DataFrame.corr()` on such a frame raises in pandas >= 2.0, so the numeric columns are
# selected explicitly.
numeric_only_frame = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_only_frame.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

The heatmap between all pairs of numerical features shows that the correlation between 'HeartDisease' and other features is relatively low, indicating that no single numerical feature strongly predicts heart disease on its own.

In [None]:
# Importing required libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Count plot of every categorical column, laid out on a 3 x 5 grid.
plt.figure(figsize=(25, 15))

# Iterating the frame yields its column labels; slots are numbered from 1.
for slot, feature in enumerate(categorical_features, start=1):
    plt.subplot(3, 5, slot)
    sns.set(palette='Paired')
    sns.set_style("ticks")
    ax = sns.countplot(x=feature, data=df)
    # Slant the category labels so long names do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.show()


The graph primarily focuses on heart disease, with an imbalanced distribution between those with and without heart disease. Many people don't smoke or drink alcohol, and most haven't experienced a stroke, asthma, kidney disease, or skin cancer. There's an even split between male and female participants. Also, the plots depict a variety of age groups and races, with one race (White) being more dominant. While most are not diabetic and engage in physical activities, a few have difficulties walking. Their general health and sleep patterns vary, with most having a typical sleep duration of about 7 hours.

## 4.2 Bivariate visualization

In [None]:
import plotly.express as px

def categorical_feature_func():
    """Build one grouped histogram per categorical column, coloured by HeartDisease.

    Reads the notebook-level `df` and `categorical_features`. The figures are
    returned in column order and are not displayed here.
    """
    return [
        px.histogram(df, x=feature, color="HeartDisease", title=feature, barmode="group")
        for feature in categorical_features
    ]

# To display the plots:
for fig in categorical_feature_func():
    fig.show()


The plots show how different groups of people, based on things like smoking or age, have heart disease. Both smokers and people who don't smoke have more people without heart disease, but surprisingly, people who don't smoke seem to have a slightly higher chance of heart disease. This is similar for people who drink alcohol and those who don't. People who've had a stroke are more likely to have heart disease. Men seem to have a slightly higher chance of heart disease than women. Older people are more likely to have heart disease. Different races have slightly different chances of heart disease. People with diabetes or who are less physically active, or in poorer general health, seem to have a higher chance of heart disease. Overall, in every group, more people don't have heart disease than those who do.

# 5.Feature Engineering

Here we will create new features from the existing ones or modify the current features as per the requirements of the analysis.


## 5.1 Label Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score

In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
le=LabelEncoder()

In [None]:
# String-valued columns that are replaced by integer codes.
# The list is deliberately NOT called `list`: shadowing the builtin breaks every cell
# that calls `list(...)` when it is re-run in the same kernel.
label_encoded_cols = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory',
                      'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
for col in label_encoded_cols:
    # The single encoder `le` is refitted for every column, so afterwards it only
    # remembers the classes of the last column processed.
    # NOTE(review): LabelEncoder numbers classes in sorted label order, which may not match
    # the natural order of ordinal columns such as AgeCategory or GenHealth - confirm.
    df[col] = le.fit_transform(df[col])

In [None]:
# Re-inspect every column after encoding: name, distinct values, separator line.
separator = "-----------------------"
for column in columns_df:
    distinct_values = df[column].unique()
    print("Column Name:", column)
    print("Unique Values:", distinct_values)
    print(separator)

## 5.2 Checking for missing value

In [None]:
df.isnull().sum()

There are no missing values in the dataframe

## 5.3 Checking and Handling Outliers

In [None]:
# Plotting boxplots for each feature to identify outliers
def numeric_features_func():
    """Draw a box plot for every column listed in `numeric_features`, side by side.

    Reads the notebook-level `df` and `numeric_features`, shows the figure and
    returns nothing.
    """
    columns = numeric_features.columns
    # Five slots reproduce the original layout; the grid widens when there are more
    # numeric columns, where a fixed 5-slot grid would make `plt.subplot` raise.
    n_slots = max(len(columns), 5)
    plt.figure(figsize=(35, 5))

    for position, feature in enumerate(columns, start=1):
        plt.subplot(1, n_slots, position)
        sns.set(palette='dark')
        sns.set_style("ticks")
        sns.boxplot(df[feature])
        plt.xlabel(feature)
        plt.ylabel("Value")
    plt.show()

numeric_features_func()

In [None]:
# Number of rows per numeric column lying outside the 1.5 * IQR fences.
outliers_count = {}

for col in numeric_features.columns:
    series = df[col]
    q_low, q_high = series.quantile(0.25), series.quantile(0.75)
    spread = q_high - q_low
    fence_low = q_low - 1.5 * spread
    fence_high = q_high + 1.5 * spread

    # A row is an outlier when it falls below the lower or above the upper fence.
    is_outlier = (series < fence_low) | (series > fence_high)
    outliers_count[col] = int(is_outlier.sum())

outliers_count



PhysicalHealth and MentalHealth represent health conditions or habits, so we don't need to address their outliers, whereas the outliers in BMI and SleepTime do need to be addressed.
We'll use the capping method, as it is straightforward and doesn't result in a loss of data:


In [None]:
# Cap (winsorize) the outliers of one column at its IQR fences
def cap_outliers(data, col):
    """Clamp the values of `data[col]` to the 1.5 * IQR fences.

    Values below Q1 - 1.5 * IQR are raised to that bound and values above
    Q3 + 1.5 * IQR are lowered to that bound. The column is overwritten in
    `data` itself, and the same frame is returned.
    """
    first_quartile, third_quartile = (data[col].quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile
    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    # Lift the low outliers first, then pull the high ones down.
    lifted = np.where(data[col] < floor, floor, data[col])
    data[col] = np.where(lifted > ceiling, ceiling, lifted)
    return data

# Treat outliers for BMI and SleepTime
df = cap_outliers(df, 'BMI')
df = cap_outliers(df, 'SleepTime')

# Display updated statistics for BMI and SleepTime
df[['BMI', 'SleepTime']].describe()

The values now fall within a range that excludes the previously identified outliers.

## 5.4 Splitting the Data into Features and Target Variable

In [None]:
x=df.drop(columns=['HeartDisease'])
y=df['HeartDisease']

In [None]:
x.sample(10)

## 5.5 Handling Imbalanced Dataset

In [None]:
from imblearn.under_sampling import NearMiss
from collections import Counter

In [None]:
# Under-sample with NearMiss (default settings) and report the class counts
# before and after resampling.
NearMiss_obj = NearMiss()
new_x, new_y = NearMiss_obj.fit_resample(x, y)
print(f'Original: {Counter(y)}')
print(f'NearMiss: {Counter(new_y)}')

Originally, our dataset had a lot more "No" samples (292,422) compared to "Yes" samples (27,373). We used NearMiss to balance the target variable. It did so by trimming down the larger group ("No") to match the size of the smaller group ("Yes"). So after applying NearMiss, we ended up with 27,373 samples in both groups.

In [None]:
new_y

Checking the Value Counts of the New Target Variable

 ## 5.6 Combining the Resampled Data

In [None]:
df2=pd.DataFrame(new_x)
df3=pd.DataFrame(new_y)
df4=pd.concat([df2,df3],axis=1)
df4

## 5.7 Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2, f_regression #chi2 only when features are non negative

In [None]:
model2 = SelectKBest(score_func=f_classif)

In [None]:
feature_score = model2.fit(new_x,new_y)

In [None]:
feature_score.scores_

In [None]:
# Pair every feature name with its SelectKBest score and rank by score.
col2 = pd.DataFrame(data=new_x.columns, columns=['Feature_Names'])
cols = pd.DataFrame(data=feature_score.scores_, columns=['Feature_Scores'])
scores = pd.concat([col2, cols], axis=1)

# Keep (at most) the 18 best-scoring rows, highest score first; `new` is reused by the
# modelling cells as the feature ranking.
new = scores.nlargest(n=18, columns='Feature_Scores')
new

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(x='Feature_Scores',y='Feature_Names',data=new)
plt.title('Feature ranking using SelectKBest classifier',fontsize=18)

## 5.8 FEATURE SCALING 

In [None]:

from sklearn.preprocessing import MinMaxScaler

In [None]:
mmx = MinMaxScaler() 

In [None]:
scaled_x = mmx.fit_transform(new_x)

In [None]:
from sklearn.decomposition import PCA

## 5.9 Applying PCA for Dimensionality Reduction

In [None]:
pca = PCA(n_components=3)
x_pca = pca.fit_transform(scaled_x)

In [None]:
x_pca

In [None]:
features = pd.DataFrame(x_pca, columns=['pca1','pca2','pca3'])

In [None]:
pip install plotly

In [None]:
import plotly.express as pl
pl.scatter_3d(features, x='pca1',y='pca2',z='pca3', color='pca1')

## 5.10 Split Dataset into Train and Test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(new_x,new_y,train_size=0.75,random_state=42)

In [None]:
xtest.head()

In [None]:
ytest.head()

# 6. Building Machine Learning Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## 6.1 Logistic Regression

### 6.1.1 Logistic Regression on Resampled Data

In [None]:

# Fit a logistic regression with default settings on the resampled training split.
log = LogisticRegression()
log.fit(xtrain, ytrain)

# Hard class predictions for the held-out split.
y_pred = log.predict(xtest)

# Threshold-based metrics, printed in a fixed order.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}:", metric_fn(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))

# Positive-class probabilities drive the ROC curve and its AUC.
y_prob = log.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_prob)
auc_value = roc_auc_score(ytest, y_prob)

plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, label=f'ROC AUC = {auc_value:.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

print("ROC AUC Score:", auc_value)


### 6.1.2 Logistic Regression using PCA

In [None]:
# 75/25 split of the PCA-projected rows, with a fixed seed.
xtrain_pca, xtest_pca, ytrain_pca, ytest_pca = train_test_split(
    x_pca, new_y, train_size=0.75, random_state=42
)

# Default logistic regression on the principal components.
log_pca = LogisticRegression()
log_pca.fit(xtrain_pca, ytrain_pca)

# Score the held-out PCA rows.
y_pred_pca = log_pca.predict(xtest_pca)
print("Model using PCA-transformed features:")
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}:", metric_fn(ytest_pca, y_pred_pca))


### 6.1.3 Logistic Regression using SelectKBest

In [None]:
# Keep only the 10 best-ranked features.
# `new` ranks up to 18 features, i.e. every column of `new_x`, so using the whole list
# would only reorder the columns instead of selecting any. Ten matches the cut-off used
# by the deep-learning and Naive Bayes SelectKBest sections.
selected_features_skb = new['Feature_Names'].head(10).tolist()
x_selected_skb = new_x[selected_features_skb]

# Splitting the data
xtrain_skb, xtest_skb, ytrain_skb, ytest_skb = train_test_split(x_selected_skb, new_y, train_size=0.75, random_state=42)

# Train the model
log_skb = LogisticRegression()
log_skb.fit(xtrain_skb, ytrain_skb)

# Predict and Evaluate
y_pred_skb = log_skb.predict(xtest_skb)
print("\nModel using features selected by SelectKBest:")
print("Accuracy:", accuracy_score(ytest_skb, y_pred_skb))
print("Precision:", precision_score(ytest_skb, y_pred_skb))
print("Recall:", recall_score(ytest_skb, y_pred_skb))
print("F1 Score:", f1_score(ytest_skb, y_pred_skb))


In [None]:
pred_log=log.predict(xtest)
pred_log_pca=log_pca.predict(xtest_pca)
pred_log_skb=log_skb.predict(xtest_skb)

In [None]:
print("Report of Logistic Regression:\n\n",classification_report(ytest, pred_log))
print("Report of Logistic Regression using PCA:\n\n",classification_report(ytest, pred_log_pca))
print("Report of Logistic Regression using Selectkbest:\n\n",classification_report(ytest, pred_log_skb))

## 6.2 Gradient Boost

### 6.2.1 Gradient Boost on Resampled Data

In [None]:
# Gradient boosting with a raised learning rate (0.3), fitted on the resampled split.
gb = GradientBoostingClassifier(learning_rate=0.3)
gb.fit(xtrain, ytrain)
y_pred_gbhyp = gb.predict(xtest)

# Threshold-based metrics.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for Gradient Boosting:", metric_fn(ytest, y_pred_gbhyp))
print("Confusion Matrix for Gradient Boosting:\n", confusion_matrix(ytest, y_pred_gbhyp))

# ROC curve from the positive-class probabilities.
y_prob_gbhyp = gb.predict_proba(xtest)[:, 1]
fpr_gb, tpr_gb, thresholds_gb = roc_curve(ytest, y_prob_gbhyp)
plt.plot(fpr_gb, tpr_gb)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Gradient Boosting')
plt.show()

print("ROC AUC Score for Gradient Boosting:", roc_auc_score(ytest, y_prob_gbhyp))



### 6.2.2 Gradient Boost Using PCA

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 75/25 split of the PCA-projected rows, with a fixed seed.
xtrain_pca, xtest_pca, ytrain_pca, ytest_pca = train_test_split(
    x_pca, new_y, train_size=0.75, random_state=42
)

# Default gradient boosting on the principal components.
gb_pca = GradientBoostingClassifier()
gb_pca.fit(xtrain_pca, ytrain_pca)

# Score the held-out PCA rows.
y_pred_pca = gb_pca.predict(xtest_pca)
print("Model using PCA-transformed features:")
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}:", metric_fn(ytest_pca, y_pred_pca))


### 6.2.3 Gradient Boost Using SelectKBest

In [None]:
# Keep only the 10 best-ranked features.
# `new` ranks up to 18 features, i.e. every column of `new_x`, so using the whole list
# would only reorder the columns instead of selecting any. Ten matches the cut-off used
# by the deep-learning and Naive Bayes SelectKBest sections.
selected_features_skb = new['Feature_Names'].head(10).tolist()
x_selected_skb = new_x[selected_features_skb]

# Splitting the data
xtrain_skb, xtest_skb, ytrain_skb, ytest_skb = train_test_split(x_selected_skb, new_y, train_size=0.75, random_state=42)

# Train the model
gb_skb = GradientBoostingClassifier()
gb_skb.fit(xtrain_skb, ytrain_skb)

# Predict and Evaluate
y_pred_skb = gb_skb.predict(xtest_skb)
print("\nModel using features selected by SelectKBest:")
print("Accuracy:", accuracy_score(ytest_skb, y_pred_skb))
print("Precision:", precision_score(ytest_skb, y_pred_skb))
print("Recall:", recall_score(ytest_skb, y_pred_skb))
print("F1 Score:", f1_score(ytest_skb, y_pred_skb))


In [None]:
# Test-set predictions of each gradient-boosting variant, gathered for the reports.
# NOTE(review): `gb` is the learning_rate=0.3 model that produced `y_pred_gbhyp`, so
# `pred_gb` and `pred_gb_hyp` hold the same predictions and their two reports will be
# identical - confirm whether a separately tuned model was intended.
pred_gb_hyp = y_pred_gbhyp
pred_gb = gb.predict(xtest)
pred_gb_skb = gb_skb.predict(xtest_skb)
pred_gb_pca = gb_pca.predict(xtest_pca)

In [None]:
print("Report of Gradient Boost:\n\n",classification_report(ytest, pred_gb))
print("Report of Gradient Boost using Hyperparameter Tuning:\n\n",classification_report(ytest, pred_gb_hyp))
print("Report of Gradient Boost using PCA:\n\n",classification_report(ytest, pred_gb_pca))
print("Report of Gradient Boost using Selectkbest:\n\n",classification_report(ytest, pred_gb_skb))

## 6.3 Random Forest

### 6.3.1 Random Forest on sampled data

In [None]:
# 1. Train the model
# A fixed seed makes the forest (and every metric below) reproducible between runs,
# matching the seeded forest used in the PCA section.
rf = RandomForestClassifier(random_state=42)
rf.fit(xtrain, ytrain)

# 2. Make predictions
y_pred_rf = rf.predict(xtest)

# 3. Evaluate
print("Accuracy for Random Forest:", accuracy_score(ytest, y_pred_rf))
print("Precision for Random Forest:", precision_score(ytest, y_pred_rf))
print("Recall for Random Forest:", recall_score(ytest, y_pred_rf))
print("F1 Score for Random Forest:", f1_score(ytest, y_pred_rf))
print("Confusion Matrix for Random Forest:\n", confusion_matrix(ytest, y_pred_rf))

# ROC Curve for Random Forest (built from the positive-class probabilities)
y_prob_rf = rf.predict_proba(xtest)[:,1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(ytest, y_prob_rf)
plt.plot(fpr_rf, tpr_rf)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.show()

print("ROC AUC Score for Random Forest:", roc_auc_score(ytest, y_prob_rf))


### 6.3.2 Random Forest using PCA

In [None]:

# 75/25 split of the PCA-projected rows, with a fixed seed.
xtrain_pca, xtest_pca, ytrain_pca, ytest_pca = train_test_split(
    x_pca, new_y, train_size=0.75, random_state=42
)

# Seeded random forest fitted on the principal components.
rf_pca = RandomForestClassifier(random_state=42)
rf_pca.fit(xtrain_pca, ytrain_pca)

# Hard predictions for the held-out PCA rows.
y_pred_rf_pca = rf_pca.predict(xtest_pca)

# Collect the threshold-based metrics in one dictionary (shown at the end of the cell).
results = {
    metric_name: metric_fn(ytest_pca, y_pred_rf_pca)
    for metric_name, metric_fn in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("Confusion Matrix", confusion_matrix),
    )
}

# ROC curve and AUC from the positive-class probabilities.
y_prob_rf_pca = rf_pca.predict_proba(xtest_pca)[:, 1]
fpr_rf_pca, tpr_rf_pca, thresholds_rf_pca = roc_curve(ytest_pca, y_prob_rf_pca)
roc_auc = roc_auc_score(ytest_pca, y_prob_rf_pca)

plt.figure(figsize=(8, 6))
plt.plot(fpr_rf_pca, tpr_rf_pca, label='Random Forest (area = {:.2f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest on PCA data')
plt.legend(loc='best')
plt.show()

results, roc_auc


### 6.3.3 Random Forest Using GridSearchCV

In [None]:
# Hyper-parameter grid: 3 x 4 x 3 = 36 combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# A fixed seed keeps the search (and the selected model) reproducible between runs,
# matching the seeded forest used in the PCA section.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(xtrain, ytrain)

# Evaluate the best estimator found by the search on the held-out split.
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(xtest)
print("Accuracy for Random Forest:", accuracy_score(ytest, y_pred_rf))
print("Precision for Random Forest:", precision_score(ytest, y_pred_rf))
print("Recall for Random Forest:", recall_score(ytest, y_pred_rf))
print("F1 Score for Random Forest:", f1_score(ytest, y_pred_rf))
print("Confusion Matrix for Random Forest:\n", confusion_matrix(ytest, y_pred_rf))

# ROC curve from the positive-class probabilities.
y_prob_rf = best_rf.predict_proba(xtest)[:,1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(ytest, y_prob_rf)
plt.plot(fpr_rf, tpr_rf)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.show()

print("ROC AUC Score for Random Forest:", roc_auc_score(ytest, y_prob_rf))


## 6.4 Deep Learning

### 6.4.1 Deep Learning on Resampled Data

In [None]:

# Fully connected binary classifier: 128 -> 64 -> 1 (sigmoid).
model = Sequential([
    Dense(128, activation='relu', input_shape=(xtrain.shape[1],)),  # input layer
    Dense(64, activation='relu'),                                   # hidden layer
    Dense(1, activation='sigmoid'),                                 # output probability
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The test split is also passed as validation data, so it is monitored during training.
model.fit(xtrain, ytrain, epochs=10, batch_size=32, validation_data=(xtest, ytest))

# Predicted probabilities, then hard labels at the 0.5 threshold.
y_pred_dl = model.predict(xtest)
y_pred_dl_class = (y_pred_dl > 0.5).astype(int).flatten()

for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for Deep Learning Model:", metric_fn(ytest, y_pred_dl_class))
print("Confusion Matrix for Deep Learning Model:\n", confusion_matrix(ytest, y_pred_dl_class))

# ROC curve from the raw probabilities.
fpr_dl, tpr_dl, thresholds_dl = roc_curve(ytest, y_pred_dl)
plt.plot(fpr_dl, tpr_dl)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Deep Learning Model')
plt.show()

print("ROC AUC Score for Deep Learning Model:", roc_auc_score(ytest, y_pred_dl))



### 6.4.2 Deep Learning using PCA

In [None]:


# Fully connected binary classifier (128 -> 64 -> 1) on the PCA components.
model_pca = Sequential()
model_pca.add(Dense(128, activation='relu', input_shape=(xtrain_pca.shape[1],)))  # Input layer
model_pca.add(Dense(64, activation='relu'))  # Hidden layer
model_pca.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
model_pca.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Labels come from the same split call as the PCA features (`ytrain_pca` / `ytest_pca`),
# so features and targets cannot drift apart if either split is ever changed.
model_pca.fit(xtrain_pca, ytrain_pca, epochs=10, batch_size=32, validation_data=(xtest_pca, ytest_pca))

# Predicted probabilities, then hard labels at the 0.5 threshold.
y_pred_dl_pca = model_pca.predict(xtest_pca)
y_pred_dl_class_pca = (y_pred_dl_pca > 0.5).astype(int).flatten()

print("Accuracy for Deep Learning Model with PCA:", accuracy_score(ytest_pca, y_pred_dl_class_pca))
print("Precision for Deep Learning Model with PCA:", precision_score(ytest_pca, y_pred_dl_class_pca))
print("Recall for Deep Learning Model with PCA:", recall_score(ytest_pca, y_pred_dl_class_pca))
print("F1 Score for Deep Learning Model with PCA:", f1_score(ytest_pca, y_pred_dl_class_pca))
print("Confusion Matrix for Deep Learning Model with PCA:\n", confusion_matrix(ytest_pca, y_pred_dl_class_pca))

# ROC curve from the raw probabilities.
fpr_dl_pca, tpr_dl_pca, thresholds_dl_pca = roc_curve(ytest_pca, y_pred_dl_pca)

plt.figure(figsize=(10, 7))
plt.plot(fpr_dl_pca, tpr_dl_pca, label=f'ROC AUC with PCA = {roc_auc_score(ytest_pca, y_pred_dl_pca):.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Deep Learning Model with PCA')
plt.legend(loc='lower right')
plt.show()


### 6.4.3 Deep Learning Using Selectkbest

In [None]:
# Restrict the existing train/test split to the 10 best-ranked features.
selected_features = new['Feature_Names'].head(10).tolist()
x_selected_train = xtrain[selected_features]
x_selected_test = xtest[selected_features]

# Fully connected binary classifier: 128 -> 64 -> 1 (sigmoid).
model_selected = Sequential([
    Dense(128, activation='relu', input_shape=(x_selected_train.shape[1],)),  # input layer
    Dense(64, activation='relu'),                                             # hidden layer
    Dense(1, activation='sigmoid'),                                           # output probability
])
model_selected.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_selected.fit(
    x_selected_train, ytrain,
    epochs=10, batch_size=32,
    validation_data=(x_selected_test, ytest),
)

# Predicted probabilities, then hard labels at the 0.5 threshold.
y_pred_dl_selected = model_selected.predict(x_selected_test)
y_pred_dl_class_selected = (y_pred_dl_selected > 0.5).astype(int).flatten()

for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for Deep Learning Model with Selected Features:", metric_fn(ytest, y_pred_dl_class_selected))
print("Confusion Matrix for Deep Learning Model with Selected Features:\n", confusion_matrix(ytest, y_pred_dl_class_selected))

# ROC curve and AUC from the raw probabilities.
fpr_dl_selected, tpr_dl_selected, thresholds_dl_selected = roc_curve(ytest, y_pred_dl_selected)
auc_selected = roc_auc_score(ytest, y_pred_dl_selected)

plt.figure(figsize=(10, 7))
plt.plot(fpr_dl_selected, tpr_dl_selected, label=f'ROC AUC with Selected Features = {auc_selected:.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Deep Learning Model with Selected Features')
plt.legend(loc='lower right')
plt.show()


## 6.5 Gaussian Naive Bayes

### 6.5.1 Gaussian Naive Bayes on Resample data

In [None]:


# Gaussian Naive Bayes with default settings on the resampled training split.
gnb = GaussianNB()
gnb.fit(xtrain, ytrain)

# Hard class predictions for the held-out split.
y_pred_gnb = gnb.predict(xtest)

# Threshold-based metrics.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for GNB:", metric_fn(ytest, y_pred_gnb))
print("Confusion Matrix for GNB:\n", confusion_matrix(ytest, y_pred_gnb))

# ROC curve from the positive-class probabilities.
y_prob_gnb = gnb.predict_proba(xtest)[:, 1]
fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(ytest, y_prob_gnb)
plt.plot(fpr_gnb, tpr_gnb)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - GNB')
plt.show()

print("ROC AUC Score for GNB:", roc_auc_score(ytest, y_prob_gnb))

### 6.5.2 Gaussian Naive Bayes Using PCA

In [None]:
# 75/25 split of the PCA-projected rows, with a fixed seed.
xtrain_pca, xtest_pca, ytrain_pca, ytest_pca = train_test_split(x_pca, new_y, train_size=0.75, random_state=42)

# The PCA model gets its own name (like `log_pca`, `gb_pca`, `rf_pca`) so that fitting it
# does not overwrite the baseline `gnb` trained on the full feature set.
gnb_pca = GaussianNB()
gnb_pca.fit(xtrain_pca, ytrain_pca)
y_pred_gnb_pca = gnb_pca.predict(xtest_pca)

print("Accuracy for GNB with PCA:", accuracy_score(ytest_pca, y_pred_gnb_pca))
print("Precision for GNB with PCA:", precision_score(ytest_pca, y_pred_gnb_pca))
print("Recall for GNB with PCA:", recall_score(ytest_pca, y_pred_gnb_pca))
print("F1 Score for GNB with PCA:", f1_score(ytest_pca, y_pred_gnb_pca))
print("Confusion Matrix for GNB with PCA:\n", confusion_matrix(ytest_pca, y_pred_gnb_pca))

# ROC curve from the positive-class probabilities.
y_prob_gnb_pca = gnb_pca.predict_proba(xtest_pca)[:,1]
fpr_gnb_pca, tpr_gnb_pca, thresholds_gnb_pca = roc_curve(ytest_pca, y_prob_gnb_pca)

plt.figure(figsize=(10, 7))
plt.plot(fpr_gnb_pca, tpr_gnb_pca, label=f'ROC AUC with PCA = {roc_auc_score(ytest_pca, y_prob_gnb_pca):.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for GNB with PCA')
plt.legend(loc='lower right')
plt.show()


### 6.5.3 Gaussian Naive Bayes using Selectkbest

In [None]:
# The 10 best-ranked features, taken from the combined resampled frame `df4`.
selected_features = new['Feature_Names'].head(10).tolist()
x_selected = df4[selected_features]
xtrain_selected, xtest_selected, ytrain_selected, ytest_selected = train_test_split(
    x_selected, new_y, train_size=0.75, random_state=42
)

# Default Gaussian Naive Bayes on the selected features.
gnb_selected = GaussianNB()
gnb_selected.fit(xtrain_selected, ytrain_selected)
y_pred_gnb_selected = gnb_selected.predict(xtest_selected)

for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for GNB with Selected Features:", metric_fn(ytest_selected, y_pred_gnb_selected))
print("Confusion Matrix for GNB with Selected Features:\n", confusion_matrix(ytest_selected, y_pred_gnb_selected))

# ROC curve and AUC from the positive-class probabilities.
y_prob_gnb_selected = gnb_selected.predict_proba(xtest_selected)[:, 1]
fpr_gnb_selected, tpr_gnb_selected, thresholds_gnb_selected = roc_curve(ytest_selected, y_prob_gnb_selected)
auc_gnb_selected = roc_auc_score(ytest_selected, y_prob_gnb_selected)

plt.figure(figsize=(10, 7))
plt.plot(fpr_gnb_selected, tpr_gnb_selected, label=f'ROC AUC with Selected Features = {auc_gnb_selected:.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for GNB with Selected Features')
plt.legend(loc='lower right')
plt.show()


### 6.5.4 Gaussian Naive Bayes using GridSearchCV

In [None]:
# Search 100 log-spaced `var_smoothing` values between 1 and 1e-9 with 5-fold CV,
# using the selected-feature split from the previous section.
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}

gnb = GaussianNB()
grid_search = GridSearchCV(gnb, param_grid, scoring='accuracy', cv=5)
grid_search.fit(xtrain_selected, ytrain_selected)

# Best estimator and its parameters, then its predictions on the held-out rows.
best_params = grid_search.best_params_
best_gnb = grid_search.best_estimator_
y_pred_best_gnb = best_gnb.predict(xtest_selected)

for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name} for Best GNB:", metric_fn(ytest_selected, y_pred_best_gnb))
print("Confusion Matrix for Best GNB:\n", confusion_matrix(ytest_selected, y_pred_best_gnb))

# ROC curve and AUC from the positive-class probabilities.
y_prob_best_gnb = best_gnb.predict_proba(xtest_selected)[:, 1]
fpr_best_gnb, tpr_best_gnb, thresholds_best_gnb = roc_curve(ytest_selected, y_prob_best_gnb)
auc_best_gnb = roc_auc_score(ytest_selected, y_prob_best_gnb)

plt.figure(figsize=(10, 7))
plt.plot(fpr_best_gnb, tpr_best_gnb, label=f'ROC AUC for Best GNB = {auc_best_gnb:.2f}')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Best GNB')
plt.legend(loc='lower right')
plt.show()
