### Big Data Project

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Column labels applied in place of the labels found on the file's header line
COLUMN_NAMES = [
    "age", "gender", "chest pain", "rest BP", "cholesterol",
    "fasting blood sugar > 120", "rest ECG", "max heart rate",
    "exerc ind ang", "ST depression", "slope peak exc ST",
    "number vessels", "thal", "target",
]
DATA_PATH = "/kaggle/input/lovess/heart_disease.tab"

# Load the tab-separated file; header=0 consumes the first line as the header
# and `names` overrides whatever labels it contained
df = pd.read_csv(DATA_PATH, sep="\t", header=0, names=COLUMN_NAMES)

# Discard the first two parsed rows
# (presumably non-data metadata lines of the .tab format — TODO confirm against the file)
df = df.iloc[2:]
df.head(3)


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/lovess/heart_disease.tab'

# Data Preprocessing and Cleaning

In [None]:
# Columns that are one-hot encoded below
categorical_columns = ['gender', 'chest pain', 'fasting blood sugar > 120', 'rest ECG', 'exerc ind ang', 'slope peak exc ST', 'thal']

# Treat '?' as missing and keep only complete rows.
# replace() returns a new frame, so the loaded `df` is left untouched.
complete_rows = df.replace('?', pd.NA).dropna()

# Indicator columns for the categorical variables; drop_first=True omits
# the first level of each variable
encoded_cols = pd.get_dummies(complete_rows[categorical_columns], drop_first=True)

# Swap the raw categorical columns for their indicator columns,
# then cast every column to float
df1 = pd.concat(
    [complete_rows.drop(columns=categorical_columns), encoded_cols],
    axis=1,
).astype(float)

# Visualizing data to understand it and choose the best features

In [None]:
# Histograms for Numeric Variables
numeric_vars = ['age', 'rest BP', 'cholesterol', 'max heart rate', 'ST depression']
df1[numeric_vars].hist(figsize=(12, 8))
plt.suptitle('Histograms for Numeric Variables', y=0.95)
plt.show()

# Count Plot for Categorical Variables
# df1 no longer holds the raw categorical columns (they were one-hot encoded),
# so they are taken from df — restricted to the rows that survived cleaning,
# so that rows dropped for containing '?' are not counted and every figure
# in this section describes the same sample.
categorical_view = df.loc[df1.index, categorical_columns]
plt.figure(figsize=(12, 8))
for i, var in enumerate(categorical_columns, 1):
    ax = plt.subplot(2, 4, i)
    sns.countplot(x=var, data=categorical_view, ax=ax)
    ax.set_title(f'Count Plot for {var}')
plt.tight_layout()
plt.show()

# Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df1.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Pairplot
# pairplot creates its own figure, so the suptitle below lands on that figure
sns.pairplot(df1[numeric_vars])
plt.suptitle('Pairplot for Numeric Variables', y=1.02)
plt.show()



# Splitting the data, building and testing the ML models

In [None]:
# One seed shared by the split and the forest, so that re-running the
# notebook reproduces the same scores and feature importances
RANDOM_STATE = 42

# Split data into features and target
X = df1.drop('target', axis=1)
y = df1['target']

# Split data into train and test sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Logistic Regression
# max_iter is raised above the default of 100: the features are not scaled in
# this notebook, so the solver may stop before converging at the default limit.
# If the fit already converged, the higher cap changes nothing.
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
logistic_regression_pred = logistic_regression_model.predict(X_test)
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_pred)

print("Logistic Regression Accuracy:", logistic_regression_accuracy)

# Random Forest
# random_state fixes the bootstrap samples and feature sub-sampling; without
# it the accuracy and the importance ranking below differ on every run
random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)
random_forest_model.fit(X_train, y_train)
random_forest_pred = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, random_forest_pred)

print("Random Forest Accuracy:", random_forest_accuracy)

# Feature Importance Plot (Random Forest)
plt.figure(figsize=(10, 6))
feat_importances = pd.Series(random_forest_model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title('Top 10 Feature Importances (Random Forest)')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.show()

# Displaying the confusion matrices and how the models perform


In [None]:
from sklearn.metrics import confusion_matrix
import itertools

# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes, title, cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heatmap on the current figure.

    The caller is responsible for creating the figure beforehand and for
    calling ``plt.show()`` afterwards.

    Parameters
    ----------
    cm : array-like, 2-D
        Matrix to display. Row ``i`` is drawn at y position ``i``
        (axis labelled 'True label'), column ``j`` at x position ``j``
        (axis labelled 'Predicted label').
    classes : sequence
        Tick labels, used for both axes.
    title : str
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap for the heatmap (default ``plt.cm.Blues``).

    Returns
    -------
    None
    """
    # Accept plain nested lists as well as ndarrays
    cm = np.asarray(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is. Float matrices (e.g. normalised ones)
    # cannot be formatted with 'd' (ValueError), so they get two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells above half of the maximum get white text so the annotation stays
    # readable on the dark end of the colormap; an empty matrix has no maximum.
    thresh = cm.max() / 2. if cm.size else 0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Confusion matrices of both models on the held-out test set
logistic_regression_cm, random_forest_cm = (
    confusion_matrix(y_test, predictions)
    for predictions in (logistic_regression_pred, random_forest_pred)
)

# One figure per model: Logistic Regression first, then Random Forest
for matrix, figure_title in (
    (logistic_regression_cm, 'Confusion Matrix - Logistic Regression'),
    (random_forest_cm, 'Confusion Matrix - Random Forest'),
):
    plt.figure(figsize=(8, 6))
    plot_confusion_matrix(matrix, classes=['0', '1'], title=figure_title)
    plt.show()
