### Big Data Project

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the tab-separated heart-disease table. Passing header=0 together with
# names= makes pandas discard the file's own first line and use the labels
# listed here instead.
df = pd.read_csv(
    "/kaggle/input/lovess/heart_disease.tab",
    sep="\t",
    header=0,
    names=[
        "age",
        "gender",
        "chest pain",
        "rest BP",
        "cholesterol",
        "fasting blood sugar > 120",
        "rest ECG",
        "max heart rate",
        "exerc ind ang",
        "ST depression",
        "slope peak exc ST",
        "number vessels",
        "thal",
        "target",
    ],
)

# Keep everything after the first two remaining rows.
# NOTE(review): presumably those two rows are the type/flag lines of an
# Orange-style .tab file rather than samples -- confirm against the file.
df = df.iloc[2:]
df.head(3)


# Data Preprocessing and Cleaning

In [None]:
# Clean a copy so the raw frame `df` stays untouched: '?' is turned into a
# missing value and every row that contains one is discarded.
df1 = df.copy().replace('?', pd.NA).dropna()

# Columns that get one-hot encoded below
categorical_columns = [
    'gender',
    'chest pain',
    'fasting blood sugar > 120',
    'rest ECG',
    'exerc ind ang',
    'slope peak exc ST',
    'thal',
]

# Indicator columns for the categorical features; drop_first=True omits the
# first level of each feature from the output.
encoded_cols = pd.get_dummies(df1[categorical_columns], drop_first=True)

# Swap the raw categorical columns for their encoded counterparts
remaining_cols = df1.drop(columns=categorical_columns)
df1 = pd.concat([remaining_cols, encoded_cols], axis="columns")

# Cast every column to float so the whole frame is numeric
df1 = df1.astype(float)

# Visualizing data to understand it and choose the best features

In [None]:
# Histograms for Numeric Variables
numeric_vars = ['age', 'rest BP', 'cholesterol', 'max heart rate', 'ST depression']
df1[numeric_vars].hist(figsize=(12, 8))
plt.suptitle('Histograms for Numeric Variables', y=0.95)
plt.show()

# Count Plot for Categorical Variables
# df1 no longer holds the raw categorical columns (they were replaced by
# their one-hot encoding), so the labels are taken from df -- restricted to
# the rows that survived cleaning. Plotting df directly would draw the '?'
# placeholder as a category and count rows that every other plot excludes.
categorical_data = df.loc[df1.index, categorical_columns]

# Derive the grid from the number of variables instead of hard-coding 2 x 4,
# so adding a categorical column cannot overflow the subplot grid.
n_plot_cols = 4
n_plot_rows = -(-len(categorical_columns) // n_plot_cols)  # ceiling division

plt.figure(figsize=(12, 8))
for i, var in enumerate(categorical_columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.countplot(x=var, data=categorical_data)
    plt.title(f'Count Plot for {var}')
plt.tight_layout()
plt.show()

# Correlation Heatmap
# Pairwise correlations of every column in the cleaned, encoded frame.
plt.figure(figsize=(10, 8))
sns.heatmap(df1.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Pairplot
# Scatter matrix of the numeric variables only.
sns.pairplot(df1[numeric_vars])
plt.suptitle('Pairplot for Numeric Variables', y=1.02)
plt.show()

