In [2]:
pip install pandas numpy matplotlib seaborn scikit-learn jupyter

Note: you may need to restart the kernel to use updated packages.
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.4-cp313-cp313-win_amd64.whl.metadata (108 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.2.1-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# Heart Disease Dataset Modeling

# ------------------------------
# 1. Import Libraries
# ------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# ------------------------------
# 2. Load and Preprocess Data
# ------------------------------
df = pd.read_csv(r"C:\Users\aryan\machinel\Assignment1\Data\heart_disease_uci(1).csv")  # Adjust filename if needed

# Convert types if needed
df['sex'] = df['sex'].astype(str)
df['fbs'] = df['fbs'].astype(str)
df['exang'] = df['exang'].astype(str)

# Encode target
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)

# One-hot encoding
df = pd.get_dummies(df, drop_first=True)

# Clean column names
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('-', '_')
df.columns = df.columns.str.replace('(', '')
df.columns = df.columns.str.replace(')', '')

# ------------------------------
# 3. Regression (ElasticNet)
# ------------------------------
X_reg = df.drop(columns=['num', 'target'])
y_reg = df['num']

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

model = ElasticNet()
model.fit(X_train_reg, y_train_reg)

# Predict & Evaluate
y_pred = model.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred)
r2 = r2_score(y_test_reg, y_pred)

print("Regression Results (ElasticNet):")
print(f"Mean Squared Error: {mse:.3f}")
print(f"R² Score: {r2:.3f}\n")

# ------------------------------
# 4. Classification - Logistic Regression
# ------------------------------
X_class = df.drop(columns=['num', 'target'])
y_class = df['target']

X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_log, y_train_log)

# Predict & Evaluate
y_pred_log = logreg.predict(X_test_log)
print("Classification Results (Logistic Regression):")
print("Accuracy:", accuracy_score(y_test_log, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test_log, y_pred_log))
ConfusionMatrixDisplay.from_predictions(y_test_log, y_pred_log)
plt.title("Confusion Matrix: Logistic Regression")
plt.show()

# ------------------------------
# 5. Classification - k-NN
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
print("Classification Results (k-NN):")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_knn)
plt.title("Confusion Matrix: k-NN")
plt.show()

# ------------------------------
# 6. Conclusion (Markdown)
# ------------------------------
# Include the conclusion in a markdown cell in your notebook:
#
# ## Conclusion
# - ElasticNet achieved an R^2 of 0.399 and MSE of 0.146
# - Logistic Regression achieved 91.7% accuracy
# - k-NN achieved 70.0% accuracy
# - Logistic regression outperformed k-NN in all classification metrics
# - Logistic Regression is a better fit for this dataset
 # Verify data loaded

ValueError: Input X contains NaN.
ElasticNet does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
import pandas as pd

# Load dataset
df = pd.read_csv(r'C:\Users\aryan\machinel\Assignment1\Data\heart_disease_uci(1).csv')
print("Column names:", df.columns.tolist())
# Handle missing values
df = df.dropna()

# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal'], drop_first=True)

# Print to verify encoded columns
print("Shape after preprocessing:", df.shape)
print("Encoded columns:", df.columns.tolist())
df = pd.get_dummies(df, columns=['Sex', 'ChestPain', 'FastingBS', 'RestECG', 'ExerciseAngina', 'ST_Slope', 'Thal'], drop_first=True)

Column names: ['age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
Shape after preprocessing: (299, 20)
Encoded columns: ['age', 'dataset', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num', 'sex_Male', 'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina', 'fbs_True', 'restecg_normal', 'restecg_st-t abnormality', 'exang_True', 'slope_flat', 'slope_upsloping', 'thal_normal', 'thal_reversable defect']


KeyError: "None of [Index(['Sex', 'ChestPain', 'FastingBS', 'RestECG', 'ExerciseAngina',\n       'ST_Slope', 'Thal'],\n      dtype='object')] are in the [columns]"