<a href="https://colab.research.google.com/github/angelahjhong/project-2/blob/main/CHD_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
! git clone https://github.com/angelahjhong/project-2

fatal: destination path 'project-2' already exists and is not an empty directory.


In [42]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap: read the binary-encoded training split, compute the
# correlation between every pair of columns, and draw the matrix as an
# annotated grid so strongly related variables stand out.
train_data = pd.read_csv("/content/project-2/cleaned_train_binary.csv")
corr_matrix = train_data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,       # write each coefficient inside its cell
    fmt=".2f",        # two decimal places keeps the grid readable
    cmap='coolwarm',
    linewidths=0.5,
)
ax.set_title('Correlation Heatmap of Variables')
plt.show()


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Fit three regressors (linear, shallow decision tree, k-nearest neighbours)
# on a fixed set of risk factors and compare how well each predicts
# TenYearCHD on the separate test split, using R^2 and RMSE.
train_data = pd.read_csv("/content/project-2/cleaned_train_binary.csv")
test_data = pd.read_csv("/content/project-2/cleaned_test_binary.csv")

# Columns used for modelling (predictors plus the TenYearCHD target).
# Named model_columns rather than `vars` so the builtin vars() is not shadowed.
model_columns = ['sex', 'age', 'currentSmoker', 'cigsPerDay', 'prevalentStroke',
                 'prevalentHyp', 'diabetes', 'BMI', 'glucose', 'TenYearCHD']
# Columns that get one-hot encoded; shared by both splits so they stay in sync.
categorical_columns = ['sex', 'currentSmoker', 'prevalentStroke',
                       'prevalentHyp', 'diabetes']

train_data = train_data[model_columns]
test_data = test_data[model_columns]

# one-hot encoding the categorical predictors
train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# in this code we are dropping the TenYearCHD variable because this is what we are running our predictions against
X_train = train_data.drop(columns=['TenYearCHD'])
y_train = train_data['TenYearCHD']
X_test = test_data.drop(columns=['TenYearCHD'])
y_test = test_data['TenYearCHD']

# get_dummies ran on each split independently, so a category level missing
# from one split would give the two frames different columns. Align the test
# features to the training layout (absent indicator columns become 0).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Computed as sqrt(MSE) instead of mean_squared_error(..., squared=False),
    because the `squared` argument is deprecated/removed in newer scikit-learn.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# initializing our models
linear_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(max_depth=2)  # shallow tree: at most two levels of splits
knn_model = KNeighborsRegressor()

# training each model on the training split
linear_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)

# running our predictions on the test features
linear_pred = linear_model.predict(X_test)
decision_tree_pred = decision_tree_model.predict(X_test)
knn_pred = knn_model.predict(X_test)

# calculating our R^2 scores using a built in function
linear_rsq = r2_score(y_test, linear_pred)
decision_tree_rsq = r2_score(y_test, decision_tree_pred)
knn_rsq = r2_score(y_test, knn_pred)

# calculating the RMSE for each model
linear_rmse = rmse(y_test, linear_pred)
decision_tree_rmse = rmse(y_test, decision_tree_pred)
knn_rmse = rmse(y_test, knn_pred)

print("Linear Regression rsq:", linear_rsq)
print("Decision Tree Regression rsq:", decision_tree_rsq)
print("K-Nearest Neighbors Regression rsq:", knn_rsq)

print("\nLinear Regression rmse:", linear_rmse)
print("Decision Tree Regression rmse:", decision_tree_rmse)
print("K-Nearest Neighbors Regression rmse:", knn_rmse)

Linear Regression rsq: 0.08944393608080903
Decision Tree Regression rsq: 0.03571913268397675
K-Nearest Neighbors Regression rsq: -0.07517052147477288

Linear Regression rmse: 0.3389543263724163
Decision Tree Regression rmse: 0.34881054922784993
K-Nearest Neighbors Regression rmse: 0.3683210248662036


In [50]:
# Violin plot of age for each TenYearCHD value, with each violin split by sex
df = pd.read_csv("/content/project-2/cleaned_train.csv")

fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=df,
    x='TenYearCHD',
    y='age',
    hue='sex',
    split=True,
    palette='viridis',
    ax=ax,
)
ax.set_title('Violin Plot of Age and Sex vs CHD')
ax.set_xlabel('TenYearCHD')
ax.set_ylabel('Age')
ax.legend(title='Sex', loc='upper right')
plt.show()

In [45]:
# density plot for CHD and age
# The readable labels are mapped into the data and seaborn builds the legend
# itself. Relabelling afterwards with plt.legend(labels=[...]) assigns labels
# by artist draw order, which need not match the hue order and can swap the
# 'No CHD' / 'CHD' entries.
chd_labels = {0: 'No CHD', 1: 'CHD'}  # assumes TenYearCHD is coded 0/1 — TODO confirm
density_data = df.assign(chd_status=df['TenYearCHD'].map(chd_labels))

fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=density_data,
    x='age',
    hue='chd_status',
    hue_order=['No CHD', 'CHD'],  # fixes both legend order and colour assignment
    fill=True,
    common_norm=False,            # normalise each group's density separately
    palette='Set1',
    ax=ax,
)
ax.set_title('Density Plot of Age vs CHD')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
# Reposition and retitle the seaborn-generated legend without rebuilding it
sns.move_legend(ax, 'upper right', title='TenYearCHD')
plt.show()

In [49]:
# bar graph for cigsperday vs CHD
# sns.barplot aggregates y with its default estimator (the mean), so each bar
# is the average TenYearCHD at that cigsPerDay value, not a count of cases.
# The title and y-axis label say so explicitly.
# NOTE(review): if TenYearCHD is coded 0/1 this mean is the proportion with CHD — confirm coding.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='cigsPerDay', y='TenYearCHD', color='b', ax=ax)
ax.set_title('CHD Rate vs. Cigarettes Per Day')
ax.set_xlabel('Cigarettes Per Day')
ax.set_ylabel('Mean TenYearCHD (proportion with CHD)')
plt.xticks(rotation=45)  # Rotate x-axis labels by 45 degrees
ax.grid(axis='y')
plt.show()

In [47]:
# TODO: scatterplot graph for BMI vs glucose vs CHD (not yet implemented)

In [48]:
# TODO: contents of this cell are still to be determined