In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as  sns
import plotly.express as px
import numpy as np
import warnings
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import RocCurveDisplay, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel('Diabetes Study File 10K Dec 14 2017.xlsx')
data.head()

Unnamed: 0,Age_at_Exam,sBP,sBP_Date,BMI,BMI_Date,LDL,LDL_Date,HDL,HDL_Date,A1c,...,leastO(A1c_Date),leastO(DM_OnsetDate),leastO(FBS_Date),LeastOfAll,A1C_BEF_DM,FBS_BEF_DM,Patient_ID,DM_Onset_Revised,DM_Onset_Revised_1YrPrior,DIABETES
0,65,126.0,2013-06-11,31.0,2013-06-11,1.66,2013-06-14,1.11,2013-06-14,5.4,...,,,,,,,4001000000255903,NaT,NaT,No
1,62,135.0,2014-06-19,25.846483,2014-10-17,2.49,2014-05-28,1.37,2014-05-28,5.8,...,,,,,,,4001000000256456,NaT,NaT,No
2,63,133.0,2012-07-31,30.9,2011-12-01,1.65,2012-06-01,,NaT,6.1,...,,,,,,,1001000000000054,NaT,NaT,No
3,51,136.0,2014-01-06,56.710775,2014-01-06,2.8,2014-01-14,1.94,2014-01-14,6.0,...,,,,,,,4001000000259496,NaT,NaT,No
4,40,123.0,2015-06-12,33.067867,2015-06-12,2.48,2015-06-24,1.17,2015-06-24,5.8,...,,,,,,,4001000000262094,NaT,NaT,No


In [None]:
print(data.shape)

In [None]:
# Check how many patients there are
data['Patient_ID'].unique().shape

In [None]:
# Check types of variables
data.dtypes

In [None]:
# Count the number of missing values (NaN or None) in each column of the 'data' DataFrame
missing_value_counts = data.isnull().sum()
missing_value_counts

In [4]:
# Replace categorical values with numerical equivalents
data['DIABETES'].replace({'Yes' : 1, 'No' : 0}, inplace=True)
data['Sex'].replace({'Female' : 1, 'Male' : 0}, inplace=True)

In [None]:
# Define a list of colors
colors = ['#1c3a73', '#7cb1c2']

# Create the grouped bar chart using seaborn's countplot function with specified colors
plt.figure(figsize=(6, 5))
sns.countplot(data=data, x='Sex', hue='DIABETES', palette=colors)

plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Count of Each Class of Diabetes for Male and Female')
plt.legend(title='Diabetes')
plt.show()

In [None]:
print('Oldest individual in the dataset was of:', data['Age_at_Exam'].max(), 'Years')
print('Youngest individual in the dataset was of:', data['Age_at_Exam'].min(), 'Years')
print('Average age in the dataset:', data['Age_at_Exam'].mean(), 'Years')

In [5]:
# Create a subset with no date
df = data[['Age_at_Exam', 'sBP', 'BMI', 'LDL', 'HDL', 'A1c', 'TG', 'FBS', 'Total_Cholesterol', 'Depression',
     'HTN', 'OA', 'COPD', 'Hypertension_Medications', 'Corticosteroids', 'Sex', 'DIABETES']]

In [None]:
# Imputation
df.isnull().sum()

In [30]:
# define features and outcome
X = df.drop(['DIABETES', 'Hypertension_Medications', 'Corticosteroids'], axis=1)
y = df[['DIABETES']]

# Normalizing data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

Y_train = Y_train.to_numpy()
Y_test = Y_test.to_numpy()

In [7]:
# Imputation
imputer = IterativeImputer(max_iter=10, random_state=42)
X_train_filled_mice = imputer.fit_transform(X_train)
X_test_filled_mice = imputer.fit_transform(X_test)

In [8]:
# Check multicollinearity
# VIF dataframe
X_train_filled_mice_df = pd.DataFrame(X_train_filled_mice)
vif_data = pd.DataFrame()
vif_data["feature"] = X_train_filled_mice_df.columns

  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X_train_filled_mice_df.values, i)
                          for i in range(len(X_train_filled_mice_df.columns))]
  
print(vif_data)

    feature        VIF
0         0   1.330047
1         1   1.127975
2         2   1.175673
3         3  24.608886
4         4   5.167766
5         5   2.526503
6         6   5.347915
7         7   2.597591
8         8  33.386108
9         9   1.050524
10       10   1.224992
11       11   1.110830
12       12   1.035749
13       13   1.212322


In [None]:
# take a look at training X
for col in X.columns:
    print(col)
    
# LDL, A1c, and Total_Cholestrol have high VIF
# A1c strongly correlated with diabetes bc many doctors do not order it before a patient becomes diabetic (from variable description)

In [20]:
# Define a function to compute various scores for model evaluation
def get_scores(Y_pred, Y):
    # Calculate confusion matrix, classification report, ROC AUC, and accuracy
    conf_matrix = confusion_matrix(Y_pred, Y)
    class_report = classification_report(Y_pred, Y)
    auc = roc_auc_score(Y_pred, Y)
    acc = accuracy_score(Y_pred, Y)

    # Return the computed scores
    return conf_matrix, class_report, auc, acc

In [32]:
# Logistic regression feature selection

# Initialize a Logistic Regression model
model = LogisticRegression()

# Define a StratifiedKFold cross-validation
cv = StratifiedKFold(3)

# Initialize Recursive Feature Elimination with Cross-Validation (RFECV)
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=cv,
    scoring='roc_auc',  # Specify the scoring metric (ROC AUC in this case)
    min_features_to_select=1,
)
rfecv.fit(X_train_filled_mice, Y_train)

# Print the optimal number of selected features
print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 7


In [11]:
data_y

Unnamed: 0,DIABETES
9254,1
1561,0
1670,0
6087,1
6669,1
...,...
5734,1
5191,1
5390,1
860,0
