# Diabetes Predictor Notebook

# 6a)

In [41]:
# Import the class from preprocessors/load_data.py
from preprocessors.load_data import DiabetesDataLoader

# Create an instance of the class and call a method
loader = DiabetesDataLoader(data_dir='data', file_name='sample_diabetes_mellitus_data.csv')


In [42]:
train_df, test_df = loader.load_and_split_data()

# 6b)

In [44]:
# Import the class
from preprocessors.drop_na_preprocessor import Preprocessor_drop

# Create an instance
preprocessors_drop = Preprocessor_drop()


In [45]:
train_df_non_na = preprocessors_drop.remove_nan_rows(train_df)
test_df_non_na =  preprocessors_drop.remove_nan_rows(test_df)

# 6c)

In [46]:
# Import the class
from preprocessors.fill_na_preprocessor import Preprocessor_fill

# Create an instance
preprocessors_fill = Preprocessor_fill()


In [47]:
train_df_clean = preprocessors_fill.fill_nan_with_mean(train_df_non_na)
test_df_clean = preprocessors_fill.fill_nan_with_mean(test_df_non_na)

Processing column: height
Before conversion to numeric: 
8813    157.5
6645    162.5
8781    170.2
3766    188.0
6522    172.7
Name: height, dtype: float64
After conversion to numeric: 
8813    157.5
6645    162.5
8781    170.2
3766    188.0
6522    172.7
Name: height, dtype: float64
Mean of 'height': 170.05401114706942
After filling NaN: 
8813    157.5
6645    162.5
8781    170.2
3766    188.0
6522    172.7
Name: height, dtype: float64
Processing column: weight
Before conversion to numeric: 
8813    130.2
6645     70.0
8781     78.6
3766    108.8
6522     72.6
Name: weight, dtype: float64
After conversion to numeric: 
8813    130.2
6645     70.0
8781     78.6
3766    108.8
6522     72.6
Name: weight, dtype: float64
Mean of 'weight': 86.8587002992732
After filling NaN: 
8813    130.2
6645     70.0
8781     78.6
3766    108.8
6522     72.6
Name: weight, dtype: float64
Processing column: height
Before conversion to numeric: 
5097    166.4
2580    162.6
260     170.2
6781    165.1
5468   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_numeric(df[column], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].fillna(mean_value)


# 6 d)

In [48]:
# Import the class
from feature_extractors.feature_transformer_1 import GenderFeature,EthnicityFeature

# Create an Instance
GenderFeature_creation = GenderFeature()
EthnicityFeature_creation = EthnicityFeature()

In [49]:
#Feature 1 creation
train_df_feature_1 = GenderFeature_creation.transform(train_df_clean)
test_df_feature_1 = GenderFeature_creation.transform(test_df_clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'gender_encoded'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'gender_encoded'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0)


In [50]:
#Feature 2 creation
train_df_feature_2 = EthnicityFeature_creation.transform(train_df_feature_1)
test_df_feature_2 = EthnicityFeature_creation.transform(test_df_feature_1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'ethnicity_encoded'] = df['ethnicity'].map(ethnicity_mapping).fillna(-1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'ethnicity_encoded'] = df['ethnicity'].map(ethnicity_mapping).fillna(-1).astype(int)


# 6 e)

In [51]:
# Import the class
from models.model import Model

In [52]:
# Create an Instance

#feature columns and target column
target_column = 'diabetes_mellitus'
feature_columns = ['age', 'gender_encoded', 'ethnicity_encoded','height', 'weight']
##
model_instance = Model(feature_columns=feature_columns, target_column=target_column)


In [53]:
# Count the number of NaN values in each of the specified columns
nan_count = train_df_feature_2[feature_columns].isna().sum()

# Display the count of NaN values in each column
print(nan_count)


age                  0
gender_encoded       0
ethnicity_encoded    0
height               0
weight               0
dtype: int64


In [54]:
# Train the model
model_instance.train(train_df_feature_2)

In [55]:
# predict probabilities
predicted_probabilities = model_instance.predict(test_df_feature_2)

In [57]:

from sklearn.metrics import roc_auc_score

# Extracting probabilities and adding them to a new column
predicted_prob_class_1 = predicted_probabilities[:, 1]
test_df_feature_2['predictions'] = predicted_prob_class_1

#Compute ROC AUC 
true_labels = test_df_feature_2['diabetes_mellitus'] 

roc_auc = roc_auc_score(true_labels, test_df_feature_2['predictions'])

# Display the ROC AUC score
print(f"ROC AUC Score: {roc_auc}")

# Display the test DataFrame with the new 'predictions' column
test_df_feature_2


ROC AUC Score: 0.6824595525308912


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_feature_2['predictions'] = predicted_prob_class_1


Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,gender_encoded,ethnicity_encoded,predictions
5097,259593,33,60.0,,1,Caucasian,F,166.4,Operating Room,Operating Room / Recovery,...,0,0,0,0,0,0,1,0,0,0.191279
2580,176913,118,59.0,29.010438,0,Caucasian,M,162.6,Direct Admit,Accident & Emergency,...,0,0,0,0,0,0,0,1,0,0.216516
260,263807,118,57.0,17.222429,0,Caucasian,M,170.2,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,1,0,0.104148
6781,184749,118,72.0,19.113648,0,Caucasian,F,165.1,Emergency Department,Other Hospital,...,0,0,0,0,0,0,0,0,0,0.128563
5468,223536,118,70.0,17.651056,1,Caucasian,M,177.8,Operating Room,Operating Room / Recovery,...,0,0,0,0,0,1,0,1,0,0.125810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7746,213402,118,58.0,21.920710,0,Caucasian,F,170.2,Emergency Department,Operating Room / Recovery,...,0,0,0,0,0,0,0,0,0,0.106906
2740,223551,33,67.0,27.800093,1,Caucasian,F,162.6,Operating Room,Operating Room / Recovery,...,0,0,0,0,0,0,1,0,0,0.186165
1745,225541,118,54.0,33.867134,0,African American,M,188.0,Direct Admit,Accident & Emergency,...,0,0,0,0,0,0,0,1,2,0.294806
6969,151520,69,74.0,25.419145,0,Caucasian,F,172.0,Emergency Department,Accident & Emergency,...,0,0,0,0,0,0,0,0,0,0.175815
