# PIMA Knowledge Graph Creation

## Preprocessing and Data Aggregation


In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
#set variables according to docker instance
uri = "bolt://localhost:7687"
username = "neo4j"
password = "password"

In [8]:
#connect with driver 
from neo4j import GraphDatabase
with GraphDatabase.driver(uri,auth=(username,password)) as driver:
  driver.verify_connectivity()

In [9]:
#setup GDS
from re import U
from graphdatascience import GraphDataScience

gds = GraphDataScience(uri, auth=(username,password))

In [10]:
#set variables for projected graph embedding projection
graph_name = "pimaGraph"
embedding_dimension = 128

In [None]:
#project graph with all content under the specified graph_name
driver.execute_query(
    """
    CALL gds.graph.project(
        $graphName,
        {
            Sample: '*',
            medical_concept: '*',
            definition: '*',
            synonyms: '*'
        },
        {
            HAS_PREGNANCIES: {
            orientation: 'UNDIRECTED'  
            }, 
            HAS_GLUCOSE: {
            orientation: 'UNDIRECTED'  
            },
            HAS_BLOOD_PRESSURE: {
            orientation: 'UNDIRECTED'  
            },
            HAS_SKIN_THICKNESS: {
            orientation: 'UNDIRECTED'  
            },
            HAS_INSULIN: {
            orientation: 'UNDIRECTED'  
            }, 
            HAS_BMI: {
            orientation: 'UNDIRECTED'  
            },
            HAS_DIABETES_PEDIGREE_FUNCTION: {
            orientation: 'UNDIRECTED'  
            },
            HAS_AGE: {
            orientation: 'UNDIRECTED'  
            }, 
            Definition : {
            orientation: 'UNDIRECTED'  
            },
            Synonym: {
            orientation: 'UNDIRECTED'  
            },
            embedding_match_node: {
            orientation: 'UNDIRECTED'  
            }
        })
    """,
    graphName=graph_name
)


In [None]:
#get fastrp embeddings from projected graph with specified embeddingDimension
results = driver.execute_query(
    """
    CALL gds.fastRP.stream($graphName, {
        embeddingDimension: $embeddingDimension,
        randomSeed:42
    })
    YIELD nodeId, embedding
    RETURN nodeId, embedding;
    """,
    embeddingDimension=embedding_dimension, graphName=graph_name
)

In [14]:
#convert results into pandas dataframe
embedding_df = pd.DataFrame([{"nodeId": record["nodeId"], "embedding": record["embedding"]} for record in results[0]])

In [16]:
#view dataframe
embedding_df

Unnamed: 0,nodeId,embedding
0,1157,"[0.3707543611526489, 0.054082226008176804, -0...."
1,1159,"[0.3539871275424957, 0.05613316223025322, -0.1..."
2,0,"[0.4141680598258972, 0.03869418054819107, -0.1..."
3,1,"[0.371269166469574, 0.05201292037963867, -0.10..."
4,2,"[0.3478549122810364, 0.06399303674697876, -0.1..."
...,...,...
1152,1150,"[0.3686221241950989, 0.026663072407245636, -0...."
1153,1151,"[0.3833306133747101, 0.05250994861125946, -0.0..."
1154,1152,"[0.4162055253982544, 0.026111219078302383, -0...."
1155,1153,"[0.34819433093070984, 0.0923713967204094, 0.03..."


In [None]:
#recieve Samples and their corresponding ids (so we can get the ids and their respective embeddings)
results = driver.execute_query(
    """
    match (m:Sample) return  id(m), m.SampleNumber
    """
)

In [19]:
#convert samples into df 
sample_df = pd.DataFrame([{"nodeId": record["id(m)"], "sampleNumber": record["m.SampleNumber"]} for record in results[0]])

In [20]:
#view df
sample_df

Unnamed: 0,nodeId,sampleNumber
0,0,382
1,1,383
2,2,384
3,3,385
4,4,386
...,...,...
763,1152,377
764,1153,378
765,1156,379
766,1157,380


In [21]:
#merge dataframes
combined_df= sample_df.merge(embedding_df, on="nodeId", how="left")

In [22]:
#view dataframe
combined_df

Unnamed: 0,nodeId,sampleNumber,embedding
0,0,382,"[0.4141680598258972, 0.03869418054819107, -0.1..."
1,1,383,"[0.371269166469574, 0.05201292037963867, -0.10..."
2,2,384,"[0.3478549122810364, 0.06399303674697876, -0.1..."
3,3,385,"[0.4099091589450836, 0.027811255306005478, -0...."
4,4,386,"[0.3573867976665497, 0.0758630633354187, 0.027..."
...,...,...,...
763,1152,377,"[0.4162055253982544, 0.026111219078302383, -0...."
764,1153,378,"[0.34819433093070984, 0.0923713967204094, 0.03..."
765,1156,379,"[0.4478495121002197, 0.06772170960903168, -0.0..."
766,1157,380,"[0.3707543611526489, 0.054082226008176804, -0...."


In [23]:
#load bucketed dataset 
diabetes_df = pd.read_csv("data/diabetes_final.csv")

In [24]:
#view df
diabetes_df

Unnamed: 0,SampleNumber,Outcome,Pregnancies_Bucket,Glucose_Bucket,BloodPressure_Bucket,SkinThickness_Bucket,Insulin_Bucket,BMI_Bucket,DiabetesPedigreeFunction_Bucket,Age_Bucket
0,0,1,normal pregnancies,high glucose,normal blood pressure,high skin thickness,,high bmi,normal diabetes pedigree function,high age
1,1,0,low pregnancies,normal glucose,normal blood pressure,normal skin thickness,,high bmi,normal diabetes pedigree function,normal age
2,2,1,high pregnancies,high glucose,normal blood pressure,,,normal bmi,normal diabetes pedigree function,normal age
3,3,0,low pregnancies,normal glucose,normal blood pressure,normal skin thickness,normal insulin,high bmi,low diabetes pedigree function,low age
4,4,1,low pregnancies,high glucose,low blood pressure,high skin thickness,high insulin,high bmi,high diabetes pedigree function,normal age
...,...,...,...,...,...,...,...,...,...,...
763,763,0,high pregnancies,high glucose,normal blood pressure,high skin thickness,high insulin,high bmi,low diabetes pedigree function,high age
764,764,0,low pregnancies,high glucose,normal blood pressure,normal skin thickness,,high bmi,normal diabetes pedigree function,low age
765,765,0,normal pregnancies,high glucose,normal blood pressure,normal skin thickness,normal insulin,high bmi,low diabetes pedigree function,normal age
766,766,1,low pregnancies,high glucose,normal blood pressure,,,high bmi,normal diabetes pedigree function,high age


In [25]:
#one hot encode buckets and split embeddings
combined_df["sampleNumber"] = combined_df["sampleNumber"].astype(int)
combined_df = combined_df.merge(diabetes_df, left_on="sampleNumber", right_on="SampleNumber", how="left")
combined_df = combined_df.drop(columns=["sampleNumber"])
categorical_columns = ['Pregnancies_Bucket', 'Glucose_Bucket', 'BloodPressure_Bucket', 'SkinThickness_Bucket', 'Insulin_Bucket', 'BMI_Bucket', 'DiabetesPedigreeFunction_Bucket', 'Age_Bucket']
processed_df = pd.get_dummies(combined_df, columns=categorical_columns, dtype="int")
embedding_df = pd.DataFrame(processed_df["embedding"].tolist())
embedding_df.columns = [f"embedding_{i}" for i in range(128)]
final_df = pd.concat([processed_df, embedding_df], axis=1)

In [26]:
#view df
final_df

Unnamed: 0,nodeId,embedding,SampleNumber,Outcome,Pregnancies_Bucket_high pregnancies,Pregnancies_Bucket_low pregnancies,Pregnancies_Bucket_normal pregnancies,Glucose_Bucket_high glucose,Glucose_Bucket_low glucose,Glucose_Bucket_normal glucose,...,embedding_118,embedding_119,embedding_120,embedding_121,embedding_122,embedding_123,embedding_124,embedding_125,embedding_126,embedding_127
0,0,"[0.4141680598258972, 0.03869418054819107, -0.1...",382,0,0,1,0,1,0,0,...,-0.083448,0.172059,0.088233,0.206619,-0.007406,0.212477,0.029658,-0.024647,0.052844,-0.103768
1,1,"[0.371269166469574, 0.05201292037963867, -0.10...",383,0,0,1,0,0,0,1,...,-0.123794,0.151029,0.061160,0.154785,0.001946,0.171825,0.103335,0.017990,-0.007571,-0.114803
2,2,"[0.3478549122810364, 0.06399303674697876, -0.1...",384,0,0,1,0,1,0,0,...,-0.088545,0.199655,0.058721,0.136495,-0.044946,0.187021,0.042974,0.005521,0.059122,-0.160641
3,3,"[0.4099091589450836, 0.027811255306005478, -0....",385,0,0,1,0,1,0,0,...,-0.088704,0.242223,0.023763,0.111797,0.030774,0.150176,0.063015,-0.000636,0.053633,-0.177662
4,4,"[0.3573867976665497, 0.0758630633354187, 0.027...",386,1,0,0,1,1,0,0,...,-0.066840,0.182776,0.129548,0.206703,-0.061997,0.242070,0.059771,-0.012242,-0.007600,-0.151231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,1152,"[0.4162055253982544, 0.026111219078302383, -0....",377,0,0,1,0,0,0,1,...,-0.062308,0.200209,0.078861,0.161985,-0.032400,0.201345,0.040949,0.010791,-0.033868,-0.073606
764,1153,"[0.34819433093070984, 0.0923713967204094, 0.03...",378,1,0,0,1,1,0,0,...,-0.049506,0.229343,0.161155,0.272059,-0.061438,0.229603,0.063524,-0.005243,0.041468,-0.142256
765,1156,"[0.4478495121002197, 0.06772170960903168, -0.0...",379,0,0,1,0,0,0,1,...,-0.117071,0.217234,0.099096,0.223523,0.048493,0.165011,0.162497,-0.015010,-0.026475,-0.101034
766,1157,"[0.3707543611526489, 0.054082226008176804, -0....",380,0,0,1,0,1,0,0,...,-0.091443,0.208174,0.108655,0.221055,0.000519,0.197458,0.047120,-0.000868,0.022656,-0.060568


In [27]:
#optional - load final df if you want to do custom training on our embeddings (ie. you can skip the above sections and train on our embeddings if you want to skip the neo4j steps)
#final_df = pd.read_csv("data/final_df.csv")

In [28]:
#split dataset into x and y (y being outcome)
X = final_df.loc[:,~final_df.columns.isin(["Outcome"])]
y = final_df["Outcome"]

## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,  StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [25]:
#setup parameter grid for gridsearch
param_grid = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [5, 10, 15],       
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4]    
}

In [26]:
#setup rf model (using stratified cross validation and gridsearch)
rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### With Buckets

In [30]:
#specify categorical columns
categorical_columns = [ 'Pregnancies_Bucket_high pregnancies',
       'Pregnancies_Bucket_low pregnancies',
       'Pregnancies_Bucket_normal pregnancies', 'Glucose_Bucket_high glucose',
       'Glucose_Bucket_low glucose', 'Glucose_Bucket_normal glucose',
       'BloodPressure_Bucket_high blood pressure',
       'BloodPressure_Bucket_low blood pressure',
       'BloodPressure_Bucket_normal blood pressure',
       'SkinThickness_Bucket_high skin thickness',
       'SkinThickness_Bucket_low skin thickness',
       'SkinThickness_Bucket_normal skin thickness',
       'Insulin_Bucket_high insulin', 'Insulin_Bucket_low insulin',
       'Insulin_Bucket_normal insulin', 'BMI_Bucket_high bmi',
       'BMI_Bucket_low bmi', 'BMI_Bucket_normal bmi',
       'DiabetesPedigreeFunction_Bucket_high diabetes pedigree function',
       'DiabetesPedigreeFunction_Bucket_low diabetes pedigree function',
       'DiabetesPedigreeFunction_Bucket_normal diabetes pedigree function',
       'Age_Bucket_high age', 'Age_Bucket_low age', 'Age_Bucket_normal age']


In [28]:
#select categorical columns only for bucketed model
X[categorical_columns]

Unnamed: 0,Pregnancies_Bucket_high pregnancies,Pregnancies_Bucket_low pregnancies,Pregnancies_Bucket_normal pregnancies,Glucose_Bucket_high glucose,Glucose_Bucket_low glucose,Glucose_Bucket_normal glucose,BloodPressure_Bucket_high blood pressure,BloodPressure_Bucket_low blood pressure,BloodPressure_Bucket_normal blood pressure,SkinThickness_Bucket_high skin thickness,...,Insulin_Bucket_normal insulin,BMI_Bucket_high bmi,BMI_Bucket_low bmi,BMI_Bucket_normal bmi,DiabetesPedigreeFunction_Bucket_high diabetes pedigree function,DiabetesPedigreeFunction_Bucket_low diabetes pedigree function,DiabetesPedigreeFunction_Bucket_normal diabetes pedigree function,Age_Bucket_high age,Age_Bucket_low age,Age_Bucket_normal age
0,0,1,0,1,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
1,0,1,0,0,0,1,0,0,1,0,...,1,1,0,0,1,0,0,0,1,0
2,0,1,0,1,0,0,0,0,1,0,...,1,0,0,1,0,1,0,0,1,0
3,0,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,1,0,0,1,0
4,0,0,1,1,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0,1,0,0,0,1,0,0,1,1,...,1,1,0,0,0,0,1,0,1,0
764,0,0,1,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
765,0,1,0,0,0,1,1,0,0,1,...,1,1,0,0,1,0,0,0,0,1
766,0,1,0,1,0,0,0,0,1,1,...,1,1,0,0,1,0,0,0,1,0


In [29]:
#apply bucketed model and print out classification report
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=cv) 
grid_search.fit(X[categorical_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[categorical_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7804    0.8600    0.8183       500
           1     0.6774    0.5485    0.6062       268

    accuracy                         0.7513       768
   macro avg     0.7289    0.7043    0.7122       768
weighted avg     0.7445    0.7513    0.7443       768



In [31]:
#save best model for later use
joblib.dump(best_model, 'final_models/rfbucket.pkl')

['final_models/rfbucket.pkl']

### With Embeddings

In [31]:
#specify embedding columns
embedding_columns = [f'embedding_{i}' for i in range(embedding_dimension)]

In [33]:
#apply embedding model and print out classification report
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=cv) 
grid_search.fit(X[embedding_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[embedding_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8702    0.9120    0.8906       500
           1     0.8197    0.7463    0.7812       268

    accuracy                         0.8542       768
   macro avg     0.8450    0.8291    0.8359       768
weighted avg     0.8526    0.8542    0.8525       768



In [35]:
#save best model for later use
joblib.dump(best_model, 'final_models/rfemb.pkl')

['final_models/rfemb.pkl']

## XGBoost

In [36]:
from xgboost import XGBClassifier

#setup parameter grid for gridsearch
param_grid = {
  'n_estimators': [100, 200, 300], 
  'max_depth': [5, 10, 15],       
  'min_child_weight': [2, 5, 10], 
  'gamma': [0, 0.1, 0.5]          
}

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### With Buckets

In [37]:
#apply bucketed model and print out classification report
grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X[categorical_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[categorical_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7876    0.8380    0.8120       500
           1     0.6568    0.5784    0.6151       268

    accuracy                         0.7474       768
   macro avg     0.7222    0.7082    0.7135       768
weighted avg     0.7419    0.7474    0.7433       768



In [38]:
#save best model for later use
joblib.dump(best_model, 'final_models/xgbbucket.pkl')

['final_models/xgbbucket.pkl']

### With Embeddings

In [39]:
#apply embedding model and print out classification report
grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=cv)
grid_search.fit(X[embedding_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[embedding_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8835    0.9100    0.8966       500
           1     0.8221    0.7761    0.7985       268

    accuracy                         0.8633       768
   macro avg     0.8528    0.8431    0.8475       768
weighted avg     0.8621    0.8633    0.8623       768



In [40]:
#save best model for later use
joblib.dump(best_model, 'final_models/xgbemb.pkl')

['final_models/xgbemb.pkl']

## SVMs

In [41]:
from sklearn.svm import SVC

#setup parameter grid for gridsearch
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01, 0.1]
}

svm = SVC(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### With Buckets

In [42]:
#apply bucketed model and print out classification report
grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=cv) 
grid_search.fit(X[categorical_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[categorical_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7936    0.7920    0.7928       500
           1     0.6134    0.6157    0.6145       268

    accuracy                         0.7305       768
   macro avg     0.7035    0.7038    0.7037       768
weighted avg     0.7307    0.7305    0.7306       768



In [43]:
#save best model for later use
joblib.dump(best_model, 'final_models/svmbucket.pkl')

['final_models/svmbucket.pkl']

### With Embeddings

In [44]:
#apply embedding model and print out classification report
grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=cv) 
grid_search.fit(X[embedding_columns], y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[embedding_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7984    0.7840    0.7911       500
           1     0.6101    0.6306    0.6202       268

    accuracy                         0.7305       768
   macro avg     0.7042    0.7073    0.7057       768
weighted avg     0.7327    0.7305    0.7315       768



In [45]:
#save best model for later use
joblib.dump(best_model, 'final_models/svmemb.pkl')

['final_models/svmemb.pkl']

## Naive Bayes

In [46]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

#setup parameter grid for gridsearch
param_grid = {
  'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6] 
}

nb = GaussianNB()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


### With Buckets

In [47]:
#apply bucketed model and print out classification report
grid_search = GridSearchCV(nb, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X[categorical_columns],y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[categorical_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9179    0.5140    0.6590       500
           1     0.5020    0.9142    0.6481       268

    accuracy                         0.6536       768
   macro avg     0.7100    0.7141    0.6536       768
weighted avg     0.7728    0.6536    0.6552       768



In [49]:
#save best model for later use
joblib.dump(best_model, 'final_models/nbbucket.pkl')

['final_models/nbbucket.pkl']

### With Embeddings

In [50]:
#apply embedding model and print out classification report
grid_search = GridSearchCV(nb, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X[embedding_columns],y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X[embedding_columns])
print(classification_report(y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8501    0.6580    0.7418       500
           1     0.5512    0.7836    0.6471       268

    accuracy                         0.7018       768
   macro avg     0.7007    0.7208    0.6945       768
weighted avg     0.7458    0.7018    0.7088       768



In [52]:
#save best model for later use
joblib.dump(best_model, 'final_models/nbemb.pkl')

['final_models/nbemb.pkl']

## Neural Networks

In [53]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import KFold
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [55]:
#setup nn variables
tf.random.set_seed(42)
batch_size = 8
epochs = 30

In [56]:
#model defintion
def create_model():
    model = Sequential()
    model.add(Dense(64, activation="sigmoid"))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))
    model.add(Dense(128, activation="sigmoid"))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))
    model.add(Dense(64, activation="sigmoid"))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer="adam")
    return model

In [32]:
from sklearn.model_selection import train_test_split

#split dataset using a 80/10/10 split
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

### With Buckets

In [58]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint


# Convert data to numpy arrays
X_train_data = np.array(X_train[categorical_columns])
y_train_data = np.array(y_train)
X_val_data = np.array(X_val[categorical_columns])
y_val_data = np.array(y_val)
X_test_data = np.array(X_test[categorical_columns])
y_test_data = np.array(y_test)

checkpoint_file = 'final_models/nnbucket.keras'


# Create a new instance of the model
model = create_model()

# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(checkpoint_file, monitor='val_loss', save_best_only=True, mode='min')

# Train the model
hist = model.fit(
    X_train_data, y_train_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_data, y_val_data),
    callbacks=[reduce_lr, early_stopping, model_checkpoint]
)

# Evaluate the model on the test set
test_prediction_accuracy = model.evaluate(X_test_data, y_test_data)[1]
print(f"Test Accuracy: {test_prediction_accuracy}")

Epoch 1/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6080 - loss: 0.7821 - val_accuracy: 0.6494 - val_loss: 0.6513 - learning_rate: 0.0010
Epoch 2/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6712 - loss: 0.6448 - val_accuracy: 0.6494 - val_loss: 0.6320 - learning_rate: 0.0010
Epoch 3/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7058 - loss: 0.5708 - val_accuracy: 0.6494 - val_loss: 0.6120 - learning_rate: 0.0010
Epoch 4/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6940 - loss: 0.6001 - val_accuracy: 0.6234 - val_loss: 0.5928 - learning_rate: 0.0010
Epoch 5/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6968 - loss: 0.5707 - val_accuracy: 0.6364 - val_loss: 0.5885 - learning_rate: 0.0010
Epoch 6/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/s

In [36]:
from tensorflow.keras.models import load_model
import numpy as np
from sklearn.metrics import classification_report

#load best model to get classification report
X_test_data = np.array(X_test[categorical_columns])
y_test_data = np.array(y_test)
model = load_model('final_models/nnbucket.keras')

y_pred = model.predict(X_test_data)  
y_pred_2 = (y_pred>.5).astype(int) 

report = classification_report(y_test_data, y_pred_2, digits=4)
print(report)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
              precision    recall  f1-score   support

           0     0.7288    0.8600    0.7890        50
           1     0.6111    0.4074    0.4889        27

    accuracy                         0.7013        77
   macro avg     0.6700    0.6337    0.6389        77
weighted avg     0.6875    0.7013    0.6838        77



### With Embeddings

In [59]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# Convert data to numpy arrays
X_train_data = np.array(X_train[embedding_columns])
y_train_data = np.array(y_train)
X_val_data = np.array(X_val[embedding_columns])
y_val_data = np.array(y_val)
X_test_data = np.array(X_test[embedding_columns])
y_test_data = np.array(y_test)

checkpoint_file = 'final_models/nnemb.keras'


# Create a new instance of the model
model = create_model()

# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(checkpoint_file, monitor='val_loss', save_best_only=True, mode='min')

# Train the model
hist = model.fit(
    X_train_data, y_train_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_data, y_val_data),
    callbacks=[reduce_lr, early_stopping, model_checkpoint]
)

# Evaluate the model on the test set
test_prediction_accuracy = model.evaluate(X_test_data, y_test_data)[1]
print(f"Test Accuracy: {test_prediction_accuracy}")

Epoch 1/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6386 - loss: 0.7480 - val_accuracy: 0.3506 - val_loss: 0.7385 - learning_rate: 0.0010
Epoch 2/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6542 - loss: 0.6616 - val_accuracy: 0.6494 - val_loss: 0.6789 - learning_rate: 0.0010
Epoch 3/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6989 - loss: 0.5632 - val_accuracy: 0.6494 - val_loss: 0.6480 - learning_rate: 0.0010
Epoch 4/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7045 - loss: 0.5740 - val_accuracy: 0.6494 - val_loss: 0.6270 - learning_rate: 0.0010
Epoch 5/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7099 - loss: 0.5584 - val_accuracy: 0.6494 - val_loss: 0.6018 - learning_rate: 0.0010
Epoch 6/30
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/s

In [35]:
from tensorflow.keras.models import load_model
import numpy as np
from sklearn.metrics import classification_report

#load best model to get classification report
X_test_data = np.array(X_test[embedding_columns])
y_test_data = np.array(y_test)
model = load_model('final_models/nnemb.keras')

y_pred = model.predict(X_test_data)  
y_pred_2 = (y_pred>.5).astype(int) 

report = classification_report(y_test_data, y_pred_2, digits=4)
print(report)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
              precision    recall  f1-score   support

           0     0.7719    0.8800    0.8224        50
           1     0.7000    0.5185    0.5957        27

    accuracy                         0.7532        77
   macro avg     0.7360    0.6993    0.7091        77
weighted avg     0.7467    0.7532    0.7429        77

