Import the relevant packages

In [61]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

specify the labels

In [62]:
# Target comorbidities to classify, one entry per label.
# Only a single one is listed for the progress report; more will be added later.
comorbidities = [
    "obesity",
]

load in data

In [63]:
# Read the model input table from the CSV file in the working directory,
# then echo the frame so its columns and row count can be checked.
input_df = pd.read_csv(filepath_or_buffer="model_input.csv")
print(input_df)

                                                notes label
0   ['Patient', 'gender', 'female', 'They', 'born'...     N
1   ['Patient', 'gender', 'female', 'They', 'born'...     N
2   ['Patient', 'gender', 'female', 'They', 'born'...     N
3   ['Patient', 'gender', 'female', 'They', 'born'...     N
4   ['Patient', 'gender', 'male', 'They', 'born', ...     N
..                                                ...   ...
95  ['Patient', 'gender', 'male', 'They', 'born', ...     N
96  ['Patient', 'gender', 'male', 'They', 'born', ...     N
97  ['Patient', 'gender', 'female', 'They', 'born'...     N
98  ['Patient', 'gender', 'male', 'They', 'born', ...     N
99  ['Patient', 'gender', 'female', 'They', 'born'...     N

[100 rows x 2 columns]


Convert each tokenized note into a single space-separated string and encode the Y/N labels as 1/0 so they can be fed to the model

In [64]:
# Each raw "notes" cell holds the string form of a Python list of tokens:
# parse it back into a list and glue the tokens together with single spaces.
# NOTE: input_df is overwritten in place, so this cell is not re-runnable on
# its own -- reload the CSV first, otherwise literal_eval is applied to the
# already-joined text.
input_df["notes"] = input_df["notes"].map(
    lambda raw_tokens: ' '.join(ast.literal_eval(raw_tokens))
)
# Encode the label as an integer: 1 where the raw value is 'Y', otherwise 0.
input_df["label"] = input_df["label"].eq('Y').astype(int)
print(input_df)

                                                notes  label
0   Patient gender female They born 2094-03-05 00:...      0
1   Patient gender female They born 2090-06-05 00:...      0
2   Patient gender female They born 2038-09-03 00:...      0
3   Patient gender female They born 2075-09-21 00:...      0
4   Patient gender male They born 2114-06-20 00:00...      0
..                                                ...    ...
95  Patient gender male They born 2057-11-15 00:00...      0
96  Patient gender male They born 1878-05-14 00:00...      0
97  Patient gender female They born 2078-06-16 00:...      0
98  Patient gender male They born 2107-06-27 00:00...      0
99  Patient gender female They born 2112-10-22 00:...      0

[100 rows x 2 columns]


Create training and validation split

In [65]:
# Positional hold-out split (no shuffling): the first `split` rows are used for
# training and every remaining row is kept for validation.
split = 50
train_df = input_df.iloc[:split]
valid_df = input_df.iloc[split:]
print(train_df)
print(valid_df)

                                                notes  label
0   Patient gender female They born 2094-03-05 00:...      0
1   Patient gender female They born 2090-06-05 00:...      0
2   Patient gender female They born 2038-09-03 00:...      0
3   Patient gender female They born 2075-09-21 00:...      0
4   Patient gender male They born 2114-06-20 00:00...      0
5   Patient gender female They born 1895-05-17 00:...      0
6   Patient gender female They born 2108-01-15 00:...      1
7   Patient gender male They born 2061-04-10 00:00...      0
8   Patient gender male They born 2050-03-29 00:00...      0
9   Patient gender female They born 2051-04-21 00:...      0
10  Patient gender male They born 2053-04-13 00:00...      0
11  Patient gender female They born 1885-03-24 00:...      0
12  Patient gender female They born 2056-01-27 00:...      0
13  Patient gender female They born 2061-10-23 00:...      0
14  Patient gender male They born 2076-05-06 00:00...      0
15  Patient gender male 

In [66]:
# Bag-of-words features: the vocabulary is fitted on the training notes only
# (English stop words dropped) and then reused to encode the validation notes.
count_vectorizer = CountVectorizer(stop_words='english')
train_X = count_vectorizer.fit_transform(train_df["notes"]).toarray()
valid_X = count_vectorizer.transform(valid_df["notes"]).toarray()
print(train_X.shape, valid_X.shape, sep="\n")

# Label matrices are 2-D with one row per comorbidity; there is a single
# label column at the moment, so each matrix has exactly one row.
train_Y = np.stack([train_df["label"].to_numpy()])
valid_Y = np.stack([valid_df["label"].to_numpy()])
print(train_Y.shape, valid_Y.shape, sep="\n")

(50, 491)
(50, 491)
(1, 50)
(1, 50)


Make predictions and print the F1 score for each comorbidity

In [67]:
# Per-comorbidity F1 scores, plus predicted labels laid out like train_Y /
# valid_Y (one row per comorbidity). Predictions are class labels, so the
# buffers use the labels' dtype instead of np.empty's default float64.
train_f1s, valid_f1s = np.empty(len(comorbidities)), np.empty(len(comorbidities))
train_pred_Y = np.empty(train_Y.shape, dtype=train_Y.dtype)
valid_pred_Y = np.empty(valid_Y.shape, dtype=valid_Y.dtype)

for i, comorbidity in enumerate(comorbidities):     # one classifier per comorbidity
    train_y, valid_y = train_Y[i], valid_Y[i]
    # Fit a fresh logistic-regression classifier on the bag-of-words features.
    count_clf = LogisticRegression(max_iter=10000).fit(train_X, train_y)

    # Score on the data the model was fitted on.
    train_pred_Y[i] = count_clf.predict(train_X)
    train_f1s[i] = f1_score(train_y, train_pred_Y[i])
    print(f"{comorbidity} training f1: {train_f1s[i]}")

    # Score on the held-out validation rows.
    valid_pred_Y[i] = count_clf.predict(valid_X)
    valid_f1s[i] = f1_score(valid_y, valid_pred_Y[i])
    print(f"{comorbidity} validation f1: {valid_f1s[i]}")

# Pool every comorbidity's labels and predictions into flat vectors to get one
# overall F1 score and one confusion matrix per data split.
# labels=[0, 1] pins the confusion matrix to 2x2: without it, a split in which
# only one class occurs in both the truth and the predictions produces a 1x1
# matrix and the four-way unpack below raises ValueError.
flat_train_Y, flat_train_pred_Y = train_Y.flatten(), train_pred_Y.flatten()
overall_train_f1 = f1_score(flat_train_Y, flat_train_pred_Y)
tn, fp, fn, tp = confusion_matrix(flat_train_Y, flat_train_pred_Y, labels=[0, 1]).ravel()
print("\nTraining")
print(f"\toverall f1: {overall_train_f1}")
print(f"\ttn: {tn}\tfp: {fp}\tfn: {fn}\ttp: {tp}")

flat_valid_Y, flat_valid_prep_Y = valid_Y.flatten(), valid_pred_Y.flatten()
overall_valid_f1 = f1_score(flat_valid_Y, flat_valid_prep_Y)
tn, fp, fn, tp = confusion_matrix(flat_valid_Y, flat_valid_prep_Y, labels=[0, 1]).ravel()
print("\nValidation")
print(f"\tf1: {overall_valid_f1}")
print(f"\ttn: {tn}\tfp: {fp}\tfn: {fn}\ttp: {tp}")

obesity training f1: 0.8571428571428571
obesity validation f1: 0.0

Training
	overall f1: 0.8571428571428571
	tn: 46	fp: 0	fn: 1	tp: 3

Validation
	f1: 0.0
	tn: 42	fp: 1	fn: 7	tp: 0
