In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Download the NLTK 'stopwords' corpus.
# NOTE(review): none of the cells visible here read this corpus (the
# CountVectorizer instances below are built without stop words) — confirm
# the download is still needed.
nltk.download('stopwords')

In [None]:
# Load the labelled training sequences.
train_df = pd.read_csv("input.csv")

# DataFrame.drop returns a new frame; assign it back so the leftover CSV
# index column ('Unnamed: 0') is really removed from train_df.
train_df = train_df.drop(columns=['Unnamed: 0'])

# Show only a preview instead of dumping the whole frame.
train_df.head()

# yes = pd.read_csv("yes.csv")
# yes = yes["sequence"].to_list()

# no = pd.read_csv("no.csv")
# no = no["sequence"].to_list()

# neither = pd.read_csv("neither.csv")
# neither = neither["sequence"].to_list()

In [None]:
# Load the labelled hold-out sequences.
test_df = pd.read_csv("input_test.csv")

# DataFrame.drop returns a new frame; assign it back so the leftover CSV
# index column ('Unnamed: 0') is really removed from test_df.
test_df = test_df.drop(columns=['Unnamed: 0'])

# Show only a preview instead of dumping the whole frame.
test_df.head()

In [None]:
# train_yes = yes[:int(0.85 * len(yes))]
# test_yes = yes[int(0.85 * len(yes)):]

# train_no = no[:int(0.85 * len(no))]
# test_no = no[int(0.85 * len(no)):]

# train_neither = neither[:int(0.85 * len(neither))]
# test_neither = neither[int(0.85 * len(neither)):]

# train_x = train_yes + train_no + train_neither
# test_x = test_yes + test_no + test_neither

# train_y = np.append(np.ones((len(train_yes), 1)), np.zeros((len(train_neg), 1)), axis=0)
# test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [None]:
# Build the logistic-regression text pipeline: raw text -> token counts ->
# TF-IDF weights -> classifier. Each step is applied in order by Pipeline.
pipeline_log = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and 'auto' was already its default, so omitting it
    # keeps the same model while staying compatible with newer releases.
    ('clf', LogisticRegression(solver='lbfgs')),
])

# Train the model on the training sequences and their labels.
# Pipeline.fit returns the pipeline itself, so model_lgr and pipeline_log
# refer to the same fitted object.
model_name = 'logistic regression classifier'
model_lgr = pipeline_log.fit(train_df['sequence'], train_df['label'])

In [None]:
def evaluate_results(model, test_df):
    """Evaluate a fitted classifier on a labelled test frame.

    Prints a per-class classification report, draws the confusion matrix as
    a heatmap and returns macro-averaged scores.

    Parameters
    ----------
    model : object
        Fitted estimator or pipeline exposing ``predict``; it is called on
        ``test_df['sequence']``.
    test_df : pandas.DataFrame
        Must contain the columns 'sequence' (model input) and 'label'
        (ground truth). A 'pred' column holding the predictions is added
        (or overwritten) in place.

    Returns
    -------
    tuple
        ``(f1, precision, recall, support)``, macro-averaged. The order
        matches how callers in this notebook index the result ([0] as F1,
        [1] as precision, [2] as recall). ``support`` is whatever
        ``precision_recall_fscore_support`` reports for an averaged score.
    """
    # Use the model that was passed in (not a notebook-level global) so that
    # every classifier is scored on its own predictions.
    test_df['pred'] = model.predict(test_df['sequence'])
    y_true = test_df['label']
    y_pred = test_df['pred']
    # NOTE(review): assumes exactly three classes whose sorted label order
    # corresponds to NO, NTR, YES — confirm against the label encoding.
    target_names = ['NO', 'NTR', 'YES']

    # Per-class precision / recall / F1 table.
    results_log = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
    results_df_log = pd.DataFrame(results_log).transpose()
    print(results_df_log)

    # Confusion matrix: rows are the true labels, columns the predictions.
    matrix = confusion_matrix(y_true, y_pred)
    # Draw on a dedicated figure so repeated calls do not share axes.
    fig, ax = plt.subplots()
    sns.heatmap(pd.DataFrame(matrix),
                annot=True, fmt="d", linewidths=.5, cmap="YlGnBu", ax=ax)
    ax.set_xlabel('Predictions')
    ax.set_ylabel('Actual')

    # sklearn expects (y_true, y_pred); swapping them exchanges precision
    # and recall.
    precision, recall, f1, support = score(y_true, y_pred, average='macro')
    return f1, precision, recall, support

In [None]:
# Evaluate the logistic-regression model and start the comparison table.
model_score = evaluate_results(model_lgr, test_df)

# DataFrame.append was removed in pandas 2.0; build the frame directly from
# a list holding one record per model instead.
# NOTE(review): the indices assume evaluate_results returns its scores as
# (f1, precision, recall, ...) — confirm against the function.
performance_df = pd.DataFrame([{'model_name': model_name,
                                'f1_score': model_score[0],
                                'precision': model_score[1],
                                'recall': model_score[2]}])

In [None]:
model_name = 'bayes classifier'

# Same text features as the logistic pipeline (token counts -> TF-IDF),
# followed by a multinomial naive Bayes classifier.
pipeline_bayes = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('gnb', MultinomialNB()),
])

# Train the model on the training sequences and their labels.
model_bayes = pipeline_bayes.fit(train_df['sequence'], train_df['label'])

# Evaluate model performance and add a row to the comparison table.
model_score = evaluate_results(model_bayes, test_df)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to add rows.
# NOTE(review): the indices assume evaluate_results returns its scores as
# (f1, precision, recall, ...) — confirm against the function.
bayes_row = pd.DataFrame([{'model_name': model_name,
                           'f1_score': model_score[0],
                           'precision': model_score[1],
                           'recall': model_score[2]}])
performance_df = pd.concat([performance_df, bayes_row], ignore_index=True)

In [None]:
# Spot-check the logistic-regression model on a few hand-written snippets.
prediction_sequences = [
    'PHQ-2 Score: 0 Cognition Negative: no evidence of cognitive decline noted by patient or family; no memory problems causing dysfunction in daily activities Falls risk Time to rise from, walk 10 feet,',
    'depression, but certainly does not appear depressed on exam - Dementia: MMSE on 5/21/16 23/30 c/w Mild cognitive impairment, which is NOT c/w profound weight loss - Gastroparesis: Hx of diabetes',
    'THEY DO NOT HAVE DEMENTIA',
    'tojguiegbhutrebjg bljtmhtnoery0og[wob erjbgt4iu5gbyi ]',
]

# Readable name for each predicted class id (the mapping never changes, so
# it is defined once, outside the loop).
d = {1: 'Negative', 2: 'Neither', 3: 'Positive'}

for seq in prediction_sequences:
    # predict expects an iterable of documents, hence the one-element list.
    ans = model_lgr.predict([seq])
    print(f"{seq}-> {d[ans[0]]}", "\n")

In [None]:
# Spot-check the naive Bayes model on the same hand-written snippets.
prediction_sequences = [
    'PHQ-2 Score: 0 Cognition Negative: no evidence of cognitive decline noted by patient or family; no memory problems causing dysfunction in daily activities Falls risk Time to rise from, walk 10 feet,',
    'depression, but certainly does not appear depressed on exam - Dementia: MMSE on 5/21/16 23/30 c/w Mild cognitive impairment, which is NOT c/w profound weight loss - Gastroparesis: Hx of diabetes',
    'THEY DO NOT HAVE DEMENTIA',
    'tojguiegbhutrebjg bljtmhtnoery0og[wob erjbgt4iu5gbyi ]',
]

# Readable name for each predicted class id (the mapping never changes, so
# it is defined once, outside the loop).
d = {1: 'Negative', 2: 'Neither', 3: 'Positive'}

for seq in prediction_sequences:
    # predict expects an iterable of documents, hence the one-element list.
    ans = model_bayes.predict([seq])
    print(f"{seq}-> {d[ans[0]]}", "\n")