This notebook has two main parts:

- Run Longformer on the clinical text to obtain a numerical representation (embedding) of each document

- Fit logistic-regression classifiers in two settings: with and without the text embeddings

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import LongformerTokenizer, LongformerModel

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Read data

In [None]:
# ID and text are separated by a literal "||". The separator is a regex (hence
# engine="python"), so it is written as a raw string: the non-raw "\|\|" relies on
# an invalid escape sequence, which newer Python versions warn about.
# skiprows=1 skips the file's first line; column names are supplied explicitly.
clinical_text_train = pd.read_csv(
    'training_text',
    sep=r"\|\|",
    engine="python",
    names=["ID", "Text"],
    skiprows=1,
)
clinical_text_train.head()

In [None]:
# ID and text are separated by a literal "||". The separator is a regex (hence
# engine="python"), so it is written as a raw string: the non-raw "\|\|" relies on
# an invalid escape sequence, which newer Python versions warn about.
# skiprows=1 skips the file's first line; column names are supplied explicitly.
clinical_text_test = pd.read_csv(
    'test_text',
    sep=r"\|\|",
    engine="python",
    names=["ID", "Text"],
    skiprows=1,
)
clinical_text_test.head()

In [None]:
# Load the cleaned variant feature table for the training set.
variants_train = pd.read_csv(
    "training_variants_cleaned.csv"
)

In [None]:
# Preview the first five rows of the cleaned variant features.
variants_train.head(n=5)


# Longformer

In [None]:
# Pre-trained Longformer: tokenizer plus encoder, with the encoder placed on `device`.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained(
    "allenai/longformer-base-4096",
    gradient_checkpointing=True,
).to(device)  # Module.to() returns the module itself
model

In [None]:
# Embed every training document with Longformer.
#
# Each document is tokenised (truncated to 4096 tokens) and passed through the model;
# the token vectors at hidden-state index 7 (described by the original author as the
# 7th encoder layer) are averaged into one vector per document. Rows whose text is
# not a string (e.g. missing) get an all-zero vector so row order stays aligned with
# `clinical_text_train.ID`.
hidden_size = model.config.hidden_size  # width of one embedding row (768 in the original code)
embedding_rows = []  # plain list: np.append inside the loop would re-copy the whole array every time
with torch.no_grad():
    for cnt, abstr in enumerate(clinical_text_train["Text"], start=1):
        if isinstance(abstr, str):
            encoded_input = tokenizer(abstr, return_tensors="pt", max_length=4096, truncation=True)
            encoded_input = encoded_input.to(device)
            output = model(**encoded_input, output_hidden_states=True)
            # output[2] holds the per-layer hidden states; [7] picks the layer, [0] the single batch item
            embedding_rows.append(torch.mean(output[2][7][0], dim=0).tolist())
        else:
            embedding_rows.append([0.0] * hidden_size)
        if cnt % 100 == 0:
            # progress marker every 100 documents instead of one line per document
            print(cnt)

# reshape keeps a well-formed (0, hidden_size) array even if there were no documents
embedding_list = np.array(embedding_rows, dtype=np.float64).reshape(-1, hidden_size)
variants_text_train = pd.DataFrame(embedding_list, index=clinical_text_train.ID)
variants_text_train.reset_index(inplace=True)
variants_text_train.to_csv("training_clinical_text_embedding.csv", index=False)
variants_text_train.head()

In [None]:
# Embed every test document with Longformer.
#
# Each document is tokenised (truncated to 4096 tokens) and passed through the model;
# the token vectors at hidden-state index 7 (described by the original author as the
# 7th encoder layer) are averaged into one vector per document. Rows whose text is
# not a string (e.g. missing) get an all-zero vector so row order stays aligned with
# `clinical_text_test.ID`.
hidden_size = model.config.hidden_size  # width of one embedding row (768 in the original code)
embedding_rows = []  # plain list: np.append inside the loop would re-copy the whole array every time
with torch.no_grad():
    # The loop body must be indented under the `for`; the original cell raised IndentationError.
    for cnt, abstr in enumerate(clinical_text_test["Text"], start=1):
        if isinstance(abstr, str):
            encoded_input = tokenizer(abstr, return_tensors="pt", max_length=4096, truncation=True)
            encoded_input = encoded_input.to(device)
            output = model(**encoded_input, output_hidden_states=True)
            # output[2] holds the per-layer hidden states; [7] picks the layer, [0] the single batch item
            embedding_rows.append(torch.mean(output[2][7][0], dim=0).tolist())
        else:
            embedding_rows.append([0.0] * hidden_size)
        if cnt % 100 == 0:
            # progress marker every 100 documents instead of one line per document
            print(cnt)

# reshape keeps a well-formed (0, hidden_size) array even if there were no documents
embedding_list = np.array(embedding_rows, dtype=np.float64).reshape(-1, hidden_size)
variants_text_test = pd.DataFrame(embedding_list, index=clinical_text_test.ID)
variants_text_test.reset_index(inplace=True)
variants_text_test.to_csv("test_clinical_text_embedding.csv", index=False)
# Preview goes last so the notebook actually renders it.
variants_text_test.head()

# Combine data for training

In [None]:
## Processed features: cleaned variant features left-joined with the text embeddings on "ID".
# Multiplying by 1 is kept from the original (presumably to turn boolean indicator
# columns into integers — TODO confirm against the CSV contents).
# Embedding columns are renamed to "col_<n>"; the "ID" key keeps its name.

# Training set
train_feat_variant_df = 1 * pd.read_csv("training_variants_cleaned.csv")
variants_text_train = pd.read_csv("training_clinical_text_embedding.csv").rename(
    columns=lambda name: name if str(name) == "ID" else "col_" + str(name)
)
train_feat_df = pd.merge(train_feat_variant_df, variants_text_train, on="ID", how="left")

# Test set
test_feat_variant_df = 1 * pd.read_csv("test_variants_cleaned.csv")
variants_text_test = pd.read_csv("test_clinical_text_embedding.csv").rename(
    columns=lambda name: name if str(name) == "ID" else "col_" + str(name)
)
test_feat_df = pd.merge(test_feat_variant_df, variants_text_test, on="ID", how="left")

In [None]:
## Targets
# Training labels: only the "ID" and "Class" columns are kept.
train_target_df = pd.read_csv("training_variants").loc[:, ["ID", "Class"]].copy()

test_ID_df = pd.read_csv("test_variants")
test_target_df = pd.read_csv("stage1_solution_filtered.csv")
# Scale each "class<k>" column by k (k = 1..9); the row sum over all non-ID columns
# then becomes "Class". Presumably the class columns are one-hot indicators, so the
# sum equals the class number — TODO confirm.
for class_no in range(1, 10):
    class_col = "class" + str(class_no)
    test_target_df[class_col] = class_no * test_target_df[class_col]
test_target_df["Class"] = test_target_df.drop(columns="ID").sum(axis=1)
test_target_df = test_target_df.loc[:, ["ID", "Class"]].copy()

In [None]:
# Training table: features joined with labels on "ID", then rows with any missing value removed.
train_df = train_feat_df.merge(train_target_df, on="ID").dropna(how="any")
train_df.describe(include="all").transpose()

In [None]:
# Test table: features joined with labels on "ID", then rows with any missing value removed.
test_df = test_feat_df.merge(test_target_df, on="ID").dropna(how="any")
test_df.describe(include="all").transpose()

# Simple classification

In [None]:
# logistics regression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import SGDClassifier
import numpy as np

# Logistic regression (SGD, log loss) on ALL features, followed by sigmoid calibration.
#
# Other regularisation strengths that can be swept:
# alpha = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001]
alpha = [0.0001]

# Model inputs: every column except the identifiers and the target. The original
# first fit forgot to exclude "Class", so the label itself was fed in as a feature.
feature_cols = [c for c in train_df.columns if c not in ["Variation_old", "ID", "Class"]]

for reg_alpha in alpha:
    # The swept value is SGDClassifier's `alpha`, not an inverse-regularisation "C".
    print("for alpha =", reg_alpha)
    clf = SGDClassifier(class_weight='balanced', alpha=reg_alpha, penalty='l2', loss='log', random_state=42)
    clf.fit(train_df[feature_cols], train_df["Class"])
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_df[feature_cols], train_df["Class"])

    # Training-set metrics. `labels` pins the probability columns to the fitted
    # classes, so log_loss stays valid even if a split is missing some class.
    pred_class = sig_clf.predict_proba(train_df[feature_cols])
    print("Log Loss :", log_loss(train_df["Class"], pred_class, labels=sig_clf.classes_))
    # argmax yields a column position; map it through classes_ to get the actual class label.
    pred_class_ = sig_clf.classes_[np.argmax(pred_class, axis=1)]

    # Test-set metrics (same feature columns as used for training).
    pred_test_class = sig_clf.predict_proba(test_df[feature_cols])
    print("Log Loss :", log_loss(test_df["Class"], pred_test_class, labels=sig_clf.classes_))
    pred_test_class_ = sig_clf.classes_[np.argmax(pred_test_class, axis=1)]



In [None]:
# Confusion matrix of the calibrated all-features model on the training rows.
plot_confusion_matrix(
    sig_clf,
    train_df.loc[:, [name for name in train_df.columns if name not in ("Variation_old", "ID", "Class")]],
    train_df["Class"],
)
# 56.9%  (figure noted by the author; presumably accuracy on this split — TODO confirm)

In [None]:
# Confusion matrix of the calibrated all-features model on the test rows
# (columns are chosen from train_df so the feature order matches training).
plot_confusion_matrix(
    sig_clf,
    test_df.loc[:, [name for name in train_df.columns if name not in ("Variation_old", "ID", "Class")]],
    test_df["Class"],
)
# 88.7%  (figure noted by the author; presumably accuracy on this split — TODO confirm)

In [None]:
# Baseline logistic regression (SGD, log loss) WITHOUT the text embeddings,
# followed by sigmoid calibration.
#
# Other regularisation strengths that can be swept:
# alpha = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001]
alpha = [0.001]

# Model inputs: drop identifiers, the target, and every column whose name contains
# "col" (the embedding columns are named "col_<n>").
base_feature_cols = [
    c for c in train_df.columns
    if (c not in ["Variation_old", "ID", "Class"]) and ("col" not in c)
]

for reg_alpha in alpha:
    # The swept value is SGDClassifier's `alpha`, not an inverse-regularisation "C".
    print("for alpha =", reg_alpha)
    clf_base = SGDClassifier(class_weight='balanced', alpha=reg_alpha, penalty='l2', loss='log', random_state=42)
    clf_base.fit(train_df[base_feature_cols], train_df["Class"])
    sig_clf_base = CalibratedClassifierCV(clf_base, method="sigmoid")
    sig_clf_base.fit(train_df[base_feature_cols], train_df["Class"])

    # Training-set metrics. `labels` pins the probability columns to the fitted
    # classes, so log_loss stays valid even if a split is missing some class.
    pred_class_base = sig_clf_base.predict_proba(train_df[base_feature_cols])
    print("Log Loss :", log_loss(train_df["Class"], pred_class_base, labels=sig_clf_base.classes_))
    # argmax yields a column position; map it through classes_ to get the actual class label.
    pred_class_base_ = sig_clf_base.classes_[np.argmax(pred_class_base, axis=1)]

    # Test-set metrics (same feature columns as used for training).
    pred_test_class_base = sig_clf_base.predict_proba(test_df[base_feature_cols])
    print("Log Loss :", log_loss(test_df["Class"], pred_test_class_base, labels=sig_clf_base.classes_))
    pred_test_class_base_ = sig_clf_base.classes_[np.argmax(pred_test_class_base, axis=1)]



In [None]:
# Confusion matrix of the calibrated baseline (no text embeddings) on the training rows.
plot_confusion_matrix(
    sig_clf_base,
    train_df.loc[
        :,
        [
            name for name in train_df.columns
            if (name not in ("Variation_old", "ID", "Class")) and ("col" not in name)
        ],
    ],
    train_df["Class"],
)
# 30%  (figure noted by the author; presumably accuracy on this split — TODO confirm)

In [None]:
# Confusion matrix of the calibrated baseline (no text embeddings) on the test rows
# (columns are chosen from train_df so the feature order matches training).
plot_confusion_matrix(
    sig_clf_base,
    test_df.loc[
        :,
        [
            name for name in train_df.columns
            if (name not in ("Variation_old", "ID", "Class")) and ("col" not in name)
        ],
    ],
    test_df["Class"],
)
# 75.3%  (figure noted by the author; presumably accuracy on this split — TODO confirm)