In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import argparse
import torch
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch import nn, optim
from data import HierDataModule
from data import infer_preprocess
from ERDE import ERDE_sample
from model import HierClassifier
from transformers import AutoTokenizer
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from collections import defaultdict, Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from model import HierClassifier
from ERDE import ERDE_chunk
import xml.dom.minidom
import string
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

## baseline model

In [None]:
# Load the cached bundle of posts, mappings, labels and embeddings for both
# splits (the file name suggests MiniLM-L6 sentence embeddings).
# NOTE: pickle.load can execute arbitrary code; only open trusted artifacts.
with open("processed/miniLM_L6_embs.pkl", "rb") as f:
    data = pickle.load(f)

# Train split; the pickle key "train_labels" is exposed as `train_tags`.
train_posts, train_mappings, train_tags, train_embs = (
    data[key]
    for key in ("train_posts", "train_mappings", "train_labels", "train_embs")
)
# Test split, same layout ("test_labels" becomes `test_tags`).
test_posts, test_mappings, test_tags, test_embs = (
    data[key]
    for key in ("test_posts", "test_mappings", "test_labels", "test_embs")
)

In [None]:
# Load the pickled baseline artifacts: the TF-IDF model (used via .transform
# below), the logistic-regression model and the linear SVM (per file names).
# Fix: the directory was spelled "Topic-Restrcted" here, while every checkpoint
# path later in this notebook uses "../Topic-Restricted/".
# NOTE(review): confirm the directory name on disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../Topic-Restricted/baseline_outputs/tfidf.pkl", "rb") as f:
    tfidf_model = pickle.load(f)
with open("../Topic-Restricted/baseline_outputs/lr.pkl", "rb") as f:
    lr = pickle.load(f)
# NOTE: `clf` is rebound to a HierClassifier in the deep-model cells further
# down, so the SVM cell must run before those cells.
with open("../Topic-Restricted/baseline_outputs/lsvm.pkl", "rb") as f:
    clf = pickle.load(f)

In [None]:
# Each entry of test_mappings is a sequence of indices into test_posts; the
# selected posts are joined with single spaces into one document per entry.
texts_test = [
    " ".join(test_posts[post_idx] for post_idx in post_ids)
    for post_ids in test_mappings
]
# Vectorize with the already-loaded TF-IDF model and show the matrix shape.
X_test = tfidf_model.transform(texts_test)
X_test.shape

In [None]:
# Score the logistic-regression baseline on the TF-IDF test matrix.
Y_test = np.array(test_tags)
Y_preds = lr.predict(X_test)
# Accuracy, precision, recall and F1, all with scikit-learn's default settings.
acc, p, r, f1 = (
    score_fn(Y_test, Y_preds)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print("Logistic Regression")
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

In [None]:
# Score the linear-SVM baseline; reuses Y_test and X_test from the cells above.
Y_preds = clf.predict(X_test)
acc, p, r, f1 = (
    score_fn(Y_test, Y_preds)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
# Model name and metrics on two lines, exactly as two separate prints would emit.
print(
    "Linear SVM",
    f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}",
    sep="\n",
)

### mental health filtered

In [None]:
# Load the second set of pickled baseline artifacts from baseline_outputs_all;
# these replace the tfidf_model / lr / clf objects loaded earlier.
# Fix: the directory was spelled "Topic-Restrcted" here, while every checkpoint
# path later in this notebook uses "../Topic-Restricted/".
# NOTE(review): confirm the directory name on disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../Topic-Restricted/baseline_outputs_all/tfidf.pkl", "rb") as f:
    tfidf_model = pickle.load(f)
with open("../Topic-Restricted/baseline_outputs_all/lr.pkl", "rb") as f:
    lr = pickle.load(f)
# NOTE: `clf` is rebound to a HierClassifier in the deep-model cells further
# down, so the SVM cell must run before those cells.
with open("../Topic-Restricted/baseline_outputs_all/lsvm.pkl", "rb") as f:
    clf = pickle.load(f)

In [None]:
# Rebuild one space-joined document per entry of test_mappings and vectorize
# it with the TF-IDF model that is currently loaded.
texts_test = [
    " ".join(test_posts[idx] for idx in index_group)
    for index_group in test_mappings
]
X_test = tfidf_model.transform(texts_test)
# Last expression: display the shape of the resulting matrix.
X_test.shape

In [None]:
# Score the currently loaded logistic-regression model on X_test.
Y_test = np.array(test_tags)
Y_preds = lr.predict(X_test)
acc, p, r, f1 = (
    metric(Y_test, Y_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# Model name and metrics on two lines, exactly as two separate prints would emit.
print(
    "Logistic Regression",
    f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}",
    sep="\n",
)

In [None]:
# Score the currently loaded linear SVM; Y_test and X_test come from the cells
# above.
Y_preds = clf.predict(X_test)
# Accuracy, precision, recall and F1, all with scikit-learn's default settings.
acc, p, r, f1 = (
    metric(Y_test, Y_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print("Linear SVM")
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

## deep model

In [None]:
def infer_texts(texts):
    """Score one list of texts with the currently loaded classifier.

    Reads the module-level ``tokenizer``, ``max_len`` and ``clf`` at call
    time, so it uses whichever checkpoint was loaded most recently.

    Args:
        texts: The texts to score; passed to ``infer_preprocess`` unchanged.

    Returns:
        A ``(probability, attention)`` tuple: the sigmoid of the model's logit
        as a Python number (``.item()`` requires a single-element tensor), and
        the first entry of the returned attention scores as a NumPy array.
    """
    batch = infer_preprocess(tokenizer, texts, max_len)
    # Move every value of the batch onto the GPU, updating the mapping in place.
    for key, tensor in list(batch.items()):
        batch[key] = tensor.cuda()
    # Inference only, so no autograd graph is recorded for the forward pass.
    with torch.no_grad():
        logits, attn_score = clf([batch])
    proba = torch.sigmoid(logits).detach().cpu().item()
    first_attn = attn_score[0].detach().cpu().numpy()
    return proba, first_attn

### combined sim 16

In [None]:
# Restore the classifier from a checkpoint. The path below is a placeholder
# and must be replaced with a real checkpoint file before running.
clf = HierClassifier.load_from_checkpoint("[pretrained ckpt at Topic-Restricted]")
# Switch to evaluation mode and move the model to the GPU.
clf.eval()
clf.cuda()
# The tokenizer name and maximum length are read from the restored model.
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
max_len = clf.hparams.max_len
max_posts = 16  # not referenced by the cells visible in this notebook section
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/combined_maxsim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

In [None]:
# Restore the classifier from the version_9 checkpoint, switch it to
# evaluation mode and move it to the GPU.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_9/checkpoints/epoch=1-step=659.ckpt"
)
clf.eval()
# The tokenizer name and maximum length are read from the restored model.
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
max_len = clf.hparams.max_len
max_posts = 16  # not referenced by the cells visible in this notebook section
clf.cuda()
# Fix: end the cell with None, as the other checkpoint-loading cells do, so the
# notebook does not echo the value returned by clf.cuda().
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/combined_maxsim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

In [None]:
# Restore the classifier from the version_16 checkpoint.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_16/checkpoints/epoch=3-step=1319.ckpt"
)
# Evaluation mode, then onto the GPU.
clf.eval()
clf.cuda()
max_posts = 16  # not referenced by the cells visible in this notebook section
# Maximum length and tokenizer name both come from the restored model.
max_len = clf.hparams.max_len
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/combined_maxsim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

In [None]:
# Restore the classifier from the version_18 checkpoint.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_18/checkpoints/epoch=2-step=989.ckpt"
)
clf.eval()  # evaluation mode
clf.cuda()  # move the model to the GPU
# Settings read by infer_texts: tokenizer and maximum length from the model.
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
max_len = clf.hparams.max_len
max_posts = 16  # not referenced by the cells visible in this notebook section
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/combined_maxsim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

### depress 16

In [None]:
# Restore the classifier from the version_11 checkpoint.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_11/checkpoints/epoch=2-step=989.ckpt"
)
# Evaluation mode, then onto the GPU.
clf.eval()
clf.cuda()
# The tokenizer name and maximum length are read from the restored model.
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
max_len = clf.hparams.max_len
max_posts = 16  # not referenced by the cells visible in this notebook section
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/depress_sim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

In [None]:
# Restore the classifier from the version_17 checkpoint.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_17/checkpoints/epoch=3-step=1319.ckpt"
)
clf.eval()  # evaluation mode
clf.cuda()  # move the model to the GPU
max_posts = 16  # not referenced by the cells visible in this notebook section
# Maximum length and tokenizer name both come from the restored model.
max_len = clf.hparams.max_len
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
input_dir = "./processed/depress_sim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")

### depress 64

In [None]:
# Restore the classifier from the version_15 checkpoint.
clf = HierClassifier.load_from_checkpoint(
    "../Topic-Restricted/lightning_logs/version_15/checkpoints/epoch=3-step=1319.ckpt"
)
# Evaluation mode, then onto the GPU.
clf.eval()
clf.cuda()
# The tokenizer name and maximum length are read from the restored model.
tokenizer = AutoTokenizer.from_pretrained(clf.model_type)
max_len = clf.hparams.max_len
# NOTE(review): this cell sits under the "depress 64" heading but still sets
# max_posts to 16; confirm whether 64 was intended. The value is not
# referenced by the cells visible in this notebook section.
max_posts = 16
# Trailing None keeps the notebook from echoing a value for this cell.
None

In [None]:
# Evaluate the currently loaded `clf` on every file of the test directory.
# NOTE(review): this cell sits under the "depress 64" heading but reads the
# depress_sim16 test directory; confirm which directory is intended.
input_dir = "./processed/depress_sim16/test"
Y_preds = []
Y_test = []
# sorted() makes the processing order reproducible; os.listdir order is
# arbitrary. The metrics themselves do not depend on the order.
for fname in tqdm(sorted(os.listdir(input_dir))):
    # Fix: read through a context manager so each file handle is closed right
    # away instead of being left open until garbage collection.
    with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
        # The file's lines are passed to infer_texts as the list of posts.
        posts = f.read().strip().split("\n")
    # The label is the character at index -5 of the file name, i.e. the one
    # right before a four-character suffix (presumably ".txt"; TODO confirm).
    label = float(fname[-5])
    proba, attn_score = infer_texts(posts)
    Y_test.append(label)
    # Threshold the sigmoid probability at 0.5 to get a hard 0/1 prediction.
    Y_preds.append(float(proba >= 0.5))
Y_preds = np.array(Y_preds)
Y_test = np.array(Y_test)
acc = accuracy_score(Y_test, Y_preds)
p = precision_score(Y_test, Y_preds)
r = recall_score(Y_test, Y_preds)
f1 = f1_score(Y_test, Y_preds)
print(f"Acc: {acc:.5f}, P: {p:.5f}, R: {r:.5f}, F: {f1:.5f}")