In [None]:
from nlp_pipelines.dataset import Dataset
from nlp_pipelines.pipeline import Pipeline
import copy
from nlp_pipelines.evaluate import evaluate

In [None]:
# Load the abstracts from the bundled JSON file: the "abstract" field supplies
# the text and the "keywords" field supplies the ground-truth labels.
abstracts = Dataset.from_json(
    "./demo_data/springer-127-parsed.json",
    text_field="abstract",
    truth_field="keywords",
)

In [None]:
# The ten keywords (noted by the author as the top 10 true keywords) that are
# kept as classification targets. All entries are lowercase.
possible_labels = [
    "mental health",
    "depression",
    "schizophrenia",
    "covid-19",
    "suicide",
    "anxiety",
    "loneliness",
    "psychosis",
    "epidemiology",
    "mental disorders",
]

# Derive a separate dataset so the filtering below leaves `abstracts` intact.
abstracts_labels = copy.deepcopy(abstracts)

# Lowercase every keyword and keep only those present in possible_labels.
# Documents whose keywords all fall outside the list end up with an empty list.
known_labels = frozenset(possible_labels)
abstracts_labels.truths = [
    [label for label in (keyword.lower() for keyword in keywords) if label in known_labels]
    for keywords in abstracts_labels.truths
]

# Train/test split at a 0.8 ratio with a fixed seed for reproducibility.
# NOTE(review): per the original author's note, labeled=True is expected to
# drop documents whose keyword list is now empty -- confirm against
# Dataset.split.
train, test = abstracts_labels.split(
    ratio=0.8,
    seed=101,
    labeled=True,
    splitLabeled=True,
)

# Show both subsets, one per line.
print(train, test, sep="\n")

# Independent deep copies of the split, one pair per pipeline
# (suffix 3 is used by the logistic pipeline, suffix 4 by the neural-net one).
train_3, test_3 = copy.deepcopy(train), copy.deepcopy(test)
train_4, test_4 = copy.deepcopy(train), copy.deepcopy(test)

# Logistic-regression pipeline: lemmatize -> sentence embeddings -> MultiLogistic labeler.
logistic_steps = [
    dict(name="lemmatize", method="preprocess.Lemmatize"),
    dict(
        name="vectorize",
        method="vectorizer.SentenceEmbedding",
        params=dict(model_name='all-MiniLM-L6-v2'),
    ),
    dict(name="logistic", method="labeler.MultiLogistic"),
]
logistic_pipeline = Pipeline(logistic_steps)

# Attach this pipeline's own copies of the train/test data and the label set.
logistic_pipeline.set_data(
    train_data=train_3,
    run_data=test_3,
    possible_labels=possible_labels,
)

# Neural-net pipeline: same preprocessing and embedding steps as the logistic
# pipeline, but labeled by SimpleNNLabeler with threshold 0.2.
# NOTE(review): the last step is still named "logistic" although it uses
# SimpleNNLabeler -- looks like a copy-paste leftover; confirm whether the
# step name is used anywhere before renaming it.
nn_steps = [
    dict(name="lemmatize", method="preprocess.Lemmatize"),
    dict(
        name="vectorize",
        method="vectorizer.SentenceEmbedding",
        params=dict(model_name='all-MiniLM-L6-v2'),
    ),
    dict(
        name="logistic",
        method="labeler.SimpleNNLabeler",
        params=dict(threshold=0.2),
    ),
]
nn_pipeline = Pipeline(nn_steps)

# Attach this pipeline's own copies of the train/test data and the label set.
nn_pipeline.set_data(
    train_data=train_4,
    run_data=test_4,
    possible_labels=possible_labels,
)

# Run both pipelines before reporting anything.
logistic_pipeline.run()
nn_pipeline.run()

# Report the evaluation of each pipeline's run data under its own heading.
print("Logistic Regression Results")
logistic_scores = evaluate(logistic_pipeline.run_data)
print(logistic_scores)

print("Neural Net Results")
nn_scores = evaluate(nn_pipeline.run_data)
print(nn_scores)

# Preview the first three neural-net results next to their true labels.
first_three = slice(0, 3)
print(nn_pipeline.run_data.results[first_three])
print(nn_pipeline.run_data.truths[first_three])