# Identifying and classifying v-safe free-text responses related to menstruation or vaginal bleeding after COVID-19 vaccination

This notebook contains code for data pre-processing, filtering by search strings, and zero-shot classification. Zero-shot classification code is adapted from https://huggingface.co/facebook/bart-large-mnli.

In [None]:
from transformers import pipeline
import torch
import numpy as np
import pandas as pd
import pickle
import ast
import re
from tqdm import tqdm
import csv
from ftfy import fix_encoding

## Import data

In [None]:
# Load the check-in export into a DataFrame.
chk = pd.read_csv("chkin_0109.csv")
# Clean the "sx_txt" text column in one pass: repair mis-encoded characters
# with ftfy first, then trim leading/trailing whitespace.
chk["sx_txt"] = chk["sx_txt"].apply(lambda raw_txt: fix_encoding(raw_txt).strip())

## Filter text by search strings

In [None]:
# Search patterns applied to lower-cased response text by the keyword filter.
#
# crit1: menstruation-related terms that qualify a response on their own.
#        `period(s\b|\b)` accepts "period"/"periods" but not e.g. "periodic".
crit1 = (
    r"menses|menst|spotting|period(s\b|\b)|\bcycle|\bmiscarr|menorrh|metrorrh"
)
# crit2: bleeding terms; these only count together with a crit3 match.
crit2 = r"bleed|blood"
# crit3: reproductive-context terms that must co-occur with a crit2 term.
crit3 = (
    r"menop|\bpreg|\buter|hyster|placent|\bvag|breakthrough|(break through)|endomet|\bgest|term\b|trimester"
)

In [None]:
def _matches_search_criteria(text):
    """Return True when *text* passes the keyword filter.

    A response passes when it contains a crit1 term, or when it contains
    both a crit2 term and a crit3 term. Matching is done on the
    lower-cased text, so the patterns are effectively case-insensitive.
    """
    lowered = text.lower()
    if re.search(crit1, lowered):
        return True
    # Each match is converted to bool separately so the AND is explicit
    # rather than relying on how a Match/None value short-circuits.
    return bool(re.search(crit2, lowered)) and bool(re.search(crit3, lowered))


# Boolean mask over chk rows, computed in a single pass over "sx_txt".
kwfilter = chk["sx_txt"].apply(_matches_search_criteria)

In [None]:
# Keep only the rows flagged by the keyword filter ...
mens_txt = chk[kwfilter]
# ... and renumber them from 0. reset_index() is called without drop=True,
# so the previous row labels are kept as a new "index" column.
mens_txt = mens_txt.reset_index()

## Zero-shot classification

In [None]:
# Build the zero-shot classification pipeline on facebook/bart-large-mnli.
# Run on the first CUDA GPU when one is available; otherwise fall back to
# CPU (device=-1) so the notebook does not fail on machines without a GPU.
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Full set of candidate labels passed to the zero-shot classifier.
topics = [
    "This mentions a period that came early",
    "This mentions menstruation",
    "This mentions a period that came late",
    "This mentions spotting",
    "This mentions a heavy menstrual period",
    "This mentions vaginal bleeding",
    "This mentions uterine bleeding",
    "This mentions a painful menstrual period",
    "This mentions prolonged bleeding specifically",
    "This mentions an irregular period specifically",
    "This specifically mentions missing or skipping a period",
    "This mentions not having a period for years",
]

# Thematic groupings of the labels above. Every entry must stay textually
# identical to its counterpart in `topics`; some labels belong to more than
# one group.
core_top = [
    "This mentions menstruation",
    "This mentions spotting",
    "This mentions vaginal bleeding",
    "This mentions uterine bleeding",
    "This specifically mentions missing or skipping a period",
]

timing_top = [
    "This mentions a period that came early",
    "This mentions a period that came late",
    "This mentions spotting",
    "This mentions an irregular period specifically",
    "This specifically mentions missing or skipping a period",
]

severe_top = [
    "This mentions a heavy menstrual period",
    "This mentions a painful menstrual period",
    "This mentions prolonged bleeding specifically",
]

yrs_top = [
    "This mentions not having a period for years",
]

In [None]:
# Score every filtered response against every candidate label.
# The pipeline documents its input as a str or a list of str, so the column
# is converted to a plain list; a pandas Series is neither and may not be
# treated as a batch of sequences by every transformers version.
# multi_label=True requests independent per-label scores rather than scores
# that compete across labels.
res = classifier(mens_txt["sx_txt"].tolist(), topics, multi_label=True)