codelab link: https://colab.research.google.com/github/abzzall/unsupervised_annotator1/blob/main/main.ipynb
github link: https://github.com/abzzall/unsupervised_annotator1.git

1. Extract multiword expression candidates. 
	1. Using the part-of-speech tags we extract multiword expression candidates, consisting of sequences of zero or more adjectives (ADJ followed by nouns (NOUN) or proper nouns (PROPNs) sequences.
	2. To generate training data for sequence tagging use sentence encoder like 
		a. EmbedRank (Bennani- Smires et al., 2018
		b. Key2Vec (Mahata et al., 2018)
    3. We implemented our Unsupervised Annotator using the POS tagger of SpaCy (Honnibal et al., 2020).

In [1]:
import re
!! pip install spacy 



In [2]:
!! python -m spacy download en_core_web_sm
!!pip install sentence-transformers




In [3]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
# Load the language model
nlp = spacy.load('en_core_web_sm')

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
from sentence_transformers import SentenceTransformer, util
from typing import List

In [4]:

# Define your text
with open('abstracts.txt') as file:
    text=file.read()


In [5]:
# text = "I love New York and San Francisco. Los Angeles is another great city."


In [6]:
from collections import namedtuple
CandidateMWE = namedtuple('CandidateMWE',['text','head', 'sentence','self_encode', 'sent_encode'])
CandidateW=namedtuple('CandidateW',['text','lemma', 'self_encode' ])
Term=namedtuple('Term',['text','detected' ])

In [7]:
# Process the text with spaCy
doc = nlp(text)

# Extract MWEs (noun phrases) from the text
mwe_list = []
single_noun_list=[]
candidate_list=[]
for sent in doc.sents:
    sent_encode=model.encode(sent.text)
    for chunk in doc.noun_chunks:
        is_candidate=False
        word_count=0
        # mwe_list.append(chunk.text)
        if len(chunk.text.split()) > 1:
            noun_appeared=False
            is_candidate=True
            cleared_candidate=''
            for word in chunk:
                #IGNORING
                if word.pos_ in ['PUNCT', 'DET']:
                    continue
                elif word.pos_ not in ['ADJ', 'PROPN', 'NOUN']:
                    is_candidate=False
                    # print(f'{chunk.text} is not candidate 1 {word.text} -- {word.pos_}')
                    break
                elif word.pos_ in ['PROPN', 'NOUN']:
                    noun_appeared=True
                elif not(not noun_appeared and word.pos_=='ADJ'):
                    is_candidate=False
                    print(f'{chunk.text} is not candidate 2 {word.text} -- {word.pos_}, {noun_appeared}')
                    break
                cleared_candidate+=' '+word.text
                word_count+=1
        if is_candidate and word_count>1:
            cleared_candidate.strip()
            candidate_list.append(CandidateMWE(cleared_candidate, chunk.root.text, sent.text, model.encode(cleared_candidate), sent_encode))
            # print(cleared_candidate)
        else:
            single_noun_list +=[CandidateW(word.text, word.lemma_, model.encode(word.text)) for word in chunk if word.pos_ in ['NOUN', 'PROPN']]
                    
mwe_list=candidate_list+single_noun_list
# print(mwe_list)


context aware expectation maximization is not candidate 2 aware -- ADJ, True
label-rich source data is not candidate 2 rich -- ADJ, True
image-level weak labels is not candidate 2 weak -- ADJ, True
category-wise domain alignment is not candidate 2 wise -- ADJ, True
context aware expectation maximization is not candidate 2 aware -- ADJ, True
label-rich source data is not candidate 2 rich -- ADJ, True
image-level weak labels is not candidate 2 weak -- ADJ, True
category-wise domain alignment is not candidate 2 wise -- ADJ, True
context aware expectation maximization is not candidate 2 aware -- ADJ, True
label-rich source data is not candidate 2 rich -- ADJ, True
image-level weak labels is not candidate 2 weak -- ADJ, True
category-wise domain alignment is not candidate 2 wise -- ADJ, True
context aware expectation maximization is not candidate 2 aware -- ADJ, True
label-rich source data is not candidate 2 rich -- ADJ, True
image-level weak labels is not candidate 2 weak -- ADJ, True
cate

In [8]:
candidate_list[0].self_encode
    
    

array([-9.63636696e-01, -4.82337564e-01, -2.30158530e-02,  7.74058849e-02,
       -1.63760021e-01, -3.60837460e-01,  4.37124789e-01, -8.23607624e-01,
       -3.36114585e-01,  6.52776599e-01,  5.74865699e-01,  4.84254926e-01,
        4.71410275e-01, -1.10581350e+00,  2.74779230e-01, -2.07976341e-01,
        1.38057208e+00,  2.43439525e-01, -1.04358041e+00, -5.37778914e-01,
       -3.57596308e-01, -2.67703086e-01, -1.13106549e-01,  7.80642182e-02,
       -1.02981734e+00,  1.02077913e+00,  6.20207191e-01, -4.54958603e-02,
        3.62546146e-01, -9.64263529e-02, -1.06257915e-01,  7.59985566e-01,
        4.87566322e-01, -5.96640289e-01,  9.43193913e-01,  7.98237920e-02,
        2.52023578e-01,  5.40241718e-01, -2.27881268e-01, -1.17047536e+00,
        4.21555132e-01,  3.46142650e-01, -1.84256867e-01, -8.23570609e-01,
       -9.60549042e-02,  2.16798726e-02, -7.39248872e-01,  6.24549031e-01,
       -1.02267861e+00, -5.02260327e-01, -2.98217386e-01,  1.03133261e+00,
        1.37354052e+00,  

In [28]:
def dist(wi_encode, wj_encode)->float:
    return util.pytorch_cos_sim(
        wi_encode,
        wj_encode
    )


def calculate_topic_score(expression_embedding, sentence_embedding)->float:
    """
    Calculate the topic score between a multiword expression and a sentence.

    Args:
        multiword_expression (str): The multiword expression.
        sentence (str): The sentence containing the expression.

    Returns:
        float: The topic score (cosine similarity) between the two embeddings.
    """
    # Load the distilbert-base-nli-mean-tokens model

    # Encode the multiword expression and sentence into embeddings
    # expression_embedding = model.encode(multiword_expression, convert_to_tensor=True)
    # sentence_embedding = model.encode(sentence, convert_to_tensor=True)

    # Calculate cosine similarity between the two embeddings
    similarity_score = util.pytorch_cos_sim(expression_embedding, sentence_embedding)

    # Extract the cosine similarity value from the tensor
    topic_score = similarity_score[0].item()

    return topic_score


def calculate_specificity_score(mw:CandidateMWE, w:List[CandidateW|CandidateMWE])->float:
    """
    Calculate the specificity score (SP) between a multiword expression (mw) and a list of words/multiword expressions (w).

    Args:
        mw (str): The multiword expression.
        w (list of str): The list of words/multiword expressions in the context.

    Returns:
        float: The specificity score (SP).
    """
    # Load the distilbert-base-nli-mean-tokens model
    # Calculate distances between mw and each word/phrase in w
    distances = [dist(mw.self_encode, wi.self_encode) for wi in w if wi.text != mw.text]

    # Calculate the mean of the distances
    specificity_score = sum(distances) / len(w)

    return specificity_score

def calculate_specifity_score_context_itself(mw:CandidateMWE):
    candidate_embedding=mw.self_encode
    s=0
    for word in mw.text.split():
        word_embedding=model.encode(word, convert_to_tensor=True)
        distance=dist(candidate_embedding, word_embedding)
        s+=distance
    return s/len(mw.text.split())

def calculate_specifity_score_context_sentence(mw:CandidateMWE):
    candidate_embedding=mw.self_encode
    s=0
    c=0
    sentence=mw.sentence
    doc=nlp(sentence)
    for word in doc.noun_chunks:
        word_embedding=model.encode(word.text, convert_to_tensor=True)
        distance=dist(candidate_embedding, word_embedding)
        s+=distance
        c+=1
    return s/c




In [29]:
TSP = 0.05
Ttopic = 0.1

In [30]:
from datetime import datetime

First Variant

In [12]:
term_mws1=[]

In [13]:
start_time=datetime.now()
for candidate in candidate_list:
    topic_score=calculate_topic_score(candidate.self_encode, candidate.sent_encode)
    sp_score=calculate_specificity_score(candidate, mwe_list)
    if topic_score>Ttopic and sp_score>TSP:
        term_mws1.append(Term(candidate.text, 'by_score'))
        print(candidate.text, topic_score, sp_score)
end_time=datetime.now()
delta=end_time-start_time
print(f'duration: {delta.seconds}')

 low shot tasks 0.5353471636772156 tensor([[0.5992]])
 low shot learning 0.6060268878936768 tensor([[0.5303]])
 richer supervision 0.256664901971817 tensor([[0.5244]])
 annotator rationales 0.20491208136081696 tensor([[0.6847]])
 label annotations 0.16839924454689026 tensor([[0.6595]])
 low shot text classification 0.4988535940647125 tensor([[0.5295]])
 simple bag 0.3140365183353424 tensor([[0.5702]])
 BERT model 0.12237536907196045 tensor([[0.5466]])
 substantial performance gains 0.25737330317497253 tensor([[0.5767]])
 clear top performer 0.1776266098022461 tensor([[0.4290]])
 more complex models 0.35485905408859253 tensor([[0.5394]])
 more training data 0.35459399223327637 tensor([[0.4961]])
 Current dialogue summarization systems 0.2164538949728012 tensor([[0.6222]])
 general semantic features 0.2545032203197479 tensor([[0.6361]])
 open domain toolkits 0.15248295664787292 tensor([[0.4972]])
 human annotations 0.23864704370498657 tensor([[0.6514]])
 conversational response generatio

Second Variant. Filtering out by topic score first

In [20]:
term_mws2=[]

In [21]:
start_time=datetime.now()

temp_candidate=[]
for candidate in candidate_list:
    topic_score=calculate_topic_score(candidate.self_encode, candidate.sent_encode)
    # sp_score=calculate_specificity_score(candidate, mwe_list)
    if topic_score>Ttopic:
        temp_candidate.append(candidate)
for candidate in temp_candidate:
    # topic_score=calculate_topic_score(candidate.self_encode, candidate.sent_encode)
    sp_score=calculate_specificity_score(candidate, temp_candidate)
    if sp_score>TSP:
        term_mws2.append(Term(candidate.text, 'by_score'))        
        print(candidate.text, sp_score)
end_time=datetime.now()
delta=end_time-start_time
print(f'duration: {delta.seconds}')

 low shot tasks tensor([[0.5610]])
 low shot learning tensor([[0.5192]])
 richer supervision tensor([[0.4985]])
 annotator rationales tensor([[0.6392]])
 label annotations tensor([[0.6065]])
 low shot text classification tensor([[0.5165]])
 simple bag tensor([[0.5248]])
 BERT model tensor([[0.4977]])
 substantial performance gains tensor([[0.5357]])
 clear top performer tensor([[0.4173]])
 more complex models tensor([[0.5216]])
 more training data tensor([[0.4830]])
 Current dialogue summarization systems tensor([[0.5824]])
 general semantic features tensor([[0.6065]])
 open domain toolkits tensor([[0.4890]])
 human annotations tensor([[0.6040]])
 conversational response generation tensor([[0.5800]])
 unsupervised dialogue annotator tensor([[0.5446]])
 dialogue background knowledge tensor([[0.6391]])
 Experimental results tensor([[0.5899]])
 remarkable improvements tensor([[0.5418]])
 consistent ground truth data tensor([[0.5034]])
 interest points tensor([[0.6350]])
 natural images te


KeyboardInterrupt



Third Variant: Context is itself

In [31]:
term_mws3=[]

In [32]:
start_time=datetime.now()

for candidate in candidate_list:
    topic_score=calculate_topic_score(candidate.self_encode, candidate.sent_encode)
    sp_score=calculate_specifity_score_context_itself(candidate)
    if topic_score>Ttopic and sp_score>TSP:
        term_mws3.append(Term(candidate.text, 'by_score'))
        print(candidate.text, topic_score, sp_score)
end_time=datetime.now()
delta=end_time-start_time
print(f'duration: {delta.seconds}')

 low shot tasks 0.5353471636772156 tensor([[0.7478]])
 low shot learning 0.6060268878936768 tensor([[0.6851]])
 richer supervision 0.256664901971817 tensor([[0.7765]])
 annotator rationales 0.20491208136081696 tensor([[0.8888]])
 label annotations 0.16839924454689026 tensor([[0.8769]])
 low shot text classification 0.4988535940647125 tensor([[0.6417]])
 simple bag 0.3140365183353424 tensor([[0.8375]])
 BERT model 0.12237536907196045 tensor([[0.7785]])
 substantial performance gains 0.25737330317497253 tensor([[0.8027]])
 clear top performer 0.1776266098022461 tensor([[0.6693]])
 more complex models 0.35485905408859253 tensor([[0.6772]])
 more training data 0.35459399223327637 tensor([[0.6281]])
 Current dialogue summarization systems 0.2164538949728012 tensor([[0.7574]])
 general semantic features 0.2545032203197479 tensor([[0.7528]])
 open domain toolkits 0.15248295664787292 tensor([[0.6469]])
 human annotations 0.23864704370498657 tensor([[0.8550]])
 conversational response generatio

Forth Variant: Context is sentence

In [33]:
term_mws3 = []


In [34]:
start_time=datetime.now()

for candidate in candidate_list:
	topic_score = calculate_topic_score(candidate.self_encode, candidate.sent_encode)
	sp_score = calculate_specifity_score_context_sentence(candidate)
	if topic_score > Ttopic and sp_score > TSP:
		term_mws3.append(Term(candidate.text, 'by_score'))
		print(candidate.text, topic_score, sp_score)
end_time=datetime.now()
delta=end_time-start_time
print(f'duration: {delta.seconds}')

 low shot tasks 0.5353471636772156 tensor([[0.7215]])
 low shot learning 0.6060268878936768 tensor([[0.6458]])
 richer supervision 0.256664901971817 tensor([[0.5146]])
 annotator rationales 0.20491208136081696 tensor([[0.6160]])
 label annotations 0.16839924454689026 tensor([[0.5910]])
 low shot text classification 0.4988535940647125 tensor([[0.5966]])
 simple bag 0.3140365183353424 tensor([[0.6413]])
 BERT model 0.12237536907196045 tensor([[0.4834]])
 substantial performance gains 0.25737330317497253 tensor([[0.5478]])
 clear top performer 0.1776266098022461 tensor([[0.3997]])
 more complex models 0.35485905408859253 tensor([[0.4874]])
 more training data 0.35459399223327637 tensor([[0.5327]])
 Current dialogue summarization systems 0.2164538949728012 tensor([[0.5825]])
 general semantic features 0.2545032203197479 tensor([[0.6294]])
 open domain toolkits 0.15248295664787292 tensor([[0.4428]])
 human annotations 0.23864704370498657 tensor([[0.6332]])
 conversational response generatio

3. Upgrade single nouns according to morphological features.
	1. At this stage, we could have nouns that are not part of any multiword expressions, but still relevant.
	2. Check if the lemma of the noun is the same as any of the heads of the multiword expressions.
		a. Yes: we upgrade the noun to term 
		b. No: segment the word using a subword-unit segmentation and a vocabulary trained over a large general purpose corpus.
we use the vocabulary of the BERT-base model from HuggingFace (Wolf et al., 2020) and the corresponding tokenizer.

In [None]:
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
term_nouns=[]
subtoken_threshold=4
for candidate in single_noun_list:
    #Check if the lemma of the noun is the same as any of the heads of the multiword expressions.
    is_term=False
    lemma_is_head=False
    for term_mw in term_mws:
        if term_mw.head==candidate.lemma:
            is_term=True
            break
    if is_term:
        term_nouns.append(Term(candidate.text, 'by_lemma'))
        continue
    #segment the word using a subword-unit segmentation and a vocabulary trained over a large general purpose corpus.
    subtokens=tokenizer.tokenize(candidate.text)
    if len(subtokens)>subtoken_threshold:
        term_nouns.append(Term(candidate.text, 'by_subtokens'))

In [None]:
terms=term_mws+term_nouns
with open('out.txt', 'w') as out_file:
    for term in terms:
        out_file.write(f'"{term.text}" appended {term.detected}\n')
        print(f'"{term.text}" appended {term.detected}\n')
        