Installing necessary modules and run necessary scripts

In [17]:
from torch import Tensor
!!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

['Looking in indexes: https://download.pytorch.org/whl/cu121',
 '',
 '[notice] A new release of pip is available: 23.2.1 -> 23.3.1',
 '[notice] To update, run: python.exe -m pip install --upgrade pip']

In [18]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [19]:
!! pip install spacy
!! python -m spacy download en_core_web_sm
!!pip install sentence-transformers

 '',
 '[notice] A new release of pip is available: 23.2.1 -> 23.3.1',
 '[notice] To update, run: python.exe -m pip install --upgrade pip']

Importing modules

In [20]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
spacy.prefer_gpu()

# Load the language model
nlp = spacy.load('en_core_web_sm')

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens', device=device)
from sentence_transformers import SentenceTransformer, util
from typing import List
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [21]:
from collections import namedtuple
CandidateMWE = namedtuple('CandidateMWE',['text','head', 'sentence','self_encode', 'sent_encode'])
CandidateW=namedtuple('CandidateW',['text','lemma', 'self_encode' ])
# Term=namedtuple('Term',['text','detected' ])

creating functions

In [22]:
def parse_candidates(text:str):   
    # Process the text with spaCy
    doc = nlp(text)
    
    # Extract MWEs (noun phrases) from the text
    mwe_list = []
    single_noun_list=dict()
    candidate_list=[]
    for sent in doc.sents:
        sent_encode=model.encode(sent.text, convert_to_tensor=True).to(device)
        for chunk in doc.noun_chunks:
            is_candidate=False
            word_count=0
            # mwe_list.append(chunk.text)
            if len(chunk.text.split()) > 1:
                noun_appeared=False
                is_candidate=True
                cleared_candidate=''
                for word in chunk:
                    #IGNORING
                    if word.pos_ in ['PUNCT', 'DET']:
                        continue
                    elif word.pos_ not in ['ADJ', 'PROPN', 'NOUN']:
                        is_candidate=False
                        # print(f'{chunk.text} is not candidate 1 {word.text} -- {word.pos_}')
                        break
                    elif word.pos_ in ['PROPN', 'NOUN']:
                        noun_appeared=True
                    elif not(not noun_appeared and word.pos_=='ADJ'):
                        is_candidate=False
                        # print(f'{chunk.text} is not candidate 2 {word.text} -- {word.pos_}, {noun_appeared}')
                        break
                    cleared_candidate+=' '+word.text
                    word_count+=1
            if is_candidate and word_count>1:
                cleared_candidate=cleared_candidate.strip()
                candidate_list.append(CandidateMWE(cleared_candidate, chunk.root.text, sent.text, model.encode(cleared_candidate, convert_to_tensor=True).to(device), sent_encode))
                # print(f'Added candicate expression: {cleared_candidate}')
            else:
                
                # print(f'Added candidate words from : {chunk.text}')    
                for word in chunk:
                    if word.pos_ in ['NOUN', 'PROPN']:
                        single_noun_list[word.text]=  CandidateW(word.text, word.lemma_, model.encode(word.text))
                        # print(word.text)    
    return candidate_list, single_noun_list.values()

In [23]:
def dist(wi_encode, wj_encode):
    return util.pytorch_cos_sim(
        wi_encode,
        wj_encode
    )
def calculate_topic_score(expression_embedding, sentence_embedding)->float:
    """
    Calculate the topic score between a multiword expression and a sentence.

    Args:
        multiword_expression (str): The multiword expression.
        sentence (str): The sentence containing the expression.

    Returns:
        float: The topic score (cosine similarity) between the two embeddings.
    """
    # Load the distilbert-base-nli-mean-tokens model

    # Encode the multiword expression and sentence into embeddings
    # expression_embedding = model.encode(multiword_expression, convert_to_tensor=True)
    # sentence_embedding = model.encode(sentence, convert_to_tensor=True)

    # Calculate cosine similarity between the two embeddings
    similarity_score = util.pytorch_cos_sim(expression_embedding, sentence_embedding)

    # Extract the cosine similarity value from the tensor
    topic_score = similarity_score[0].item()

    return topic_score


def calculate_specificity_score(mw:CandidateMWE, full_encode:Tensor)->float:
    """
    Calculate the specificity score (SP) between a multiword expression (mw) and a list of words/multiword expressions (w).

    Args:
        mw (str): The multiword expression.
        w (list of str): The list of words/multiword expressions in the context.

    Returns:
        float: The specificity score (SP).
    """
    # Load the distilbert-base-nli-mean-tokens model
    # Calculate distances between mw and each word/phrase in w
    distances = dist(mw.self_encode,full_encode)

    # Calculate the mean of the distances
    specificity_score = distances.mean().item()
    

    return specificity_score


In [24]:
def detect_mw_terms(candidate_list:List[CandidateMWE], TSP:float = 0.05, Ttopic:float = 0.1)->List[CandidateMWE]:    
            
    full_encode= torch.stack([wi.self_encode for wi in candidate_list], dim=0)    
    
    temp_candidate = []
    for candidate in candidate_list:
        topic_score = calculate_topic_score(candidate.self_encode, candidate.sent_encode)
        sp_score=calculate_specificity_score(candidate, full_encode)
        if topic_score > Ttopic and sp_score > TSP:
            temp_candidate.append(candidate)
            # print(f'Added "{candidate.text}" topic score {topic_score}, specifity score {sp_score}')

    return temp_candidate

In [25]:
def detect_single_noun_terms(term_mws:List[CandidateMWE], single_noun_list:List[CandidateW], subtoken_threshold:int=4)->List[CandidateW]:    
    term_nouns=[]
    for candidate in single_noun_list:
        #Check if the lemma of the noun is the same as any of the heads of the multiword expressions.
        is_term=False
        lemma_is_head=False
        for term_mw in term_mws:
            if term_mw.head==candidate.lemma:
                is_term=True
                term_nouns.append(candidate)
                # print(f'"{candidate.text}" is added by lemma: "{candidate.lemma}" is head of "{term_mw}"')
                break
        if is_term:
            continue
        #segment the word using a subword-unit segmentation and a vocabulary trained over a large general purpose corpus.
        subtokens=tokenizer.tokenize(candidate.text)
        if len(subtokens)>subtoken_threshold:
            term_nouns.append(candidate)
            # print(f'{candidate.text} is added by subtokens count: {len(subtokens)}')
    return term_nouns

Calculate metrics(from https://colab.research.google.com/drive/1y9WM3MSAEwvODhMt0cMwJi24XIllrmaY?usp=sharing) 

In [26]:
def calculate_metrics(true_terms, extracted_terms):
    true_positives = len(true_terms.intersection(extracted_terms))
    false_positives = len(extracted_terms.difference(true_terms))
    false_negatives = len(true_terms.difference(extracted_terms))

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return precision, recall, f1_score

def compare_sets(extracted_terms, true_terms):
    true_detections = extracted_terms.intersection(true_terms)
    false_gaps = true_terms.difference(extracted_terms)
    false_detections = extracted_terms.difference(true_terms)

    return true_detections, false_gaps, false_detections

In [27]:
def report(true_terms, extracted_terms):
    # Расчет метрик
    precision, recall, f1_score = calculate_metrics(true_terms, extracted_terms)
    
    print("Precision:", precision) #Точность
    print("Recall:", recall) #Полнота
    print("F1 Score:", f1_score)
    # Выводит списки истинных обнаружений, ложных пропуски и ложных обнаружений
    true_detections, false_gaps, false_detections = compare_sets(extracted_terms, true_terms)
    
    print("Истинные обнаружения:", true_detections)
    print("Ложные пропуски:", false_gaps)
    print("Ложные обнаружения:", false_detections)
    return precision, recall, f1_score, true_detections, false_gaps, false_detections 

Testing in Folder

In [28]:
import os

folder_name='tests'
# Get a list of all files in the folder
all_files = os.listdir(folder_name)

# Initialize empty lists to store the tuples
file_tuples = []

# Iterate through the files in the folder
for filename in all_files:
    # Check if the file is a test file (starts with "text_")
    if filename.startswith("text_"):
        # Construct the expected result filename by replacing "text_" with "term_"
        result_filename = "term_" + filename[5:]
        
        # Check if the corresponding result file exists in the folder
        if result_filename in all_files:
            # Append the tuple to the list
            file_tuples.append((filename, result_filename))

In [29]:
file_tuples

[('text_1.txt', 'term_1.txt'), ('text_2.txt', 'term_2.txt')]

In [30]:
def test_file(text_file_name, term_file_name):
    print('____________________________________________________________________\n')
    print(f"checking files: {text_file_name}, {term_file_name}")    
    text=''
    with open(text_file_name) as text_file:
        text=text_file.read()
    print('_____________________________FIRST STEP_________________________________________')
    candidate_list, single_noun_list=parse_candidates(text)
    print('_____________________________SECOND STEP___________________________________________________')
    term_mws=detect_mw_terms(candidate_list)
    print('_____________________________THIRD STEP______________________________________________________________')
    term_nouns=detect_single_noun_terms(term_mws, single_noun_list)
    print('____________________________________TESTING__________________________________________________________________')
    extracted_terms=set([txt.text.lower() for txt in term_mws+term_nouns])
    with open(term_file_name) as term_file:
        terms_str=term_file.read()
    true_terms=set(terms_str.lower().split(', '))

    precision, recall, f1_score, true_detections, false_gaps, false_detections = report(true_terms, extracted_terms)
    return true_terms, extracted_terms, precision, recall, f1_score, true_detections, false_gaps, false_detections 
 


In [31]:
os.chdir(folder_name)
whole_true_terms=[]
whole_extracted_terms=[]
results=[]
for text_file_name, term_file_name in file_tuples:
    true_terms, extracted_terms, precision, recall, f1_score, true_detections, false_gaps, false_detections=test_file(text_file_name, term_file_name)
    results.append((true_terms, extracted_terms, precision, recall, f1_score, true_detections, false_gaps, false_detections ))
    print('____________________________________________________________________\n')
    whole_true_terms+=true_terms
    whole_extracted_terms=extracted_terms
print('___________________________TOTAL_________________________________________\n')
whole_true_terms=set(whole_true_terms)
whole_extracted_terms=set(whole_extracted_terms)
print('total result')
report(whole_true_terms, whole_extracted_terms)

____________________________________________________________________

checking files: text_1.txt, term_1.txt
_____________________________FIRST STEP_________________________________________
_____________________________SECOND STEP___________________________________________________
_____________________________THIRD STEP______________________________________________________________
____________________________________TESTING__________________________________________________________________
Precision: 0.2619047619047619
Recall: 0.2894736842105263
F1 Score: 0.2750000000000001
Истинные обнаружения: {'slush money', 'development aid', 'general assembly', 'transparency international', 'officials', 'criminal law', 'money laundering', 'business community', 'prosecution', 'tax legislation', 'international market', 'criminal conspiracy', 'fighting corruption', 'third party', 'legal action', 'bribery', 'commercial dispute', 'legal persons', 'subsidy laws', 'private bribery', 'public contracts', 'm

(0.3611111111111111,
 0.04153354632587859,
 0.07449856733524356,
 {'blood pressure',
  'hazard ratio',
  'heart failure',
  'heart rate',
  'hospitalization',
  'left ventricular ejection fraction',
  'neurohumoral blockers',
  'ras blockers',
  'renal function',
  'spironolactone',
  'uptitration',
  'uptitrations',
  'î²-blockers'},
 {'accounting irregularities',
  'active',
  'active corruption',
  'additional tenths',
  'admission',
  'advantage',
  'advantages',
  'aggravating circumstances',
  'agreement on government procurement',
  'all-cause mortality',
  'ambulatory',
  'american',
  'anti-bribery convention',
  'anti-corruption',
  'anti-corruption policy',
  'anti-fraud',
  'assets',
  'auditing',
  'authorities',
  'basis of tax assessment',
  'belgian criminal code',
  'belgium',
  'beta-blocker',
  'blacklist',
  'blacklists',
  'blackmail',
  'board of directors',
  'board of public prosecutors-general',
  'bribe',
  'bribery',
  'bribing',
  'bureau of official ethics 

In [32]:
os.chdir('..')
