In [None]:
import pandas as pd
import ast
import torch
from torch import nn as nn
from transformers import DistilBertTokenizerFast
import logging
import contractions


from tqdm import tqdm
import re

logger = logging.getLogger(__name__)
import onnxruntime as ort

# Token-length cap for tokenized model inputs.
MAX_LEN = 128

# Local directory holding the tokenizer files; the tokenizer is loaded once at
# import time and shared by the functions defined in later cells.
model_name = "./distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Path of the exported ONNX model file, relative to the notebook's working directory.
onnx_model_path = "./multilingual_model_onnx.onnx"
# One inference session is created up front and reused for every prediction call.
session = ort.InferenceSession(onnx_model_path)

In [3]:
# Class-index -> label lookups. Each list position is the integer class id, so
# the order of the labels below is significant.
id2ner_label = dict(enumerate(['O', 'PER', 'LOC', 'ORG', 'MISC']))

id2bioes_label = dict(enumerate(['O', 'B', 'I', 'E', 'S']))

# Used to turn the aggregated sentiment id into a human-readable label.
id2sentiment_label = dict(enumerate(['negative', 'neutral', 'positive']))

In [None]:
# Load the stop-word list (one entry per line).
# The encoding is pinned so the result does not depend on the platform's locale
# default, and blank lines are skipped so the list never contains "".
with open("stopwords.txt", "r", encoding="utf-8") as f:
    stop_words = [word for word in (line.strip() for line in f) if word]

In [5]:
def text_preprocessing(text):
    """Lower-case ``text``, expand contractions and drop stop words.

    Args:
        text: Raw input string.

    Returns:
        The surviving words joined by single spaces.
    """
    expanded = [contractions.fix(token) for token in text.lower().split()]
    # Re-split after joining: an expanded token may presumably contain several
    # words, and each of them must be checked against the stop-word list.
    kept = [word for word in " ".join(expanded).split() if word not in stop_words]
    return " ".join(kept)

In [6]:
def predictions(text):
    """Run the ONNX model on ``text`` and return the arg-max class ids.

    The text is tokenized to NumPy arrays and fed to the shared inference
    ``session``, requesting the ``topic_logits``, ``ner_logits`` and
    ``sentiment_logits`` outputs.

    Args:
        text: Input string to classify.
            NOTE(review): the squeeze below assumes a single string (batch
            size 1) so the per-token results come back as flat lists -- confirm
            no caller passes a batch.

    Returns:
        Tuple ``(topic_predictions, sentiment_prediction, ner_predictions)``:
        the topic and NER predictions are the arg-max over the last logits
        axis, the sentiment prediction is a single ``int``.
    """
    encoded = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
    )

    onnx_inputs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

    topic_logits, ner_logits, sentiment_logits = session.run(
        ["topic_logits", "ner_logits", "sentiment_logits"],
        onnx_inputs,
    )

    # The session already returns NumPy arrays, so take the arg-max directly
    # instead of copying into torch tensors and moving them to `device` only to
    # convert straight back to Python lists.
    topic_predictions = topic_logits.argmax(axis=-1).squeeze().tolist()
    ner_predictions = ner_logits.argmax(axis=-1).squeeze().tolist()
    # Arg-max over the flattened sentiment logits, as a plain Python int.
    sentiment_prediction = int(sentiment_logits.argmax())

    return topic_predictions, sentiment_prediction, ner_predictions

In [7]:
def predict_text(text):
    """Preprocess ``text``, run the model and collect the topic-tagged tokens.

    A token is kept when its topic prediction is non-zero, its NER prediction
    is zero, and it is not one of the tokenizer's special tokens.

    Args:
        text: Raw input string (one chunk of a document).

    Returns:
        Tuple ``(preprocessed_text, topic_entities, sentiment)`` where
        ``topic_entities`` is a list of ``(token_index, token, topic_id)``
        tuples in token order.
    """
    text = text_preprocessing(text)
    topics, sentiment, ner = predictions(text)

    # Tokenize with the same truncation as `predictions`; plain id lists are
    # enough here, so no padding to the maximum length and no tensor
    # conversion are needed just to recover the token strings.
    token_ids = tokenizer(text, truncation=True, max_length=MAX_LEN)["input_ids"]
    words = tokenizer.convert_ids_to_tokens(token_ids)

    skipped_tokens = {"[UNK]", "[CLS]", "[SEP]", "[PAD]"}
    topic_entities = [
        (idx, word, topic)
        for idx, (word, topic, ner_tag) in enumerate(zip(words, topics, ner))
        if topic != 0 and ner_tag == 0 and word not in skipped_tokens
    ]
    return text, topic_entities, sentiment

In [8]:
def merge_subwords(phrases):
    """Glue ``##``-prefixed word pieces onto the preceding token.

    Args:
        phrases: Iterable of whitespace-separated token strings.

    Returns:
        A list with one rebuilt string per input phrase. A ``##`` piece that
        has no preceding token in its phrase is kept unchanged.
    """
    def _rebuild(phrase):
        pieces = []
        for piece in phrase.split():
            continues_previous = bool(pieces) and piece.startswith("##")
            if continues_previous:
                # Drop the "##" marker and append to the previous token.
                pieces[-1] = pieces[-1] + piece[2:]
            else:
                pieces.append(piece)
        return " ".join(pieces)

    return [_rebuild(phrase) for phrase in phrases]

In [9]:
def extract_phrases(tagged_entries):
    """Group tagged tokens into phrases and merge their word pieces.

    Tag ``1`` starts a new phrase; tag ``2`` extends the open phrase, but only
    when the token's index directly follows the last token added to it. Any
    other entry is ignored.
    NOTE(review): tags other than 1 and 2 are dropped silently -- confirm the
    topic head never emits further classes that should be kept.

    Args:
        tagged_entries: Iterable of ``(index, token, tag)`` tuples.

    Returns:
        List of phrase strings after ``merge_subwords`` has been applied.
    """
    collected = []
    open_phrase = []
    previous_index = -1

    for position, token, tag in tagged_entries:
        if tag == 1:
            # A new phrase begins: flush whatever was being built.
            if open_phrase:
                collected.append(' '.join(open_phrase))
            open_phrase = [token]
            previous_index = position
            continue

        if tag == 2 and open_phrase and position == previous_index + 1:
            open_phrase.append(token)
            previous_index = position

    if open_phrase:
        collected.append(' '.join(open_phrase))

    return merge_subwords(collected)


In [None]:
def chunking_text(text, max_words=50):
    """Split ``text`` into chunks of whole sentences.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?`` and
    are packed greedily: a sentence joins the current chunk unless that would
    push the chunk past ``max_words`` words, in which case a new chunk starts.
    A single sentence longer than ``max_words`` is not split and becomes a
    chunk of its own.

    Args:
        text: Input string.
        max_words: Word budget per chunk.

    Returns:
        List of chunk strings (sentences joined by single spaces).
    """
    chunks = []
    pending = []
    pending_word_count = 0

    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        word_count = len(sentence.strip().split())
        if word_count == 0:
            continue

        if pending_word_count + word_count <= max_words:
            pending.append(sentence)
            pending_word_count += word_count
            continue

        # Budget exceeded: close the current chunk and start over with this sentence.
        if pending:
            chunks.append(' '.join(pending))
        pending = [sentence]
        pending_word_count = word_count

    if pending:
        chunks.append(' '.join(pending))

    return chunks


In [11]:
def remove_duplicates(lst):
    """Return the items of ``lst`` with later repeats removed, order preserved.

    Equality (not hashing) decides what counts as a repeat, so unhashable
    items such as lists are accepted.
    """
    unique_items = []
    for element in lst:
        if element in unique_items:
            continue
        unique_items.append(element)
    return unique_items


In [None]:
def _find_phrase_start(lowered_words, phrase_words):
    """Return the first index where ``phrase_words`` lines up with ``lowered_words``.

    A phrase word matches a text word when they are equal or when the phrase
    word, stripped of ``#`` characters, is a substring of the text word.
    Returns ``None`` when no position matches.
    """
    width = len(phrase_words)
    for start in range(len(lowered_words)):
        if start + width > len(lowered_words):
            # Not enough words left; later starts cannot match either.
            return None
        window = lowered_words[start:start + width]
        if all(kw == cand or kw.strip('#') in cand
               for kw, cand in zip(phrase_words, window)):
            return start
    return None


def topic_extraction(text, keywords, sentiment):
    """Map extracted keyword phrases back onto the wording of ``text``.

    Each phrase is searched for, case-insensitively, as a run of consecutive
    words in ``text``; on the first hit the phrase is replaced by the original
    words at that position, otherwise it is kept as given. Phrases that are a
    proper substring of another resulting phrase, and phrases that are stop
    words, are dropped; the remainder is de-duplicated preserving order.

    Args:
        text: Text to match against.
        keywords: List of phrase strings.
        sentiment: Value passed through unchanged to the result.

    Returns:
        Tuple ``(keywords, sentiment)``. For invalid ``text`` / ``keywords``
        types the result is ``([], sentiment)`` so callers can always unpack
        two values (previously a bare ``[]`` was returned on that path).
    """
    if not isinstance(text, str) or not isinstance(keywords, list):
        return [], sentiment

    original_words = text.split()
    lowered_words = [w.lower() for w in original_words]

    matched_phrases = []
    for phrase in keywords:
        phrase_words = phrase.lower().split()
        start = _find_phrase_start(lowered_words, phrase_words)
        if start is None:
            # No occurrence in the text: keep the phrase as it was given.
            matched_phrases.append(phrase)
        else:
            matched_phrases.append(
                ' '.join(original_words[start:start + len(phrase_words)])
            )

    # Keep only phrases not contained in a different phrase and not stop words.
    final_phrases = [
        p for p in matched_phrases
        if not any(p != other and p in other for other in matched_phrases)
        and p.lower() not in stop_words
    ]

    if not final_phrases:
        return [], sentiment
    return remove_duplicates(final_phrases), sentiment


In [None]:
def postprocessing(text):
    """Run the full pipeline on ``text`` and aggregate the per-chunk results.

    The text is chunked, each chunk is run through ``predict_text``, the tagged
    tokens are grouped into phrases and mapped back onto the text, and the
    chunk sentiments are averaged into one label.

    Args:
        text: Full input document.

    Returns:
        Tuple ``(topics, sentiment_label)``: the keywords of all chunks in
        order, and the label from ``id2sentiment_label``. The mean sentiment id
        maps to id 2 when above 1, to id 1 when above 0.5, and to id 0
        otherwise; with no chunks the result is id 1.
    """
    collected_topics = []
    chunk_sentiments = []

    for chunk in chunking_text(text):
        _, tagged_tokens, chunk_sentiment = predict_text(chunk)
        phrases = extract_phrases(tagged_tokens)
        # NOTE(review): matching runs against the whole `text`, not `chunk` --
        # confirm this is intended.
        keywords, chunk_sentiment = topic_extraction(text, phrases, chunk_sentiment)
        chunk_sentiments.append(chunk_sentiment)
        collected_topics.extend(keywords)

    if not chunk_sentiments:
        overall = 1
    else:
        mean_sentiment = sum(chunk_sentiments) / len(chunk_sentiments)
        if mean_sentiment > 1:
            overall = 2
        elif mean_sentiment > 0.5:
            overall = 1
        else:
            overall = 0

    return collected_topics, id2sentiment_label[overall]

In [15]:
text = """
Today, Apple Pay officially arrived in the country, and with the news, some financial institutions revealed their alliance with the technology giant to include cards and accept payments made with iPhone, Apple Watch, iPad, and Mac. Nu Colombia, Bancolombia, Mastercard, and Visa were the first to announce that they will be connected to the payment method. As a 100% digital company, we want to always offer the most innovative and simple payment methods to our customers and thus promote the digitalization of Colombians.
Apple Pay is precisely a secure, convenient, simple, and highly innovative payment method. We are happy to announce that starting today, our customers can also make purchases using La Moradita through Apple Pay, said Catalina Bretón, General Manager of Nu Colombia. To make their payments, users must hold their iPhone or Apple Watch near a data terminal to make the contactless transaction. Every purchase with Apple Pay is secure because it is authenticated with Face ID, Touch ID, or the device's access code, as well as a dynamic, one-time security code. Apple Pay is accepted in supermarkets, convenience stores, pharmacies, restaurants, cafes, retail stores, and many more places. We believe in digitalization as a mechanism to accompany the life moments and needs of our customers. For more than seven years, we have participated in the evolution of contactless payment methods in the country, seeking to better connect with our customers and promoting the greater use of digital media, which, in a demanding context like the current one, represents not only ease but also contributes to reducing the risks associated with handling cash, says Cristina Arrastía, Vice President of Business at Bancolombia.
To use Apple Pay, you only need to be enrolled in the bank's alerts and notifications service to receive the necessary verification information at the time of card registration. On the iPhone, simply open the Wallet app, tap, and follow the steps to add Bancolombia debit and credit cards. By adding the card to the iPhone or Apple Watch, the customer can immediately begin using Apple Pay on the device. Meanwhile, the card companies announced that Apple Pay is initially available to Bancolombia Mastercard and Visa cardholders and Nu Colombia, with other financial institutions soon to launch, the financial technology company emphasized. "We are delighted to partner with our issuers to bring Apple Pay to Colombia. Our shopping habits are evolving. Consumers are moving away from cash more than ever and opting for faster digital and contactless payment experiences, and this is exactly what Apple Pay offers. With this launch, Mastercard continues to strengthen its commitment to more secure digital payments and offers Colombian cardholders a safe and seamless shopping experience," highlighted Federico Martínez, Country Manager for Mastercard in Colombia.
"""
# Run the pipeline on the sample article above and print what it found.
keyword, sentiment = postprocessing(text)
print("Important Topics:", keyword, sep="\n")
print(f"Sentiment:{sentiment}")

Important Topics:
['Apple Pay', 'alliance technology giant', 'cards accept payments', 'Apple Watch,', 'iPad,', 'innovative simple payment methods', 'promote digitalization colombians', 'Catalina Bretón,', 'general manager nu colombia', 'iPhone, Apple Watch,', 'contactless transaction.', 'secure authenticated face id', 'Touch ID,', "device ' s access code", 'well dynamic', 'one - time security code', 'convenience stores,', 'digitalization mechanism', 'contactless payment methods', 'better connect customers', 'reducing risks', 'verification information time card registration', 'open wallet app', 'bancolombia debit credit cards', 'adding card iphone apple watch', 'customers', 'apple pay device', 'Apple Pay', 'financial technology company', 'shopping habits', 'consumers moving away cash', 'digital contactless payment experiences', 'Apple Pay offers.', 'commitment secure digital payments', 'colombian cardholders safe seamless shopping experience', 'Federico Martínez,', 'country manager mast