## Approach: Chunking the entire article into multiple windows of a fixed size of 512 tokens and feeding them one after another

### Step 1: Load relevant packages

In [1]:
import re
import pandas as pd
# NOTE(review): 0 is presumably meant to disable column-width truncation when
# DataFrames are displayed — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', 0)
import pandas as pd  # duplicate of the import above; harmless

import time 
import nltk
# nltk.download('punkt')
# Render floats with 10 decimal places in pandas output.
pd.options.display.float_format = '{:.10f}'.format

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from tqdm.notebook import tqdm
# Register tqdm's progress_* helpers on pandas objects.
tqdm.pandas()

### Step 2: Load newly trained model

In [2]:
# Checkpoint identifier passed to from_pretrained for both the tokenizer and the classifier.
model_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"

In [3]:
# Initialise the tokenizer and the sequence-classification model from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast = True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Place the model on the device selected at import time (GPU when available);
# without this the model stays on the CPU even though `device` may be CUDA.
model = model.to(device)
# old_sentiment_pipeline = pipeline("sentiment-analysis", model=model ,tokenizer=tokenizer,  top_k=5)

### Step 3: Custom functions to clean the text, split it into 512-token chunks, and compute sentiment

#### Step 3.1: Cleaning Function

In [4]:
# The cleaning steps include:
    # Removing every character that is not alphanumeric, an accented French letter, or one of the whitelisted punctuation marks/symbols.
    # Removing URLs.
    # Removing "rt" (retweet) tags.
    # Removing excessive spaces and replacing consecutive spaces with a single space.
    # Stripping leading and trailing whitespace.
    # Replacing newline characters with spaces.
    # Removing the possessive form "'s" by replacing it with "s".
    # If any error occurs during the cleaning process, an error message is printed, and the original text is returned without any modifications.

def clean_string(text:str) -> str:
    """Normalise raw article text before it is tokenised.

    Steps, in order:
      1. Delete every character outside the whitelist (ASCII alphanumerics,
         space, accented French letters and a set of punctuation/symbols),
         URLs of the form ``scheme://...``, a leading ``rt`` and any
         ``http`` fragment.
      2. Replace newlines with spaces.
      3. Collapse runs of spaces into one and drop the spaces around a
         stand-alone period.
      4. Strip leading and trailing whitespace.
      5. Rewrite the possessive ``'s`` as ``s``.

    Newlines are converted *before* spaces are collapsed so that a newline
    next to a space cannot leave a double space in the result.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Cleaning is best-effort: if any step raises (for
        example because ``text`` is not a string), the error is printed and
        the input is returned unchanged.
    """
    try:
        # NOTE: everything between "[^" and the first "]" is ONE negated
        # character class, so the date-like and parenthesis fragments inside
        # it do not act as sub-patterns; they merely whitelist their individual
        # characters (backtick, parentheses, digits, "+", "/", "." and "*").
        text = re.sub(r"([^0-9A-Za-z àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\~\°\&\“\%\'\:\;\!\-\’\"\.\,\?\€\$\t\n`(\d+\/\d+\/\d+)``\(.*\)`])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
        text = re.sub(r"\n", " ", text)  # newlines become spaces first ...
        text = re.sub(r" {2,}", " ", text)  # ... so the collapse also covers them
        text = re.sub(r" \. ", ".", text)  # " . " -> "."
        text = text.strip()
        text = re.sub(r"\'s\b", "s", text)  # possessive "'s" -> "s"
        return text
    except Exception as e:
        # Deliberate best-effort boundary: never let cleaning abort the caller.
        print(f"Error cleaning text: {e}")
        return text

#### Step 3.2: Split the tokenized text into custom 512-token chunks

In [5]:
def _long_text(tokens, chunksize, cls_token_id=101, sep_token_id=102, pad_token_id=0):
    """
    Split one tokenized text into fixed-size chunks that can be fed to the model as a batch.

    The text is expected to have been tokenized WITHOUT special tokens; this
    function cuts it into pieces of ``chunksize - 2`` tokens, wraps every piece
    in the CLS/SEP ids and right-pads the last piece so that all rows have
    exactly ``chunksize`` entries.

    Args:
        tokens (dict): Mapping with 'input_ids' and 'attention_mask' tensors; only
            the first row (index 0) of each is used.
        chunksize (int): Length of every output row, special tokens included.
            Must be at least 3.
        cls_token_id (int): Id placed at the start of every chunk (default 101).
        sep_token_id (int): Id placed after the tokens of every chunk (default 102).
        pad_token_id (int): Id used to pad short chunks (default 0).

    Returns:
        dict: 'input_ids' (int64) and 'attention_mask' (int32) tensors, each with
        one row per chunk and ``chunksize`` columns, on the same device as the
        corresponding input tensor.

    Raises:
        ValueError: If ``chunksize`` leaves no room for at least one real token.
    """
    if chunksize < 3:
        raise ValueError("chunksize must be at least 3 (CLS + one token + SEP)")

    # Two positions of every chunk are reserved for the CLS and SEP ids.
    body_len = chunksize - 2
    id_pieces = tokens['input_ids'][0].split(body_len)
    mask_pieces = tokens['attention_mask'][0].split(body_len)

    id_rows = []
    mask_rows = []
    for ids, mask in zip(id_pieces, mask_pieces):
        pad_len = body_len - ids.shape[0]
        # The new_* constructors inherit dtype and device from the chunk, so the
        # concatenation also works when the tokens live on a GPU and the ids
        # are never promoted to float by the padding.
        id_rows.append(torch.cat([
            ids.new_tensor([cls_token_id]),
            ids,
            ids.new_tensor([sep_token_id]),
            ids.new_full((pad_len,), pad_token_id),
        ]))
        # CLS and SEP are attended to (1); padding positions are masked out (0).
        mask_rows.append(torch.cat([
            mask.new_ones(1),
            mask,
            mask.new_ones(1),
            mask.new_zeros(pad_len),
        ]))

    # All rows now share the same length, so they can be stacked into a batch.
    return {
        'input_ids': torch.stack(id_rows).long(),
        'attention_mask': torch.stack(mask_rows).int(),
    }

#### Step 3.3: Function to compute sentiment for the entire article split into chunks of 512 tokens

In [6]:
def _expected_sentiment(logits):
    """Turn 5-class logits into one expected sentiment value in [-1, 1] per row."""
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    # Value assigned to each of the five classes, lowest class first. Built from
    # `probabilities` so it shares its dtype and device.
    class_values = probabilities.new_tensor([-1.0, -0.5, 0, 0.5, 1.0])
    return torch.matmul(probabilities, class_values).tolist()


def _calculate_sentiment(html_content:str, title_sentences:str, name:str, model, tokenizer ):
    """
    Calculates a 0-100 sentiment score for an article from its body and its title.

    The body is cleaned, stripped of any verbatim copy of the title, tokenized
    and split into 512-token chunks that are scored in one batch. Every chunk
    contributes its expected sentiment with weight 1, or weight 5 when it
    contains ``name`` (case-insensitive). A non-empty title is scored on its
    own and always contributes with weight 5. The weighted mean, which lies in
    [-1, 1], is mapped linearly to [0, 100] (50 is neutral).

    Args:
        html_content (str): The article body to analyze.
        title_sentences (str): The article title, treated as a single sentence.
        name (str): The name whose presence up-weights a chunk.
        model: The sequence-classification model; assumed to emit 5 logits per
            input (TODO confirm for checkpoints other than the one loaded above).
        tokenizer: The tokenizer associated with the model.

    Returns:
        float: The sentiment score of the article, rounded to 5 decimals.
    """
    name_weight = 5   # weight of a chunk that mentions `name`
    title_weight = 5  # weight of the title

    html_content = clean_string(text = html_content)
    title_sentences = clean_string(text = title_sentences)
    has_title = bool(title_sentences)
    if has_title:
        # Remove the duplicated title text from the body so it is not scored twice.
        html_content = html_content.replace(title_sentences, "")

    # Run everything on whatever device the model actually lives on.
    model_device = next(model.parameters()).device

    # --- body -----------------------------------------------------------
    # Tokenize without special tokens and on the CPU; the chunks are wrapped
    # in CLS/SEP by _long_text and only then moved to the model's device.
    content_inputs = tokenizer.encode_plus(html_content, add_special_tokens=False, return_tensors='pt')
    input_dict = _long_text(content_inputs, chunksize = 512)
    content_sentences = tokenizer.batch_decode(input_dict['input_ids'],skip_special_tokens=True, clean_up_tokenization_spaces=True)
    model_inputs = {key: value.to(model_device) for key, value in input_dict.items()}
    with torch.no_grad():
        content_outputs = model(**model_inputs)
    normalised_content_scores = _expected_sentiment(content_outputs.logits)

    # NOTE(review): chunks are matched against text decoded by the tokenizer, so
    # a name the tokenizer normalises differently may not be found — confirm.
    needle = name.lower()
    score = 0.0
    count = 0
    for sentence, sentiment in zip(content_sentences, normalised_content_scores):
        multiplier = name_weight if needle in sentence.lower() else 1
        count += multiplier
        score += sentiment * multiplier

    # --- title ----------------------------------------------------------
    # The title is one string, hence exactly one score; an empty title is skipped.
    if has_title:
        title_inputs = tokenizer(title_sentences, padding=True, truncation=True, return_tensors="pt").to(model_device)
        with torch.no_grad():
            title_outputs = model(**title_inputs)
        title_sentiment = _expected_sentiment(title_outputs.logits)[0]
        count += title_weight
        score += title_sentiment * title_weight

    average_score = score / count
    article_score = average_score * 50 + 50

    return round(article_score, 5)

In [8]:
html_text = "What is Volkswagen accused of?\nIt's been dubbed the \"diesel dupe\". In September, the Environmental Protection Agency (EPA) found that many VW cars being sold in America had a \"defeat device\" - or software - in diesel engines that could detect when they were being tested, changing the performance accordingly to improve results. The German car giant has since admitted cheating emissions tests in the US.\nVW has had a major push to sell diesel cars in the US, backed by a huge marketing campaign trumpeting its cars' low emissions. The EPA's findings cover 482,000 cars in the US only, including the VW-manufactured Audi A3, and the VW models Jetta, Beetle, Golf and Passat. But VW has admitted that about 11 million cars worldwide, including eight million in Europe, are fitted with the so-called \"defeat device\".\nThe company has also been accused by the EPA of modifying software on the 3 litre diesel engines fitted to some Porsche and Audi as well as VW models. VW has denied the claims, which affect at least 10,000 vehicles.\nIn November, VW said it had found \"irregularities\" in tests to measure carbon dioxide emissions levels that could affect about 800,000 cars in Europe - including petrol vehicles. However, in December it said that following investigations, it had established that this only affected about 36,000 of the cars it produces each year.\nThis 'defeat device' sounds like a sophisticated piece of kit.\nFull details of how it worked are sketchy, although the EPA has said that the engines had computer software that could sense test scenarios by monitoring speed, engine operation, air pressure and even the position of the steering wheel.\nWhen the cars were operating under controlled laboratory conditions - which typically involve putting them on a stationary test rig - the device appears to have put the vehicle into a sort of safety mode in which the engine ran below normal power and performance. 
Once on the road, the engines switched out of this test mode.\nThe result? The engines emitted nitrogen oxide pollutants up to 40 times above what is allowed in the US.\nWhat has been VW's response?\n\"We've totally screwed up,\" said VW America boss Michael Horn, while the group's chief executive at the time, Martin Winterkorn, said his company had \"broken the trust of our customers and the public\". Mr Winterkorn resigned as a direct result of the scandal and was replaced by Matthias Mueller, the former boss of Porsche.\n\"My most urgent task is to win back trust for the Volkswagen Group - by leaving no stone unturned,\" Mr Mueller said on taking up his new post.\nVW has also launched an internal inquiry.\nBut that's unlikely to be the end of the financial impact. The EPA has the power to fine a company up to $37,500 for each vehicle that breaches standards - a maximum fine of about $18bn.\nThe costs of possible legal action by car owners and shareholders \"cannot be estimated at the current time\", VW added.\nHow widespread are VW's problems?\nWhat started in the US has spread to a growing number of countries. The UK, Italy, France, South Korea, Canada and, of course, Germany, have opened investigations. 
Throughout the world, politicians, regulators and environmental groups are questioning the legitimacy of VW's emissions testing.\nVW will recall 8.5 million cars in Europe, including 2.4 million in Germany and 1.2 million in the UK, and 500,000 in the US as a result of the emissions scandal.\nNo wonder the carmaker's shares have fallen by about a third since the scandal broke.\nWill more heads roll?\nIt's still unclear who knew what and when, although VW must have had a chain of management command that approved fitting cheating devices to its engines, so further departures are likely.\nChristian Klingler, a management board member and head of sales and marketing is leaving the company, although VW said this was part of long-term planned structural changes and was not related to recent events.\nIn 2014, in the US, regulators raised concerns about VW emissions levels, but these were dismissed by the company as \"technical issues\" and \"unexpected\" real-world conditions. If executives and managers wilfully misled officials (or their own VW superiors) it's difficult to see them surviving.\nAre other carmakers implicated?\nThat's for the various regulatory and government inquiries to determine. California's Air Resources Board is now looking into other manufacturers' testing results. 
Ford, BMW and Renault-Nissan have said they did not use \"defeat devices\", while other firms have either not commented or simply stated that they comply with the law.\nThe UK trade body for the car industry, the SMMT, said: \"The EU operates a fundamentally different system to the US - with all European tests performed in strict conditions as required by EU law and witnessed by a government-appointed independent approval agency.\"\nBut it added: \"The industry acknowledges that the current test method is outdated and is seeking agreement from the European Commission for a new emissions test that embraces new testing technologies and is more representative of on-road conditions.\"\nThat sounds like EU testing rules need tightening, too.\nEnvironmental campaigners have long argued that emissions rules are being flouted. \"Diesel cars in Europe operate with worse technology on average than the US,\" said Jos Dings, from the pressure group Transport & Environment. \"Our latest report demonstrated that almost 90% of diesel vehicles didn't meet emission limits when they drive on the road. We are talking millions of vehicles.\"\nCar analysts at the financial research firm Bernstein agree that European standards are not as strict as those in the US. However, the analysts said in a report that there was, therefore, \"less need to cheat\". So, if other European carmakers' results are suspect, Bernstein says the \"consequences are likely to be a change in the test cycle rather than legal action and fines\".\nIt's all another blow for the diesel market.\nCertainly is. Over the past decade and more, carmakers have poured a fortune into the production of diesel vehicles - with the support of many governments - believing that they are better for the environment. Latest scientific evidence suggests that's not the case, and there are even moves to limit diesel cars in some cities.\nDiesel sales were already slowing, so the VW scandal came at a bad time. 
\"The revelations are likely to lead to a sharp fall in demand for diesel engine cars,\" said Richard Gane, automotive expert at consultants Vendigital.\n\"In the US, the diesel car market currently represents around 1% of all new car sales and this is unlikely to increase in the short to medium term.\n\"However, in Europe the impact could be much more significant, leading to a large tranche of the market switching to petrol engine cars virtually overnight.\""
summary = "In September, the Environmental Protection Agency (EPA) found that many VW cars being sold in America had a \"defeat device\" - or software - in diesel engines that could detect when they were being tested, changing the performance accordingly to improve results. VW has had a major push to sell diesel cars in the US, backed by a huge marketing campaign trumpeting its cars' low emissions. The company has also been accused by the EPA of modifying software on the 3 litre diesel engines fitted to some Porsche and Audi as well as VW models. In November, VW said it had found \"irregularities\" in tests to measure carbon dioxide emissions levels that could affect about 800,000 cars in Europe - including petrol vehicles. VW will recall 8.5 million cars in Europe, including 2.4 million in Germany and 1.2 million in the UK, and 500,000 in the US as a result of the emissions scandal."
# Headline of the sample article; passed as `title_sentences` below.
title = "Volkswagen: The scandal explained - BBC News"
# Name searched for in the text; passed as `name` below.
name = "Volkswagen"

In [9]:
# Score the sample article with the model and tokenizer loaded above.
article_score = _calculate_sentiment(html_content = html_text, title_sentences = title, name=name, model= model, tokenizer=tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1453 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Display the computed score for the sample article.
article_score

26.4511