## Approach 2: new model + Extractive summary

### Step 1: Load relevant packages

In [1]:
import re
import pandas as pd
pd.set_option('display.max_colwidth', 0)
import pandas as pd
import spacy
import time 
import nltk
# nltk.download('punkt')
pd.options.display.float_format = '{:.10f}'.format

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import pytextrank

from tqdm.notebook import tqdm
tqdm.pandas()


from math import sqrt
from operator import itemgetter
import spacy


### Step 2: Load newly trained model

In [2]:
# Model identifier handed to `from_pretrained`; the tokenizer and the
# sequence-classification model are both loaded from this same checkpoint.
model_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"

In [3]:
# Initialize the tokenizer and the classification model from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast = True)
# Run the model on `device` (GPU when available, CPU otherwise). A model left on
# the CPU cannot consume input tensors that were moved to CUDA, so the weights
# have to live on the same device the inputs are sent to.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint).to(device)

### Step 3: Custom functions to increase text readability + generate extractive summary + compute sentiment 

#### Step 3.1: Cleaning Function

In [4]:
# The cleaning steps include:
    # Removing characters other than ASCII letters/digits, accented French letters and common punctuation.
    # Removing URLs.
    # Removing "rt" (retweet) tags.
    # Removing excessive spaces and replacing consecutive spaces with a single space.
    # Stripping leading and trailing whitespace.
    # Replacing newline characters with spaces.
    # Removing the possessive form "'s" by replacing it with "s".
    # If any error occurs during the cleaning process, an error message is printed, and the original text is returned without any modifications.

def clean_string(text:str) -> str:
    """
    Normalise raw article or title text before summarisation and sentiment scoring.

    The steps are applied in this order:
        1. Delete every character that is not on the allow-list (ASCII letters and digits,
           space, accented French letters, common punctuation and currency symbols, tab,
           newline), every ``scheme://...`` URL, an "rt" at the very start of the text and
           any "http" together with the character that follows it.
        2. Collapse runs of spaces and turn " . " into ".".
        3. Strip the ends, replace newlines with spaces and collapse runs of spaces again.
        4. Turn the possessive "'s" into "s".

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text. If `text` is not a string, an error message is printed and
             `text` is returned unchanged.
    """
    try:
        # Remove unwanted characters, URLs, and RT tags from the text.
        # NOTE: the trailing "`(\d+\/\d+\/\d+)``\(.*\)`" part sits inside the character
        # class, so it does not match dates or parentheses as groups; it only adds the
        # literal characters ` ( ) + / . * to the allow-list.
        text = re.sub(r"([^0-9A-Za-z àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\~\°\&\“\%\'\:\;\!\-\’\"\.\,\?\€\$\t\n`(\d+\/\d+\/\d+)``\(.*\)`])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
        text = re.sub(r" {2,}", " ", text)  # 2 or more spaces become a single space
        text = re.sub(r" \. ", ".", text)  # Remove inconsistent spaces around periods
        text = text.strip()  # Strip leading and trailing whitespace
        text = text.replace("\n", " ")  # Replace newline characters with a space
        # A newline that had spaces next to it has just produced a run of spaces,
        # so the collapse has to be repeated after the replacement.
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r"\'s\b", "s", text)  # Replace "'s" with "s" (possessive form)
        return text
    except TypeError as e:
        # Raised by `re.sub` when `text` is not a string (e.g. None or NaN):
        # report it and hand the value back untouched.
        print(f"Error cleaning text: {e}")
        return text

#### Algorithms
##### Algorithm 1: Text ranking Algorithm - top ranking sentences <br /> Algorithm 2: Biased/entity driven Text ranking Algorithm - top ranking sentences

`Note: Only run one algorithm at a time; the variable names are the same for both algorithms, for ease of running.`

In [5]:
# Specify the algorithm you want to run: 'textrank' or 'biasedtextrank'.
# The value is used as the name of the pipeline component added to both spaCy
# pipelines and is passed to score() as `algorithm`.
# algo_name = 'textrank' 
algo_name = 'biasedtextrank'

In [6]:
# spaCy pipelines used for summarisation, one per language.
# _extractive_summary picks nlp_fr when lang == 'fr' and nlp_en otherwise.
nlp_en = spacy.load("en_core_web_trf")
nlp_fr = spacy.load("fr_dep_news_trf")

In [7]:
# Register the selected ranking component on both language pipelines.
if algo_name in ['textrank','biasedtextrank']:
    for nlp_pipeline in (nlp_en, nlp_fr):
        # add_pipe raises when a component with this name is already in the
        # pipeline, which happens as soon as this cell is run a second time.
        if not nlp_pipeline.has_pipe(algo_name):
            nlp_pipeline.add_pipe(algo_name)
    print(f'{algo_name} algorithm initnalized')
else:
    raise ValueError("Wrong algorithm inintalized")

biasedtextrank algorithm initnalized


#### Step 3.2: Extractive summary

In [8]:
def _extractive_summary(text:str, focus:str, key_phrases:int, no_of_sentences:int, lang:str, algo:str) -> str:
    """
    Extractive text summarization using the specified algorithm.
    The algorithm in consideration is either Textrank or Biased-Textrank.

    The top `key_phrases` ranked phrases of the document form a unit vector (their ranks
    divided by the sum of those ranks). Each sentence is scored by the Euclidean norm of the
    vector entries belonging to the phrases it does NOT contain, so a lower score means the
    sentence covers more of the highly ranked phrases. The `no_of_sentences` lowest-scoring
    sentences are returned in their original document order.

    Args:
        text (str): The input text (for our case the complete html content) to summarize.
        focus (str): The focus keyword (for our case the Organizational entity) for biased summarization.
        key_phrases (int): The number of key phrases (topics) to consider.
        no_of_sentences (int): The number of sentences to include in the summary. For the purpose of this work it is set to 12
        lang (str): The language of the input text ('fr' for French, anything else is treated as English).
        algo (str): The algorithm to use for summarization ('textrank' or 'biasedtextrank').
                    1. TextRank is an unsupervised graph-based ranking algorithm used for text summarization.
                    2. Biased TextRank is an extension of the TextRank algorithm that allows biased summarization based on a specific keyword.

    Returns:
        str: The selected sentences joined with single spaces ('' when the text has no sentences).

    """
    # Choose the appropriate language model based on the input language
    nlp = nlp_fr if lang == 'fr' else nlp_en
    doc = nlp(text)

    # Only Biased TextRank needs an extra step: set the focus keyword and bias so that
    # sentences about the organizational entity are favoured. For 'textrank' (or any
    # other value) the phrases produced by the pipeline run above are used as they are.
    if algo == 'biasedtextrank':
        doc._.textrank.change_focus(focus, bias=10.0, default_bias=0.0)

    # Token boundaries of every sentence plus the ids of the ranked phrases found in it
    sentences = list(doc.sents)
    sent_bounds = [(s.start, s.end, set()) for s in sentences]

    phrase_ranks = []
    for phrase_id, phrase in enumerate(doc._.phrases):
        # Collect the rank of each phrase to build the unit vector
        phrase_ranks.append(phrase.rank)

        for chunk in phrase.chunks:
            # Attach the phrase to the first sentence that fully contains this occurrence
            for sent_start, sent_end, sent_phrase_ids in sent_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    sent_phrase_ids.add(phrase_id)
                    break

        # Stop once `key_phrases` phrases have been collected
        if phrase_id + 1 == key_phrases:
            break

    # Normalise the ranks so that they sum to 1. When every collected rank is zero there
    # is nothing to normalise; an all-zero vector gives every sentence the same score and
    # avoids a ZeroDivisionError (the leading sentences are then selected).
    sum_ranks = sum(phrase_ranks)
    if sum_ranks:
        unit_vector = [rank / sum_ranks for rank in phrase_ranks]
    else:
        unit_vector = [0.0] * len(phrase_ranks)

    # Distance of each sentence from the unit vector: only the phrases that are missing
    # from the sentence contribute.
    sent_distances = []
    for _, _, sent_phrase_ids in sent_bounds:
        sum_sq = 0.0
        for phrase_id, weight in enumerate(unit_vector):
            if phrase_id not in sent_phrase_ids:
                sum_sq += weight ** 2.0
        sent_distances.append(sqrt(sum_sq))

    # Take the closest sentences (the sort is stable, so ties keep document order)
    top_sent_ids = []
    for sent_id, _ in sorted(enumerate(sent_distances), key=itemgetter(1)):
        top_sent_ids.append(sent_id)
        if len(top_sent_ids) == no_of_sentences:
            break

    # Restore document order before joining
    top_sent_ids.sort()
    return ' '.join(sentences[sent_id].text for sent_id in top_sent_ids)

#### Step 3.3: function to compute sentiment

In [9]:
def _calculate_sentiment(html_content:str, title_sentences:str, name:str, model, tokenizer ):
    """
    Compute a 0-100 sentiment score for an article and count the mentions of `name`.

    Both texts are cleaned with `clean_string`, the title text is removed from the content,
    and the content is split into sentences. Every content sentence and the title get a
    sentiment value in [-1, 1]. The article score is the weighted average of these values,
    rescaled to [0, 100]: content sentences that mention `name` and the title weigh 5,
    every other sentence weighs 1.

    Args:
        html_content (str): The article text (in this notebook, the extractive summary).
        title_sentences (str): The article title; it is scored as a single text.
        name (str): The entity name, matched case-insensitively as a substring.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        tuple: (article score rounded to 5 decimals, number of mentions of `name` in the
               content and the title). When there is neither content nor title to score,
               the neutral score 50.0 is returned.
    """
    # A quick cleaning of html_content and title
    html_content = clean_string(text = html_content)
    title_sentences = clean_string(text = title_sentences)
    if title_sentences:
        # Eliminate the duplicated title text in html_content
        html_content = html_content.replace(title_sentences, "")

    content_sentences = nltk.sent_tokenize(html_content)
    # The title is one text, not a sequence of sentences: wrap it in a list so that it is
    # paired with its score as a whole (zipping the bare string would iterate characters).
    title_texts = [title_sentences] if title_sentences else []

    content_sentiments = _sentence_sentiments(content_sentences, model, tokenizer)
    title_sentiments = _sentence_sentiments(title_texts, model, tokenizer)

    target = name.lower()
    mentions = 0
    score = 0.0
    count = 0

    for sentence, sentiment in zip(content_sentences, content_sentiments):
        occurrences = sentence.lower().count(target)
        mentions += occurrences
        # Sentences that mention the entity weigh five times as much
        multiplier = 5 if occurrences else 1
        count += multiplier
        score += sentiment * multiplier

    for title_text, sentiment in zip(title_texts, title_sentiments):
        mentions += title_text.lower().count(target)
        # The title always gets the high weight
        multiplier = 5
        count += multiplier
        score += sentiment * multiplier

    if count == 0:
        # Nothing could be scored (empty content and empty title): report neutral
        return 50.0, mentions

    average_score = score / count
    # Map [-1, 1] onto [0, 100]
    article_score = average_score * 50 + 50

    return round(article_score, 5), mentions


def _sentence_sentiments(sentences, model, tokenizer):
    """Return one sentiment value in [-1, 1] for each text in `sentences` (a list of str)."""
    if not sentences:
        # Do not send an empty batch to the tokenizer/model
        return []

    # Inputs must be on the same device as the model weights
    model_device = next(model.parameters()).device
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(model_device)

    # torch.no_grad() temporarily disables gradient calculation, which reduces memory usage
    # and speeds up computation for forward passes only (inference).
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax for probability scaling, i.e. sum of all probabilities = 1
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Evenly spaced class weights from -1 to 1 (for 5 classes: -1, -0.5, 0, 0.5, 1),
    # created with the dtype and on the device of the probabilities so that the product works.
    weights = torch.linspace(-1, 1, probabilities.shape[1]).to(probabilities)
    return torch.matmul(probabilities, weights).tolist()

#### Step 3.4: Combine everything into one pipeline

In [10]:
def score(html_content:str, title_sentences:str, name:str, language:str, algorithm:str, model, tokenizer):
    """
    Run the full pipeline for one article and return its sentiment score.

    The article text is cleaned, condensed into an extractive summary (8 key phrases,
    12 sentences, focused on the lower-cased `name`), and the summary is scored together
    with the title. The mention count produced by the sentiment step is discarded.
    """
    cleaned_article = clean_string(html_content)
    condensed_article = _extractive_summary(
        text=cleaned_article,
        focus=name.lower(),
        key_phrases=8,
        no_of_sentences=12,
        lang=language,
        algo=algorithm,
    )
    article_score, _mentions = _calculate_sentiment(condensed_article, title_sentences, name, model, tokenizer)
    return article_score

In [11]:
html_text = "What is Volkswagen accused of?\nIt's been dubbed the \"diesel dupe\". In September, the Environmental Protection Agency (EPA) found that many VW cars being sold in America had a \"defeat device\" - or software - in diesel engines that could detect when they were being tested, changing the performance accordingly to improve results. The German car giant has since admitted cheating emissions tests in the US.\nVW has had a major push to sell diesel cars in the US, backed by a huge marketing campaign trumpeting its cars' low emissions. The EPA's findings cover 482,000 cars in the US only, including the VW-manufactured Audi A3, and the VW models Jetta, Beetle, Golf and Passat. But VW has admitted that about 11 million cars worldwide, including eight million in Europe, are fitted with the so-called \"defeat device\".\nThe company has also been accused by the EPA of modifying software on the 3 litre diesel engines fitted to some Porsche and Audi as well as VW models. VW has denied the claims, which affect at least 10,000 vehicles.\nIn November, VW said it had found \"irregularities\" in tests to measure carbon dioxide emissions levels that could affect about 800,000 cars in Europe - including petrol vehicles. However, in December it said that following investigations, it had established that this only affected about 36,000 of the cars it produces each year.\nThis 'defeat device' sounds like a sophisticated piece of kit.\nFull details of how it worked are sketchy, although the EPA has said that the engines had computer software that could sense test scenarios by monitoring speed, engine operation, air pressure and even the position of the steering wheel.\nWhen the cars were operating under controlled laboratory conditions - which typically involve putting them on a stationary test rig - the device appears to have put the vehicle into a sort of safety mode in which the engine ran below normal power and performance. 
Once on the road, the engines switched out of this test mode.\nThe result? The engines emitted nitrogen oxide pollutants up to 40 times above what is allowed in the US.\nWhat has been VW's response?\n\"We've totally screwed up,\" said VW America boss Michael Horn, while the group's chief executive at the time, Martin Winterkorn, said his company had \"broken the trust of our customers and the public\". Mr Winterkorn resigned as a direct result of the scandal and was replaced by Matthias Mueller, the former boss of Porsche.\n\"My most urgent task is to win back trust for the Volkswagen Group - by leaving no stone unturned,\" Mr Mueller said on taking up his new post.\nVW has also launched an internal inquiry.\nBut that's unlikely to be the end of the financial impact. The EPA has the power to fine a company up to $37,500 for each vehicle that breaches standards - a maximum fine of about $18bn.\nThe costs of possible legal action by car owners and shareholders \"cannot be estimated at the current time\", VW added.\nHow widespread are VW's problems?\nWhat started in the US has spread to a growing number of countries. The UK, Italy, France, South Korea, Canada and, of course, Germany, have opened investigations. 
Throughout the world, politicians, regulators and environmental groups are questioning the legitimacy of VW's emissions testing.\nVW will recall 8.5 million cars in Europe, including 2.4 million in Germany and 1.2 million in the UK, and 500,000 in the US as a result of the emissions scandal.\nNo wonder the carmaker's shares have fallen by about a third since the scandal broke.\nWill more heads roll?\nIt's still unclear who knew what and when, although VW must have had a chain of management command that approved fitting cheating devices to its engines, so further departures are likely.\nChristian Klingler, a management board member and head of sales and marketing is leaving the company, although VW said this was part of long-term planned structural changes and was not related to recent events.\nIn 2014, in the US, regulators raised concerns about VW emissions levels, but these were dismissed by the company as \"technical issues\" and \"unexpected\" real-world conditions. If executives and managers wilfully misled officials (or their own VW superiors) it's difficult to see them surviving.\nAre other carmakers implicated?\nThat's for the various regulatory and government inquiries to determine. California's Air Resources Board is now looking into other manufacturers' testing results. 
Ford, BMW and Renault-Nissan have said they did not use \"defeat devices\", while other firms have either not commented or simply stated that they comply with the law.\nThe UK trade body for the car industry, the SMMT, said: \"The EU operates a fundamentally different system to the US - with all European tests performed in strict conditions as required by EU law and witnessed by a government-appointed independent approval agency.\"\nBut it added: \"The industry acknowledges that the current test method is outdated and is seeking agreement from the European Commission for a new emissions test that embraces new testing technologies and is more representative of on-road conditions.\"\nThat sounds like EU testing rules need tightening, too.\nEnvironmental campaigners have long argued that emissions rules are being flouted. \"Diesel cars in Europe operate with worse technology on average than the US,\" said Jos Dings, from the pressure group Transport & Environment. \"Our latest report demonstrated that almost 90% of diesel vehicles didn't meet emission limits when they drive on the road. We are talking millions of vehicles.\"\nCar analysts at the financial research firm Bernstein agree that European standards are not as strict as those in the US. However, the analysts said in a report that there was, therefore, \"less need to cheat\". So, if other European carmakers' results are suspect, Bernstein says the \"consequences are likely to be a change in the test cycle rather than legal action and fines\".\nIt's all another blow for the diesel market.\nCertainly is. Over the past decade and more, carmakers have poured a fortune into the production of diesel vehicles - with the support of many governments - believing that they are better for the environment. Latest scientific evidence suggests that's not the case, and there are even moves to limit diesel cars in some cities.\nDiesel sales were already slowing, so the VW scandal came at a bad time. 
\"The revelations are likely to lead to a sharp fall in demand for diesel engine cars,\" said Richard Gane, automotive expert at consultants Vendigital.\n\"In the US, the diesel car market currently represents around 1% of all new car sales and this is unlikely to increase in the short to medium term.\n\"However, in Europe the impact could be much more significant, leading to a large tranche of the market switching to petrol engine cars virtually overnight.\""
summary = "In September, the Environmental Protection Agency (EPA) found that many VW cars being sold in America had a \"defeat device\" - or software - in diesel engines that could detect when they were being tested, changing the performance accordingly to improve results. VW has had a major push to sell diesel cars in the US, backed by a huge marketing campaign trumpeting its cars' low emissions. The company has also been accused by the EPA of modifying software on the 3 litre diesel engines fitted to some Porsche and Audi as well as VW models. In November, VW said it had found \"irregularities\" in tests to measure carbon dioxide emissions levels that could affect about 800,000 cars in Europe - including petrol vehicles. VW will recall 8.5 million cars in Europe, including 2.4 million in Germany and 1.2 million in the UK, and 500,000 in the US as a result of the emissions scandal."
title = "Volkswagen: The scandal explained - BBC News"
name = "Volkswagen"

In [13]:
# Score the sample article end to end with the algorithm selected in `algo_name`.
score(
    html_content=html_text,
    title_sentences=title,
    name=name,
    language='en',
    algorithm=algo_name,
    model=model,
    tokenizer=tokenizer,
)

36.7015

In [14]:
# Reference run: print the summary produced by the pipeline component's own
# summary() method, with the same limits (8 phrases, 12 sentences) that score() uses.
# This cell works on the raw `html_text`, i.e. without clean_string.
doc = nlp_en(html_text)

focus = "Volkswagen"
doc._.textrank.change_focus(focus, bias=10.0, default_bias=0.0)
top_sentences = doc._.textrank.summary(limit_phrases=8, limit_sentences=12)
for sent in top_sentences:
    print(sent)

What is Volkswagen accused of?

"My most urgent task is to win back trust for the Volkswagen Group - by leaving no stone unturned," Mr Mueller said on taking up his new post.
Mr Winterkorn resigned as a direct result of the scandal and was replaced by Matthias Mueller, the former boss of Porsche.
"The revelations are likely to lead to a sharp fall in demand for diesel engine cars," said Richard Gane, automotive expert at consultants Vendigital.
In September, the Environmental Protection Agency (EPA) found that many VW cars being sold in America had a "defeat device" - or software - in diesel engines that could detect when they were being tested, changing the performance accordingly to improve results.

It's been dubbed the "diesel dupe".
The German car giant has since admitted cheating emissions tests in the US.

VW has had a major push to sell diesel cars in the US, backed by a huge marketing campaign trumpeting its cars' low emissions.
The EPA's findings cover 482,000 cars in the US 