In [1]:
# !wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [2]:
# !pip install transformers

In [3]:
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint

from transformers import pipeline

from typing import Literal, List

# increase the maximum column width pandas uses when printing text columns
pd.set_option("display.max_colwidth", 200)
# seed NumPy's global RNG so the random document pick below is reproducible
np.random.seed(1234)

In [4]:
# load the BBC news dataset; the columns used below are "text" and "labels"
df = pd.read_csv("bbc_text_cls.csv")

In [5]:
df.head()

Unnamed: 0,text,labels
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is n...",business
1,Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAn...,business
2,Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (£479m) loan.\n\nState-owned Rosneft bou...,business
3,"High fuel prices hit BA's profits\n\nBritish Airways has blamed high fuel prices for a 40% drop in profits.\n\nReporting its results for the three months to 31 December 2004, the airline made a pr...",business
4,Pernod takeover talk lifts Domecq\n\nShares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the target of a takeover by France's Pernod Ricard.\n\nReports in th...,business


In [6]:
def beautifyText(t: str):
    """Wrap a text to textwrap's default line width for readable printing.

    Existing whitespace characters (including the newlines that separate
    paragraphs) are kept as they are instead of being collapsed to spaces,
    and sentence endings are normalised to be followed by two spaces.

    Args:
        t: The text to wrap.

    Returns:
        The wrapped text as a single string.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(t)

In [7]:
# add a line-wrapped copy of every article so that printing it is readable
df["text_beauty"] = df["text"].apply(beautifyText)

In [8]:
df.head()

Unnamed: 0,text,labels,text_beauty
0,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is n...",business,"Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant\nTimeWarner jumped 76% to $1.13bn (£600m) for the three months to\nDecember, from $639m year-earlier.\n\nThe firm, which is..."
1,Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\n\nAn...,business,Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level\nagainst the euro in almost three months after the Federal Reserve head\nsaid the US trade deficit is set to stabilise.\n\n...
2,Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (£479m) loan.\n\nState-owned Rosneft bou...,business,Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil\ngiant Yukos are to ask the buyer of its former production unit to pay\nback a $900m (£479m) loan.\n\nState-owned Rosneft b...
3,"High fuel prices hit BA's profits\n\nBritish Airways has blamed high fuel prices for a 40% drop in profits.\n\nReporting its results for the three months to 31 December 2004, the airline made a pr...",business,"High fuel prices hit BA's profits\n\nBritish Airways has blamed high\nfuel prices for a 40% drop in profits.\n\nReporting its results for the\nthree months to 31 December 2004, the airline made a ..."
4,Pernod takeover talk lifts Domecq\n\nShares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the target of a takeover by France's Pernod Ricard.\n\nReports in th...,business,Pernod takeover talk lifts Domecq\n\nShares in UK drinks and food firm\nAllied Domecq have risen on speculation that it could be the target of\na takeover by France's Pernod Ricard.\n\nReports in ...


In [9]:
print(df["text_beauty"].iloc[0])

Ad sales boost Time Warner profit

Quarterly profits at US media giant
TimeWarner jumped 76% to $1.13bn (£600m) for the three months to
December, from $639m year-earlier.

The firm, which is now one of the
biggest investors in Google, benefited from sales of high-speed
internet connections and higher advert sales.  TimeWarner said fourth
quarter sales rose 2% to $11.1bn from $10.9bn.  Its profits were
buoyed by one-off gains which offset a profit dip at Warner Bros, and
less users for AOL.

Time Warner said on Friday that it now owns 8% of
search-engine Google.  But its own internet business, AOL, had has
mixed fortunes.  It lost 464,000 subscribers in the fourth quarter
profits were lower than in the preceding three quarters.  However, the
company said AOL's underlying profit before exceptional items rose 8%
on the back of stronger internet advertising revenues.  It hopes to
increase subscribers by offering the online service free to TimeWarner
internet customers and will try to sign 

In [10]:
# unique() lists each label once, in order of first appearance
# (a set would also deduplicate, but has no defined order)
labels = df["labels"].unique().tolist()
labels

['business', 'entertainment', 'politics', 'sport', 'tech']

In [11]:
label = 'business'

In [12]:
# keep only the wrapped articles whose label equals `label`
texts = df[df["labels"] == label]["text_beauty"]

In [13]:
texts

0      Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant\nTimeWarner jumped 76% to $1.13bn (£600m) for the three months to\nDecember, from $639m year-earlier.\n\nThe firm, which is...
1      Dollar gains on Greenspan speech\n\nThe dollar has hit its highest level\nagainst the euro in almost three months after the Federal Reserve head\nsaid the US trade deficit is set to stabilise.\n\n...
2      Yukos unit buyer faces loan claim\n\nThe owners of embattled Russian oil\ngiant Yukos are to ask the buyer of its former production unit to pay\nback a $900m (£479m) loan.\n\nState-owned Rosneft b...
3      High fuel prices hit BA's profits\n\nBritish Airways has blamed high\nfuel prices for a 40% drop in profits.\n\nReporting its results for the\nthree months to 31 December 2004, the airline made a ...
4      Pernod takeover talk lifts Domecq\n\nShares in UK drinks and food firm\nAllied Domecq have risen on speculation that it could be the target of\na takeover by France'

In [14]:
# draw a random position in `texts` and fetch that document by position
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]

In [15]:
print(doc)

Bombardier chief to leave company

Shares in train and plane-making
giant Bombardier have fallen to a 10-year low following the departure
of its chief executive and two members of the board.

Paul Tellier,
who was also Bombardier's president, left the company amid an ongoing
restructuring.  Laurent Beaudoin, part of the family that controls the
Montreal-based firm, will take on the role of CEO under a newly
created management structure.  Analysts said the resignations seem to
have stemmed from a boardroom dispute.  Under Mr Tellier's tenure at
the company, which began in January 2003, plans to cut the worldwide
workforce of 75,000 by almost a third by 2006 were announced.  The
firm's snowmobile division and defence services unit were also sold
and Bombardier started the development of a new aircraft seating 110
to 135 passengers.

Mr Tellier had indicated he wanted to stay at the
world's top train maker and third largest manufacturer of civil
aircraft until the restructuring was comple

In [16]:
# Name the checkpoint explicitly instead of relying on the task default.
# "distilroberta-base" is the model the default resolved to, so results are
# unchanged, and they stay stable if the library's default ever changes.
mlm = pipeline("fill-mask", model="distilroberta-base")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
mlm("Bombardier chief to leave <mask>.")

[{'score': 0.159628227353096,
  'token': 1470,
  'token_str': ' France',
  'sequence': 'Bombardier chief to leave France.'},
 {'score': 0.10781937092542648,
  'token': 2201,
  'token_str': ' Paris',
  'sequence': 'Bombardier chief to leave Paris.'},
 {'score': 0.08278702944517136,
  'token': 5817,
  'token_str': ' Montreal',
  'sequence': 'Bombardier chief to leave Montreal.'},
 {'score': 0.053658682852983475,
  'token': 558,
  'token_str': ' office',
  'sequence': 'Bombardier chief to leave office.'},
 {'score': 0.0398067831993103,
  'token': 896,
  'token_str': ' Canada',
  'sequence': 'Bombardier chief to leave Canada.'}]

In [18]:
# Build the opening sentence of the sampled document as one string
# (one literal per line of the wrapped text), with "train" masked out:
#   Shares in train and plane-making
#   giant Bombardier have fallen to a 10-year low following the departure
#   of its chief executive and two members of the board.
text = (
    "Shares in <mask> and plane-making\n"
    "giant Bombardier have fallen to a 10-year low following the departure\n"
    "of its chief executive and two members of the board."
)

mlm(text)

[{'score': 0.29885196685791016,
  'token': 15064,
  'token_str': ' aerospace',
  'sequence': 'Shares in aerospace and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.22622795403003693,
  'token': 9848,
  'token_str': ' aviation',
  'sequence': 'Shares in aviation and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.10739992558956146,
  'token': 11016,
  'token_str': ' Airbus',
  'sequence': 'Shares in Airbus and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.06216571852564812,
  'token': 8537,
  'token_str': ' airlines',
  'sequence': 'Shares in airlines and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two m

In [19]:

# Same sentence, this time masking "departure".
text = (
    "Shares in train and plane-making\n"
    "giant Bombardier have fallen to a 10-year low following the <mask>\n"
    "of its chief executive and two members of the board."
)

mlm(text)

[{'score': 0.44914424419403076,
  'token': 6985,
  'token_str': ' resignation',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the resignation\nof its chief executive and two members of the board.'},
 {'score': 0.2614488899707794,
  'token': 5824,
  'token_str': ' departure',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.09727979451417923,
  'token': 14289,
  'token_str': ' dismissal',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the dismissal\nof its chief executive and two members of the board.'},
 {'score': 0.07329631596803665,
  'token': 25624,
  'token_str': ' departures',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departures\nof its chief executive and two me

In [20]:

# Same sentence, this time masking "executive".
text = (
    "Shares in train and plane-making\n"
    "giant Bombardier have fallen to a 10-year low following the departure\n"
    "of its chief <mask> and two members of the board."
)

mlm(text)

[{'score': 0.9742476940155029,
  'token': 1031,
  'token_str': ' executive',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.018965229392051697,
  'token': 4585,
  'token_str': ' executives',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executives and two members of the board.'},
 {'score': 0.0022506467066705227,
  'token': 1036,
  'token_str': ' officer',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief officer and two members of the board.'},
 {'score': 0.001639050547964871,
  'token': 8083,
  'token_str': ' engineer',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief engineer and two members of

In [21]:

# Same sentence, this time masking "members".
text = (
    "Shares in train and plane-making\n"
    "giant Bombardier have fallen to a 10-year low following the departure\n"
    "of its chief executive and two <mask> of the board."
)

mlm(text)

[{'score': 0.788566529750824,
  'token': 453,
  'token_str': ' members',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two members of the board.'},
 {'score': 0.1775282621383667,
  'token': 29193,
  'token_str': ' thirds',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two thirds of the board.'},
 {'score': 0.016065888106822968,
  'token': 5392,
  'token_str': ' directors',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two directors of the board.'},
 {'score': 0.0031229080632328987,
  'token': 10826,
  'token_str': ' chairs',
  'sequence': 'Shares in train and plane-making\ngiant Bombardier have fallen to a 10-year low following the departure\nof its chief executive and two chairs of the b

In [22]:
# Write a function that automatically masks and replaces words
# in a whole document. You might choose which words to replace based on some
# statistic, e.g. highest TF-IDF.

# 1. We need to take the corpus as a List (outer) of lists (inner),
# the outer list is the list of documents, each inner list is the list of tokens from that document

import re
import math
from collections import defaultdict


def get_tokens(text: str):
    """Split a string into word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore) and apostrophes; any other character acts as a separator
    and is dropped.

    Args:
      text: The string to be tokenized.

    Returns:
      A list of tokens, in order of appearance.
    """
    return re.findall(r"[\w']+", text)


# Raw document strings and their token lists; both lists share the same order.
corpus_texts = texts.values.tolist()
corpus = [get_tokens(doc_str) for doc_str in corpus_texts]

# 2. We will need all the distinct tokens of the corpus, in a variable: corpus_tokens_set
# (a comprehension keeps its loop variables local, so the notebook-level `i`
# that selected `doc` earlier is not overwritten)
corpus_tokens_set = {tok for doc_tokens in corpus for tok in doc_tokens}

# 3. Calculate idf of all tokens in corpus_tokens_set
def get_idf_dict(corpus: List[List[str]], corpus_tokens_set: set):
    """Compute the inverse document frequency (idf) of the corpus tokens.

    For each token ``idf = log(N / df)`` (natural logarithm), where ``N`` is
    the number of documents and ``df`` is the number of documents that
    contain the token at least once.

    Args:
        corpus (List[List[str]]): One list of tokens per document.
        corpus_tokens_set (set): The tokens for which an idf is wanted.

    Returns:
        defaultdict(float): Maps each token of ``corpus_tokens_set`` that
        occurs in at least one document to its idf. Tokens that occur in no
        document are left out, so looking them up yields the default 0.0.
    """
    n_docs = len(corpus)

    # Single pass over the corpus: each document contributes at most one count
    # per distinct token, instead of scanning every document for every token.
    doc_freq = defaultdict(int)
    for doc_tokens in corpus:
        for tok in set(doc_tokens):
            if tok in corpus_tokens_set:
                doc_freq[tok] += 1

    idf_dict = defaultdict(float)
    for tok, doc_count in doc_freq.items():
        # doc_count >= 1 here, so the division is always defined
        idf_dict[tok] = math.log(n_docs / doc_count)
    return idf_dict


idf_dict = get_idf_dict(corpus, corpus_tokens_set)

# 4. Calculate term frequencies (tf) dict of a doc.
def get_tf_dict(doc_tokens):
    """Compute the relative term frequency (tf) of every token of a document.

    Args:
        doc_tokens: The tokens of one document, repeats included.

    Returns:
        defaultdict(float): Maps each distinct token to the share of the
        document's tokens it accounts for. Empty for an empty document.
    """
    frequencies = defaultdict(float)
    n_tokens = len(doc_tokens)
    if n_tokens == 0:
        return frequencies

    # every occurrence adds the same share; summing per occurrence keeps the
    # floating-point result identical to adding 1 / len(doc_tokens) each time
    share = 1 / n_tokens
    for token in doc_tokens:
        frequencies[token] = frequencies[token] + share
    return frequencies

# 5. Calculate tf-idf dict for a given doc.
def get_tf_idf_dict(tf_dict, idf_dict):
    """Combine term frequencies and idf values into tf-idf scores.

    Args:
        tf_dict: Maps each token of one document to its term frequency.
        idf_dict: Maps tokens to their inverse document frequency; it is
            indexed with every token of ``tf_dict``.

    Returns:
        defaultdict(float): Maps each token of ``tf_dict`` to ``tf * idf``.
    """
    scores = defaultdict(float)
    scores.update(
        (token, tf * idf_dict[token]) for token, tf in tf_dict.items()
    )
    return scores

# 6. Function to get the top tf-idf tokens
def get_top_tf_idf_tokens(tf_idf_dict, percent):
    """Return the tokens with the highest tf-idf scores.

    Args:
        tf_idf_dict: Maps each token to its tf-idf score.
        percent: Fraction of the tokens to keep (e.g. 0.1 keeps the top
            tenth); the resulting count is rounded down.

    Returns:
        A list of tokens ordered from highest to lowest score. Tokens with
        equal scores keep their order from ``tf_idf_dict``.
    """
    keep = math.floor(len(tf_idf_dict) * percent)
    ranked_tokens = sorted(tf_idf_dict, key=tf_idf_dict.get, reverse=True)
    return ranked_tokens[:keep]
  
# 7. Finally, function to replace the words with greatest tf-idf scores.
def replaceWords(doc_text, doc_tokens, mlm_model, percent, idf_dict):
    """Replace the highest tf-idf words of a document with fill-mask predictions.

    The document is processed line by line. Each selected word is masked on
    its own (all other words of the line stay as in the original) and is
    replaced by the model's top prediction for that position.

    Words are compared verbatim after splitting each line on single spaces,
    so a word with attached punctuation (e.g. "profit,") is only replaced if
    that exact string is among the selected tokens.

    Args:
        doc_text: The document as one string; lines are separated by "\\n".
        doc_tokens: The tokens of the document, used to compute tf-idf.
        mlm_model: A callable that takes a string containing one mask token
            and returns a list of candidates, best first, each a mapping
            with a "token_str" entry (e.g. a Hugging Face fill-mask pipeline).
        percent: Fraction of the document's distinct tokens to replace.
        idf_dict: Maps tokens to their inverse document frequency.

    Returns:
        The document text with the selected words replaced; line breaks and
        spacing are preserved.
    """
    tf_dict = get_tf_dict(doc_tokens)
    tf_idf_dict = get_tf_idf_dict(tf_dict, idf_dict)
    # A set gives constant-time membership tests in the per-word loop below.
    tokens_to_replace = set(get_top_tf_idf_tokens(tf_idf_dict, percent))

    # Use the model's own mask token when it exposes one (e.g. "[MASK]" for
    # BERT-style tokenizers); fall back to the RoBERTa-style "<mask>".
    tokenizer = getattr(mlm_model, "tokenizer", None)
    mask_token = getattr(tokenizer, "mask_token", None) or "<mask>"

    replaced_lines = []
    for line in doc_text.split("\n"):
        words = line.split(" ")
        new_words = []
        for idx, word in enumerate(words):
            if word not in tokens_to_replace:
                new_words.append(word)
                continue
            # Mask only this position; the context is always the original line.
            masked_line = " ".join(words[:idx] + [mask_token] + words[idx + 1:])
            best_candidate = mlm_model(masked_line)[0]
            new_words.append(best_candidate["token_str"].strip())
        replaced_lines.append(" ".join(new_words))
    return "\n".join(replaced_lines)

# Now we run to get some examples.
N_EXAMPLES = 10         # number of documents to show
EXCERPT_CHARS = 300     # characters printed per excerpt
REPLACE_FRACTION = 0.1  # share of each document's distinct tokens to replace

print("Examples of the replacing terms:\n\n\n")
# Iterate over a slice rather than range(10): a corpus with fewer documents
# no longer raises IndexError, and no notebook-level index is overwritten.
for doc_text, doc_tokens in zip(corpus_texts[:N_EXAMPLES], corpus):
    print("Original text excerpt:")
    print(doc_text[:EXCERPT_CHARS])
    print("\n---------------------")
    replaced_text = replaceWords(doc_text, doc_tokens, mlm, REPLACE_FRACTION, idf_dict)

    print("Replaced text excerpt:")
    print(replaced_text[:EXCERPT_CHARS])

    print("\n\n\n---------------------\n---------Next--------\n---------------------\n\n\n")


Examples of the replacing terms:



Original text excerpt:
Ad sales boost Time Warner profit

Quarterly profits at US media giant
TimeWarner jumped 76% to $1.13bn (£600m) for the three months to
December, from $639m year-earlier.

The firm, which is now one of the
biggest investors in Google, benefited from sales of high-speed
internet connections and highe

---------------------
Replaced text excerpt:
Ad ventures boost Time Warner Cable

Record wins at US media giant
Sales jumped 76% to $1.13bn (£600m) for the three months to
December, from $639m year-earlier.

The firm, which is now one of the
biggest investors in Google, benefited from adoption of high-speed
broadband connections and higher sma



---------------------
---------Next--------
---------------------



Original text excerpt:
Dollar gains on Greenspan speech

The dollar has hit its highest level
against the euro in almost three months after the Federal Reserve head
said the US trade deficit is set to stabilise.

And Alan