In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from bertopic import BERTopic
import pandas as pd 
from transformers.pipelines import pipeline
import os


# Loading articles

In [4]:
df_articles = pd.read_parquet("../files/bbc_news.parquet", engine="pyarrow")
df_articles.head(1)


Unnamed: 0,title,published_date,authors,description,section,content,link,top_image
0,Saido Berahino: Stoke complete deal to sign We...,2017-01-21,,Stoke sign West Brom striker Saido Berahino fo...,,Last updated on .From the section Football\n\n...,http://www.bbc.co.uk/sport/football/38696547,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...


In [14]:
df_articles = df_articles.drop_duplicates('content').reset_index(drop=True)

In [15]:
df_articles['text'] = df_articles.apply(lambda x: 'Article Title: ' + x['title'] + '\n' + "Article Body :" + x['content'], axis=1)

In [16]:
docs = df_articles['text'].tolist()

In [9]:
print(docs[0][0:200])

Article Title: Saido Berahino: Stoke complete deal to sign West Brom's 23-year-old striker - BBC Sport
Article Body :Last updated on .From the section Football

Stoke have signed West Brom striker Sai


# Training Bertopic

In [8]:
topic_model = BERTopic()

In [9]:
topics, probs = topic_model.fit_transform(docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Pushing model to Hugging Face 


In [10]:
# topic_model.push_to_hf_hub(
    "CarlosMorales/bbc_news_topics",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

ctfidf.safetensors:   0%|          | 0.00/52.1M [00:00<?, ?B/s]

topic_embeddings.safetensors:   0%|          | 0.00/2.02M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CarlosMorales/bbc_news_topics/commit/c84783a40b94a70e3d127c46228d183d9bab9272', commit_message='Add BERTopic model', commit_description='', oid='c84783a40b94a70e3d127c46228d183d9bab9272', pr_url=None, pr_revision=None, pr_num=None)

# Loading model from HF 

In [7]:
topic_model = BERTopic.load("CarlosMorales/bbc_news_topics")


# Finding articles about conflicts 

In [13]:
bertopic_docs = topic_model.get_document_info(df_articles)
bertopic_docs

TypeError: Make sure to supply a list of strings, not a dataframe.

In [30]:
ukraine = {
    'start_date': '2022-02-24',
    'topic_name': 'Ukraine Russia War',
}
palestine = {
    'start_date': '2022-10-07',
    'topic_name': 'Israel Palestine War',
}

list_conflicts = [ukraine, palestine]

In [31]:
bertopic_docs = topic_model.get_document_info(docs)
df = pd.concat([df_articles, bertopic_docs], axis=1)
df.head(1)

Unnamed: 0,title,published_date,authors,description,section,content,link,top_image,text,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
0,Saido Berahino: Stoke complete deal to sign We...,2017-01-21,,Stoke sign West Brom striker Saido Berahino fo...,,Last updated on .From the section Football\n\n...,http://www.bbc.co.uk/sport/football/38696547,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...,Article Title: Saido Berahino: Stoke complete ...,Article Title: Saido Berahino: Stoke complete ...,-1,-1_and_her_was_of,"[and, her, was, of, to, is, content, for, she,...",,and - her - was - of - to - is - content - for...,False


In [32]:
list_conflicts

[{'start_date': '2022-02-24', 'topic_name': 'Ukraine Russia War'},
 {'start_date': '2022-10-07', 'topic_name': 'Israel Palestine War'}]

In [58]:
conflict_list = []

for conflict in list_conflicts:
    start_date = conflict['start_date']
    topic_name = conflict['topic_name']
    topic_numbers = topic_model.find_topics(topic_name, top_n=3)[0]
    
    df_temp = df[df.Topic.isin(topic_numbers)]
    df_temp = df_temp[(df_temp['content'].map(len) > 800) & (df_temp['content'].map(len) < 2000)]
    df_temp = df_temp[df_temp['published_date'] > "2017-01-21"]
    df_temp['conflict'] = topic_name  

    conflict_list.append(df_temp)
    
    print(f"Number of articles for {topic_name} after {start_date}: {len(df_temp)}")
    print(f"Topic names for {topic_name} are {df_temp.Name.unique()}")


Number of articles for Ukraine Russia War after 2022-02-24: 12
Topic names for Ukraine Russia War are ['59_russian_ukraine_ukrainian_war' '778_ukraine_ukraines_russian_russia']
Number of articles for Israel Palestine War after 2022-10-07: 11
Topic names for Israel Palestine War are ['243_gaza_israel_hamas_israeli' '1110_israel_netanyahu_gaza_biden']


In [60]:
df_conflict_articles = pd.concat(conflict_list)
df_conflict_articles = df_conflict_articles[['conflict', 'title', 'published_date',  'description', 'section', 'content', 'link', 
                                             'Name', 'Representation', 'Top_n_words', 'Representative_document' ]]
df_conflict_articles = df_conflict_articles.reset_index(drop=True)
df_conflict_articles


Unnamed: 0,conflict,title,published_date,description,section,content,link,Name,Representation,Top_n_words,Representative_document
0,Ukraine Russia War,Ukraine war latest: Biden calls for Putin war ...,2022-04-03,"The BBC has seen fresh evidence of atrocities,...",Europe,"Olha Sukhenko, her son Oleksander and husband ...",http://www.bbc.co.uk/news/live/world-europe-60...,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
1,Ukraine Russia War,Ukraine war latest: Battles in Donbas will loo...,2022-04-07,Foreign Minister Dmytro Kuleba calls on Nato a...,Europe,"Land warfare analyst Nick Reynolds, who works ...",http://www.bbc.co.uk/news/live/world-europe-60...,778_ukraine_ukraines_russian_russia,"[ukraine, ukraines, russian, russia, counterof...",ukraine - ukraines - russian - russia - counte...,False
2,Ukraine Russia War,Ukraine latest news: Leaders positive about EU...,2022-06-14,"Germany, France, Italy and Romania support Ukr...",Europe,In a field about six miles from the eastern Uk...,http://www.bbc.co.uk/news/live/world-europe-61...,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
3,Ukraine Russia War,Ukraine war latest: Russia defends death sente...,2022-06-10,Russia's foreign minister claims the captured ...,Europe,Shevchenko says his mother has been able to le...,http://www.bbc.co.uk/news/live/world-europe-61...,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
4,Ukraine Russia War,Ukraine war: Controversial referendums being h...,2022-09-24,Russian-backed officials in four occupied regi...,World,One man told the BBC he was in the queue to cr...,http://www.bbc.co.uk/news/live/world-63002591,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
5,Ukraine Russia War,Ukraine says mass burial sites found in retake...,2022-10-08,A Ukrainian official said two sites were found...,Europe,Destroyed vehicles remain in the city of Lyman...,http://www.bbc.co.uk/news/world-europe-63181475,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
6,Ukraine Russia War,As it happened: Zelensky tells Congress Ukrain...,2022-12-21,Ukraine's president gets a standing ovation fr...,US & Canada,A brief history of the war in Ukraine\n\nLet's...,http://www.bbc.co.uk/news/live/world-us-canada...,778_ukraine_ukraines_russian_russia,"[ukraine, ukraines, russian, russia, counterof...",ukraine - ukraines - russian - russia - counte...,False
7,Ukraine Russia War,Ukraine war: Ben Wallace says 97% of Russian a...,2023-02-14,The UK Defence Secretary says Putin's forces a...,Europe,The war in Ukraine will “likely” continue into...,http://www.bbc.co.uk/news/live/world-europe-64...,778_ukraine_ukraines_russian_russia,"[ukraine, ukraines, russian, russia, counterof...",ukraine - ukraines - russian - russia - counte...,False
8,Ukraine Russia War,Ukraine war latest news: Victory is inevitable...,2023-02-24,As the first German-made tanks arrive in Ukrai...,Europe,Irina Filkina was killed by Russian troops in ...,http://www.bbc.co.uk/news/live/world-europe-64...,59_russian_ukraine_ukrainian_war,"[russian, ukraine, ukrainian, war, soldiers, b...",russian - ukraine - ukrainian - war - soldiers...,False
9,Ukraine Russia War,Ukraine war latest: 'Extremely fierce battles'...,2023-06-14,Ukraine's ongoing counter-offensive has result...,Europe,Mystery surrounds the health of top Chechen co...,http://www.bbc.co.uk/news/live/world-europe-65...,778_ukraine_ukraines_russian_russia,"[ukraine, ukraines, russian, russia, counterof...",ukraine - ukraines - russian - russia - counte...,False


In [61]:
from datasets import Dataset

dataset_conflict_articles = Dataset.from_pandas(df_conflict_articles)

dataset_conflict_articles.push_to_hub("CarlosMorales/news_bbc_international_conflicts")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/664 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CarlosMorales/news_bbc_international_conflicts/commit/1f91842587a9ff636dfe19bdf39a5d46c79ae8a8', commit_message='Upload dataset', commit_description='', oid='1f91842587a9ff636dfe19bdf39a5d46c79ae8a8', pr_url=None, pr_revision=None, pr_num=None)

In [62]:
from datasets import load_dataset
dataset = load_dataset("CarlosMorales/news_bbc_international_conflicts")

Downloading readme:   0%|          | 0.00/661 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 39.7k/39.7k [00:00<00:00, 201kB/s]


Generating train split:   0%|          | 0/23 [00:00<?, ? examples/s]