In [27]:
import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import text2emotion as te
from bertopic import BERTopic

In [3]:
import spacy
from spacy import displacy
import spacy_transformers
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [62]:
df=pd.read_csv("bbc-news-data.csv",sep="\t")
df=df[:700]

In [63]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/animeshsengupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/animeshsengupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/animeshsengupta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [64]:
def clean_text(x):
  x = str(x)
  x = x.lower()
  x = re.sub(r'#[A-Za-z0-9]*', ' ', x)
  x = re.sub(r'https*://.*', ' ', x)
  x = re.sub(r'@[A-Za-z0-9]+', ' ', x)
  tokens = word_tokenize(x)
  x = ' '.join([w for w in tokens if not w.lower() in stop_words])
  x = re.sub(r'[%s]' % re.escape('!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~“…”’'), ' ', x)
  x = re.sub(r'\d+', ' ', x)
  x = re.sub(r'\n+', ' ', x)
  x = re.sub(r'\s{2,}', ' ', x)
  return x

In [65]:
ner = spacy.load('en_core_web_sm')

In [66]:
def extract_ner(df):
    df["Named_entities"]=ner(clean_text(df["description"]))
    return df



In [67]:
df['clean_text'] = df.content.apply(clean_text)

In [68]:
df['clean_ner']=df.clean_text.apply(ner)

In [69]:

s=df.clean_text[1]
print(te.get_emotion(s))

df["emotions"]=df.clean_text.apply(te.get_emotion)

#dfn['emotions']=dfn.clean_ner.apply(get_emotion)

{'Happy': 0.09, 'Angry': 0.09, 'Surprise': 0.14, 'Sad': 0.14, 'Fear': 0.53}


In [73]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1,shuffle=True)


In [74]:
articles=train.clean_text.to_list()

In [110]:
topic_model = BERTopic(language="english")
topics, probs = topic_model.fit_transform(articles)

In [111]:
df1 = pd.DataFrame({"Document": articles, "Topic": [topic_model.get_topic(topic) for topic in topics],"probs":probs})
df1.head(10)

Unnamed: 0,Document,Topic,probs
0,us dollar hit new record low euro analysts pre...,"[(said, 0.025751953916270436), (bn, 0.01915603...",0.0
1,arsenal vice chairman david dein said club may...,"[(deutsche, 0.051682025531151754), (boerse, 0....",0.549169
2,vodafone drafted uk chief executive william mo...,"[(mci, 0.04976586996510854), (bn, 0.0385368445...",1.0
3,iraq invite bids two telephone licences saying...,"[(said, 0.025751953916270436), (bn, 0.01915603...",0.0
4,open society institute osi financed billionair...,"[(yukos, 0.08373601821774515), (russian, 0.056...",0.817319
5,industrial commercial bank icbc china 's bigge...,"[(said, 0.025751953916270436), (bn, 0.01915603...",0.0
6,general motors world 's largest car maker conf...,"[(car, 0.06464562207029172), (gm, 0.0642083551...",0.960924
7,britannia building society members receive pr...,"[(deutsche, 0.051682025531151754), (boerse, 0....",0.545529
8,last film made slain dutch director theo van g...,"[(film, 0.0681689947586418), (best, 0.04199765...",1.0
9,bt moved pre empt possible break up business o...,"[(mci, 0.04976586996510854), (bn, 0.0385368445...",1.0


In [121]:
train["topics"]=[dict(topic_model.get_topic(topic)) for topic in topics]

train["All_topics"]=train.topics.apply(lambda x:list(x.keys()))

In [122]:
train.head(10)

Unnamed: 0,category,filename,title,content,clean_text,clean_ner,emotions,topics,All_topics
375,business,376.txt,Dollar slides ahead of New Year,The US dollar has hit a new record low agains...,us dollar hit new record low euro analysts pre...,"(us, dollar, hit, new, record, low, euro, anal...","{'Happy': 0.11, 'Angry': 0.04, 'Surprise': 0.0...","{'said': 0.025751953916270436, 'bn': 0.0191560...","[said, bn, us, year, china, bank, new, dollar,..."
270,business,271.txt,Arsenal 'may seek full share listing',Arsenal vice-chairman David Dein has said the...,arsenal vice chairman david dein said club may...,"(arsenal, vice, chairman, david, dein, said, c...","{'Happy': 0.03, 'Angry': 0.0, 'Surprise': 0.08...","{'deutsche': 0.051682025531151754, 'boerse': 0...","[deutsche, boerse, lse, bid, club, euronext, l..."
233,business,234.txt,Vodafone appoints new Japan boss,Vodafone has drafted in its UK chief executiv...,vodafone drafted uk chief executive william mo...,"(vodafone, drafted, uk, chief, executive, will...","{'Happy': 0.14, 'Angry': 0.0, 'Surprise': 0.0,...","{'mci': 0.04976586996510854, 'bn': 0.038536844...","[mci, bn, phone, company, verizon, mobile, sai..."
128,business,129.txt,Iraq to invite phone licence bids,Iraq is to invite bids for two telephone lice...,iraq invite bids two telephone licences saying...,"(iraq, invite, bids, two, telephone, licences,...","{'Happy': 0.06, 'Angry': 0.0, 'Surprise': 0.0,...","{'said': 0.025751953916270436, 'bn': 0.0191560...","[said, bn, us, year, china, bank, new, dollar,..."
356,business,357.txt,Soros group warns of Kazakh close,"The Open Society Institute (OSI), financed by...",open society institute osi financed billionair...,"(open, society, institute, osi, financed, bill...","{'Happy': 0.09, 'Angry': 0.15, 'Surprise': 0.1...","{'yukos': 0.08373601821774515, 'russian': 0.05...","[yukos, russian, gazprom, russia, oil, rosneft..."
395,business,396.txt,Profits jump at China's top bank,"Industrial and Commercial Bank (ICBC), China'...",industrial commercial bank icbc china 's bigge...,"(industrial, commercial, bank, icbc, china, 's...","{'Happy': 0.06, 'Angry': 0.0, 'Surprise': 0.12...","{'said': 0.025751953916270436, 'bn': 0.0191560...","[said, bn, us, year, china, bank, new, dollar,..."
148,business,149.txt,Saab to build Cadillacs in Sweden,"General Motors, the world's largest car maker...",general motors world 's largest car maker conf...,"(general, motors, world, 's, largest, car, mak...","{'Happy': 0.08, 'Angry': 0.0, 'Surprise': 0.09...","{'car': 0.06464562207029172, 'gm': 0.064208355...","[car, gm, fiat, sales, vehicles, nissan, said,..."
236,business,237.txt,Britannia members' £42m windfall,"More than 800,000 Britannia Building Society ...",britannia building society members receive pr...,"( , britannia, building, society, members, rec...","{'Happy': 0.05, 'Angry': 0.0, 'Surprise': 0.14...","{'deutsche': 0.051682025531151754, 'boerse': 0...","[deutsche, boerse, lse, bid, club, euronext, l..."
566,entertainment,057.txt,Dutch watch Van Gogh's last film,The last film to be made by the slain Dutch d...,last film made slain dutch director theo van g...,"(last, film, made, slain, dutch, director, the...","{'Happy': 0.1, 'Angry': 0.0, 'Surprise': 0.33,...","{'film': 0.0681689947586418, 'best': 0.0419976...","[film, best, actor, films, director, awards, a..."
251,business,252.txt,BT offers equal access to rivals,BT has moved to pre-empt a possible break-up ...,bt moved pre empt possible break up business o...,"(bt, moved, pre, empt, possible, break, up, bu...","{'Happy': 0.12, 'Angry': 0.01, 'Surprise': 0.1...","{'mci': 0.04976586996510854, 'bn': 0.038536844...","[mci, bn, phone, company, verizon, mobile, sai..."


In [123]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,166,-1_said_bn_us_year
1,0,76,0_film_best_actor_films
2,1,57,1_music_album_band_chart
3,2,48,2_said_government_countries_economic
4,3,34,3_deutsche_boerse_lse_bid
5,4,29,4_growth_germany_eu_european
6,5,29,5_mr_ebbers_fraud_worldcom
7,6,28,6_yukos_russian_gazprom_russia
8,7,27,7_airline_air_airlines_airbus
9,8,26,8_mci_bn_phone_company


In [127]:
train.emotions.iloc[10]

{'Happy': 0.09, 'Angry': 0.09, 'Surprise': 0.3, 'Sad': 0.11, 'Fear': 0.41}