In [18]:
# import library

import os
import zipfile
import glob
import pandas as pd
import re

from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

from gensim import corpora
from gensim.models.ldamodel import LdaModel

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
# unzip archive

zip = zipfile.ZipFile('bbc.zip')
zip.extractall()

In [21]:
# load classes from bbc dir
lst_dir = os.listdir('bbc')
lst_dir

['business', 'entertainment', 'politics', 'tech', 'sport']

In [22]:
# create data frame
texts = []
labels = []
for topic in lst_dir:
  for file in glob.glob(os.path.join(os.getcwd(), 'bbc', topic, '*.txt')):
    with open (file, encoding='utf-8', mode='r', errors='ignore') as f:
      texts.append(' '.join(f.read().splitlines()))
      labels.append(topic)

In [23]:
df = pd.DataFrame({'text': texts, 'label': labels})

In [24]:
df.head()

Unnamed: 0,text,label
0,Ukraine revisits state sell-offs Ukraine is p...,business
1,"Asia shares defy post-quake gloom Indonesian,...",business
2,Brussels raps mobile call charges The Europea...,business
3,Low-cost airlines hit Eurotunnel Channel Tunn...,business
4,US Ahold suppliers face charges US prosecutor...,business


In [25]:
# delete duplicate news
df = df.drop_duplicates()

In [26]:
df.head()

Unnamed: 0,text,label
0,Ukraine revisits state sell-offs Ukraine is p...,business
1,"Asia shares defy post-quake gloom Indonesian,...",business
2,Brussels raps mobile call charges The Europea...,business
3,Low-cost airlines hit Eurotunnel Channel Tunn...,business
4,US Ahold suppliers face charges US prosecutor...,business


In [27]:
# function for cleaning text

def text_transform(texts, stopwords):

  lemmatizer = WordNetLemmatizer()
  # stemmer = PorterStemmer()
  word_lst = word_tokenize(re.sub('[^a-zA-Z]', ' ', texts.lower()))
  lst = ['v', 'a', 'r', 's', 'n']
  out = []
  for word in word_lst:
    for pos in lst:
      word = lemmatizer.lemmatize(word, pos=pos)
    if len(word) >= 2:
      out.append(word)
  out = [word for word in out if word not in stopwords]
  # out = [stemmer.stem(word) for word in word_lst if word not in stopwords ] # [word for word in word_lst if word not in stopwords ]
  
  return ' '.join(out)

In [28]:
# stopwords from nltk and additional words
stop_words = set(stopwords.words('english') + ['say', 'mr', 'would', 'could', 'also', 'could', 'mr', 'bn', 'gm'])

In [29]:
# cleaning text
df['clean_text'] = df.text.apply(text_transform, stopwords=stop_words)

In [30]:
df.head()

Unnamed: 0,text,label,clean_text
0,Ukraine revisits state sell-offs Ukraine is p...,business,ukraine revisit state sell ukraine prepare who...
1,"Asia shares defy post-quake gloom Indonesian,...",business,asia share defy post quake gloom indonesian in...
2,Brussels raps mobile call charges The Europea...,business,brussels rap mobile call charge european commi...
3,Low-cost airlines hit Eurotunnel Channel Tunn...,business,low cost airline hit eurotunnel channel tunnel...
4,US Ahold suppliers face charges US prosecutor...,business,ahold supplier face charge prosecutor charge n...


In [31]:
# function for analys topic

def LDA_analys(df, topic, num_topics=5, iterations=20, passes=300):
  # choose all topics or separete
  if topic != 'ALL':
    texts = df[df.label == topic].clean_text.str.split().to_list()
  else:
    texts = df.clean_text.str.split().to_list()

  dictionary = corpora.Dictionary(texts)
  corpus = [dictionary.doc2bow(text) for text in texts]
  transcripts_topics = LdaModel(corpus=corpus,
                           id2word=dictionary,
                           num_topics=num_topics,
                           iterations = iterations, 
                           eta='auto',
                           alpha='auto', ##Learns an asymmetric prior from the corpus;
##1D array of length=number of expected topics that expresses our a-priori belief for the each topics’ probability.
                           passes=passes ##Number of passes through the corpus during training.
                             ) ;
  print(transcripts_topics)
  # print out first 5 topics
  for i, topic in enumerate(transcripts_topics.print_topics(5)):
      print ('%d: %s\n'%(i+1, topic))
  return transcripts_topics, corpus, dictionary


In [32]:
# politics
topic, corpus, dictionary = LDA_analys(df, 'politics')
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=7654, num_topics=5, decay=0.5, chunksize=2000)
1: (0, '0.010*"government" + 0.010*"tax" + 0.007*"election" + 0.007*"labour" + 0.006*"people" + 0.006*"plan" + 0.006*"tory" + 0.006*"brown" + 0.006*"year" + 0.006*"party"')

2: (1, '0.010*"party" + 0.008*"people" + 0.007*"ukip" + 0.006*"kilroy" + 0.006*"election" + 0.006*"minister" + 0.006*"silk" + 0.005*"government" + 0.004*"new" + 0.004*"uk"')

3: (2, '0.008*"government" + 0.007*"plan" + 0.007*"people" + 0.006*"year" + 0.005*"new" + 0.005*"make" + 0.005*"tory" + 0.004*"party" + 0.004*"labour" + 0.004*"take"')

4: (3, '0.007*"government" + 0.006*"people" + 0.006*"police" + 0.006*"law" + 0.006*"home" + 0.006*"lord" + 0.006*"minister" + 0.005*"right" + 0.005*"make" + 0.005*"new"')

5: (4, '0.015*"labour" + 0.011*"party" + 0.011*"blair" + 0.010*"election" + 0.008*"minister" + 0.007*"tory" + 0.006*"people" + 0.006*"government" + 0.006*"tell" + 0.005*"campaign"')



  by='saliency', ascending=False).head(R).drop('saliency', 1)


Most popular topics in politics are election in UK's goverment, Tony Blair (labour party), tory party.

In [33]:
# tech
topic, corpus, dictionary = LDA_analys(df, 'tech')
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=8274, num_topics=5, decay=0.5, chunksize=2000)
1: (0, '0.010*"use" + 0.008*"people" + 0.007*"technology" + 0.007*"make" + 0.006*"dvd" + 0.005*"net" + 0.005*"computer" + 0.005*"get" + 0.005*"system" + 0.005*"work"')

2: (1, '0.011*"use" + 0.008*"search" + 0.008*"people" + 0.007*"user" + 0.006*"microsoft" + 0.005*"make" + 0.005*"software" + 0.005*"one" + 0.005*"computer" + 0.005*"new"')

3: (2, '0.008*"use" + 0.007*"people" + 0.006*"mail" + 0.006*"one" + 0.006*"file" + 0.006*"make" + 0.005*"computer" + 0.005*"software" + 0.005*"new" + 0.005*"firm"')

4: (3, '0.031*"game" + 0.007*"play" + 0.006*"people" + 0.006*"make" + 0.006*"year" + 0.006*"new" + 0.005*"get" + 0.005*"use" + 0.005*"time" + 0.005*"one"')

5: (4, '0.014*"mobile" + 0.012*"people" + 0.012*"phone" + 0.010*"use" + 0.008*"technology" + 0.008*"service" + 0.007*"tv" + 0.006*"year" + 0.006*"digital" + 0.006*"broadband"')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  token_table['Freq'] = token_table['Freq'].round()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  token_table['Term'] = vocab[token_table.index.values].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]


MOst popular topics in tech are technology, mobile phone, game (game console).

In [34]:
# sport
topic, corpus, dictionary = LDA_analys(df, 'sport')
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=7745, num_topics=5, decay=0.5, chunksize=2000)
1: (0, '0.009*"year" + 0.008*"test" + 0.008*"take" + 0.007*"drug" + 0.006*"win" + 0.006*"ban" + 0.006*"athens" + 0.005*"sport" + 0.005*"world" + 0.005*"game"')

2: (1, '0.010*"player" + 0.010*"club" + 0.008*"game" + 0.007*"play" + 0.007*"go" + 0.007*"year" + 0.006*"want" + 0.006*"get" + 0.006*"chelsea" + 0.006*"make"')

3: (2, '0.014*"win" + 0.011*"year" + 0.009*"world" + 0.007*"race" + 0.007*"time" + 0.007*"go" + 0.007*"second" + 0.006*"european" + 0.006*"new" + 0.006*"champion"')

4: (3, '0.013*"win" + 0.012*"play" + 0.009*"year" + 0.009*"first" + 0.008*"game" + 0.008*"open" + 0.007*"match" + 0.007*"final" + 0.006*"time" + 0.006*"set"')

5: (4, '0.013*"england" + 0.008*"game" + 0.008*"wale" + 0.008*"ireland" + 0.008*"win" + 0.007*"half" + 0.006*"six" + 0.006*"back" + 0.006*"side" + 0.006*"first"')



Most popular topics in sport are Olimpic game, team (England, Ireland, Scotland), football (chelsea, Arsenal, Liverpool. championship), rugby.

In [35]:
# business
topic, corpus, dictionary = LDA_analys(df, 'business')
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=8051, num_topics=5, decay=0.5, chunksize=2000)
1: (0, '0.008*"year" + 0.008*"company" + 0.006*"firm" + 0.006*"new" + 0.005*"share" + 0.005*"make" + 0.005*"business" + 0.004*"bank" + 0.004*"deal" + 0.004*"country"')

2: (1, '0.014*"year" + 0.010*"rise" + 0.009*"growth" + 0.008*"economy" + 0.008*"bank" + 0.008*"market" + 0.006*"price" + 0.006*"month" + 0.006*"sale" + 0.005*"economic"')

3: (2, '0.013*"year" + 0.007*"dollar" + 0.007*"market" + 0.006*"price" + 0.006*"sale" + 0.005*"company" + 0.005*"profit" + 0.005*"oil" + 0.005*"new" + 0.005*"euro"')

4: (3, '0.011*"company" + 0.009*"firm" + 0.008*"yukos" + 0.006*"year" + 0.005*"oil" + 0.005*"government" + 0.005*"state" + 0.005*"court" + 0.004*"russian" + 0.004*"market"')

5: (4, '0.007*"year" + 0.007*"company" + 0.006*"firm" + 0.006*"market" + 0.006*"share" + 0.005*"make" + 0.005*"country" + 0.004*"india" + 0.004*"bank" + 0.004*"china"')



MOst popular topics in business are Yukos (oil), economic, goverment.

In [36]:
# entertainment
topic, corpus, dictionary = LDA_analys(df, 'entertainment')
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=8611, num_topics=5, decay=0.5, chunksize=2000)
1: (0, '0.021*"film" + 0.010*"year" + 0.006*"award" + 0.006*"make" + 0.006*"show" + 0.006*"star" + 0.005*"win" + 0.005*"one" + 0.005*"new" + 0.005*"best"')

2: (1, '0.010*"show" + 0.010*"year" + 0.009*"best" + 0.007*"music" + 0.006*"film" + 0.006*"award" + 0.006*"song" + 0.005*"one" + 0.005*"star" + 0.005*"new"')

3: (2, '0.015*"film" + 0.009*"year" + 0.007*"award" + 0.006*"star" + 0.006*"best" + 0.006*"show" + 0.005*"win" + 0.005*"make" + 0.005*"new" + 0.005*"include"')

4: (3, '0.011*"music" + 0.009*"year" + 0.007*"award" + 0.007*"one" + 0.007*"best" + 0.007*"album" + 0.006*"show" + 0.006*"film" + 0.005*"band" + 0.005*"win"')

5: (4, '0.017*"film" + 0.013*"best" + 0.010*"award" + 0.010*"year" + 0.008*"star" + 0.008*"win" + 0.006*"take" + 0.006*"oscar" + 0.006*"actor" + 0.005*"make"')



Popular topics are films, music, stars, tv, Oscar

In [37]:
# All topics
topic, corpus, dictionary = LDA_analys(df, 'ALL', num_topics=15, iterations=30, passes=500)
vis_data = gensimvis.prepare(topic, corpus, dictionary)
pyLDAvis.display(vis_data)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logs

LdaModel(num_terms=20755, num_topics=15, decay=0.5, chunksize=2000)
1: (12, '0.014*"car" + 0.010*"beattie" + 0.008*"attack" + 0.008*"last" + 0.008*"year" + 0.007*"win" + 0.007*"robot" + 0.007*"everton" + 0.007*"assault" + 0.006*"add"')

2: (8, '0.012*"ball" + 0.010*"jam" + 0.009*"unite" + 0.008*"chance" + 0.008*"chelsea" + 0.008*"rooney" + 0.007*"get" + 0.007*"duff" + 0.007*"wide" + 0.007*"leave"')

3: (10, '0.011*"year" + 0.009*"drug" + 0.009*"test" + 0.009*"world" + 0.008*"sport" + 0.007*"ban" + 0.007*"athletics" + 0.007*"team" + 0.006*"take" + 0.006*"make"')

4: (6, '0.013*"england" + 0.010*"ireland" + 0.010*"side" + 0.009*"game" + 0.009*"player" + 0.008*"new" + 0.008*"rugby" + 0.008*"win" + 0.007*"try" + 0.006*"zealand"')

5: (7, '0.008*"year" + 0.007*"government" + 0.004*"people" + 0.004*"make" + 0.004*"labour" + 0.004*"new" + 0.004*"plan" + 0.004*"minister" + 0.004*"election" + 0.004*"party"')

