### Topic modeling extended (manual grouping of topics)

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

In [82]:
news = pd.read_csv('results/news.csv')
news_text = news.text

In [100]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

stop_words = set(stopwords.words('english'))

exclude_words = ['donald', 'trump', 'joe', 'biden', 'president', 'ad']

def preprocess(txt):
    
    proc_sent = []
    for line in txt:
        word_tokens = word_tokenize(line)
        sent = [w for w in word_tokens if (not w.lower() in stop_words and not w.lower() in exclude_words)]
        proc_sent.append(" ".join(sent))
    
    return proc_sent

In [101]:
news['processed_text'] = preprocess(news_text)

In [20]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
seed_topic_list = [['drug', 'outbreak', 'flu', 'infection', 'contagious', 'treatment', 'prescription', 'covid', 'test', 'virus', 'ventilator', 'deaths', 'cases', 'pandemic', 
                    'epidemic', 'corona', 'coronavirus', 'covid19', 'patients', 'symptom'],
                   ['lockdown', 'shutdown', 'mask', 'distancing', 'masks'],
                   ['vaccine', 'vaccination', 'cure', 'moderna', 'pfizer', 'astrazeneca', 'j&j', 'dose'],
                   ['education', 'school', 'student', 'teacher', 'children', 'homeschool', 'schools', 'students', 'teachers'],
                   ['economy', 'industry', 'business', 'financial', 'finance', 'fiscal', 'economic', 'job', 'jobless', 'investing', 'investor', 'billion', 'gdp', 'debt', 
                    'liquidity', 'inflation', 'stimulus', 'bill', 'stocks', 'market', 'employment', 'unemployment', 'checks', 'cheques', 'recession', 'bull', 'bullish', 'bear',
                    'bearish', 'dow', 's&p', 'nasdaq', 'trade', 'trading', 'tax', 'loan', 'labor', 'buyback', 'selloff', 'wealth', 'wealthy', 'billionare', 'millionare'],
                   ['earth', 'green', 'pollution', 'ozone', 'deforestation', 'greenhouse', 'wildfire', 'climate', 'warming', 'temperature', 'flood', 'drought', 'glacier', 'environment', 'environmental', 'carbon', 'emission', 'gas', 'fracking'],
                   ['capitol', 'riot', 'siege', 'rioter', 'mob'],
                   ['voter', 'absentee', 'ballot', 'fraud', 'mailin', 'stolen', 'voting', 'election', 'black voters'],
                   ['immigration', 'immigrant', 'refugee', 'border', 'wall', 'migration', 'h1b', 'visa'],
                   ['blm', 'floyd', 'police', 'brutality', 'defund', 'protest', 'protesters', 'officer', 'black lives matter', 'injustice', 'racism', 'racial', 'supremacist'],
                   ['abortion', 'wade', 'roe'],
                   ['supreme', 'court', 'coney', 'barret', 'packing', 'justice', 'judge'],
                   ['security', 'military', 'weapons', 'attack', 'defense', 'gun', 'shooting', 'pentagon'],
                   ['international', 'country', 'global', 'china', 'chinese', 'beijing', 'shanghai', 'iran', 'irani', 'iranian', 'tehran', 'afghanistan', 'afghan', 'afghani', 'afghanistani', 
                    'kabul', 'russia', 'russian', 'moscow', 'britain', 'british', 'brit', 'brexit', 'london', 'Korea', 'Korean', 'kim', 'venezuelan', 'venezuela', 'syrian', 'syria'
                    'world', 'worldwide']]

In [167]:
my_stopwords = frozenset(list(["rt","RT", "&", "amp", "&amp", "http","https", "http://", "https://", "fav", "FAV"]))
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words = my_stopwords, min_df=5)

# do the BERT topic modelling
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

min_clusters = round(len(headlines) * 0.0017)
hdbscan_model = HDBSCAN(min_cluster_size= min_clusters, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=5)

# sentence_model = SentenceTransformer("all-mpnet-base-v2")
# embeddings = sentence_model.encode(news_list)

#run the model
topic_model = BERTopic(seed_topic_list = seed_topic_list, nr_topics = 'auto', umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, low_memory=True, calculate_probabilities=True)

topics, probs = topic_model.fit_transform(news_text)

In [2]:
from bertopic import BERTopic
import pickle
#Clear cache or restart kernel
my_model = BERTopic.load("../../bertopic/topic_model")

# topic_model.save("bertopic/topic_model_news_tweets")
# pickle.dump(topics, open( "bertopic/topics_news_tweets.pickle", "wb" ) )
# # assert topics == new_topics

In [3]:
my_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,7930,-1_said_election_coronavirus_people
1,0,2785,0_election_ballots_votes_voting
2,1,2219,1_twitter_facebook_social media_media
3,2,1894,2_coronavirus_virus_cases_pandemic
4,3,1577,3_capitol_impeachment_inauguration_house
...,...,...,...
74,73,98,73_latino_latinos_florida_voters
75,74,96,74_qanon_conspiracy_theory_greene
76,75,78,75_trade_manufacturing_jobs_workers
77,76,78,76_says_pm_sessions_2020


In [190]:
# import pickle

# topic_model.save("bertopic/topic_model")
# pickle.dump(topics, open( "bertopic/topics.pickle", "wb" ) )
# # assert topics == new_topics

  self._set_arrayXarray(i, j, x)


#### Merging similar subtopics and grouping based on similarity after manual inspection

In [380]:
topic_list = {'election': {'fraud': [0], 'polls': [22, 25], 'black voters': [33], 'georgia_runoff': [31], 'general': [23]},
             'social_media': {'general': [1], 'tiktok': [45]},
             'covid': {'general': [2], 'cases': [6], 'precaution': [15], 'vaccine': [21], 'person': [37], 'drugs': [39], 'testing': [51], 'schools': [59, 63], 'equipments': [72]},
             'capitol': {'general': [3]},
             'floyd': {'general': [4]},
             'court': {'general': [5]},
             'immigration': {'general': [8]},
              'hunter_biden': {'general': [10]},
              'pres_debate': {'general': [11]},
              'security': {'general': [12, 52]},
              'climate': {'general': [14]},
              'economy': {'general': [75], 'stimulus': [16], 'tax': [30], 'market': [42]},
              'internatinal': {'general': [77], 'china': [7], 'russia': [17], 'britain': [29], 'israel': [47], 'iran': [57]},
              'democrats': {'kamala': [9], 'sanders': [26], 'obama': [34], 'cuomo': [46], 'pelosi': [49]}, 
              'republican': {'convention': [13], 'first_lady': [35], 'pence': [61]},
              'abortion': {'general': [41]},
              'healthcare': {'general' : [50]},
              'proud boys': {'general': [71], 'qanon': [74]},
              'other': {'general': [18, 19, 20, 24, 27, 28, 32, 36, 38, 40, 43, 44, 48, 53, 54, 55, 56, 58, 60, 62, 64, 65, 66, 67, 68, 69, 70, 73, 76]},
             }

In [381]:
### topic Ids
topic_cat_dic = dict([(j, i) for i, j in zip([key for key in topic_list], [x for x in range(len(topic_list))])])

### topic to cluster id mapping
topic_cls_map = dict()
for key in topic_list:
    cls_ids = []
    for k in topic_list[key]:
        cls_ids.extend(topic_list[key][k])
    topic_cls_map[key] =  cls_ids
    
cls_to_fine_top = dict()
for top in topic_list:
    for t in topic_list[top]:
        for k in topic_list[top][t]:
            cls_to_fine_top[k] = top + "-" + t
            
            
tops, fine_top = [], []
for t in topics:
    
    if(t == -1):
        tops.append('none')
        fine_top.append('none')
        continue
    for k in topic_cls_map:
        if(t in topic_cls_map[k]):
            tops.append(k)
    fine_top.append(cls_to_fine_top[t])

In [382]:
len(tops), len(topics), len(fine_top)

(43816, 43816, 43816)

In [383]:
news['topic_ids'] = topics
news['topics'] = tops
news['subtopic'] = fine_top

In [387]:
news.topics.value_counts()

none            7930
other           7457
covid           5265
election        4986
social_media    2457
internatinal    2392
democrats       2039
capitol         1577
floyd           1386
economy         1305
court           1277
republican      1235
security         909
immigration      817
hunter_biden     737
pres_debate      710
climate          655
abortion         255
healthcare       215
proud boys       212
Name: topics, dtype: int64

In [384]:
news.to_csv('dataset/news.csv')

In [386]:
# pd.options.display.max_colwidth = 1000
# news[['title', 'publication', 'topics', 'subtopic']].head(100)