# Topic Modeling with BERT - Analyze results
In this notebook we analyze the created topics.

In [1]:
import os
import sys
import re
import glob
import numpy as np
import pandas as pd
import random
import langdetect
from langdetect import detect
from joblib import Parallel, delayed
from datetime import datetime, timedelta

# from bertopic import BERTopic
from bertopic._bertopic import BERTopic

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging

logging.basicConfig(level=logging.INFO)

import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

from _utils import *

import pickle
from pathlib import Path

### Global variables

In [2]:
# All locations are derived from the project root, which is given relative to
# the current working directory.
PROJECT_PATH = Path("../../")
DATA_PATH = PROJECT_PATH.joinpath("data")
RESULTS_PATH = PROJECT_PATH.joinpath("results")

### Load the data
The data has been filtered to keep only the last week's tweets, and Twitter metrics (retweet, like, reply and quote counts) have been appended to the dataset.

In [6]:
# Read the reduced tweet dataset and number the rows 0..n-1, because later
# cells address rows of `df` by position.
df = pd.read_csv(DATA_PATH / "reduced_data.csv").reset_index(drop=True)

print(f"Data size: {df.shape}\n")
df.head()

Data size: (11002, 9)



Unnamed: 0,tweetLink,retweet_count,favorite_count,reply_count,quote_count,text,profileUrl,name,text_lengths
0,https://twitter.com/nigewillson/status/1396723...,10,12,0,0,New Artificial Intelligence System Can Make Yo...,https://twitter.com/nigewillson,Nige Willson,20
1,https://twitter.com/YvesMulkers/status/1396806...,1,1,0,0,Get the insights on #datagovernance #datamesh ...,https://twitter.com/YvesMulkers,Yves Mulkers,26
2,https://twitter.com/YvesMulkers/status/1396807...,1,2,0,0,#AI can accomplish incredible things when lead...,https://twitter.com/YvesMulkers,Yves Mulkers,32
3,https://twitter.com/jennwvaughan/status/139691...,1,35,1,1,"Ok, I literally shrieked with glee when I hear...",https://twitter.com/jennwvaughan,Jenn Wortman Vaughan,27
4,https://twitter.com/nigewillson/status/1396722...,10,6,0,0,Can Artificial Intelligence decode your dreams...,https://twitter.com/nigewillson,Nige Willson,22


## Load embeddings

In [7]:
# Read the stored sentences and their embeddings back from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# an embeddings file that was produced by a trusted run of this project.
embeddings_file = DATA_PATH / "embeddings.pkl"
with embeddings_file.open("rb") as handle:
    stored_data = pickle.load(handle)

docs = stored_data["sentences"]
embeddings = stored_data["embeddings"]

print(f"Number of documents: {len(docs)}\n")
print(f"Embeddings type: {type(embeddings)}\nEmbeddings size: {embeddings.shape}")

Number of documents: 11002

Embeddings type: <class 'numpy.ndarray'>
Embeddings size: (11002, 768)


In [8]:
# Put the cleansed sentences next to the raw tweet text for a visual comparison.
# NOTE(review): assumes `docs` is in the same row order as `df` - confirm.
df = df.assign(text_cleansed=docs)
df.loc[:, ["text", "text_cleansed"]].head(20)

Unnamed: 0,text,text_cleansed
0,New Artificial Intelligence System Can Make Yo...,Artificial Intelligence Favorite Actors Speak ...
1,Get the insights on #datagovernance #datamesh ...,insights #datagovernance Revamp Analytics AI b...
2,#AI can accomplish incredible things when lead...,#AI accomplish incredible things leaders focus...
3,"Ok, I literally shrieked with glee when I hear...",literally shrieked glee heard received dissert...
4,Can Artificial Intelligence decode your dreams...,Artificial Intelligence decode dreams Artifici...
5,RT @nigewillson:New Artificial Intelligence Sy...,Artificial Intelligence Favorite Actors Speak ...
6,RT @nigewillson:Can Artificial Intelligence de...,Artificial Intelligence decode dreams Artifici...
7,Twitter Axing Artificial Intelligence Photo Cr...,Twitter Axing Artificial Intelligence Photo Cr...
8,The Future of Artificial Intelligence: Trends ...,Future Artificial Intelligence Trends Watch #ai
9,Can Your Enterprise Benefit from No-Code Artif...,Enterprise Benefit Code Artificial Intelligenc...


In [9]:
print(f"Num. of documents: {len(docs)}\n\n")

# Print three randomly drawn documents, numbered from 1, as a quick sanity check.
sampled_docs = random.sample(list(docs), 3)
for position, doc in enumerate(sampled_docs, start=1):
    print(f"{position}: '{doc}'\n")

Num. of documents: 11002


1: "Join 2 Jul Revamp Analytics AI based Solutions Barry Devlin #datagovernance"

2: "Company #AI Institutional Review Board"

3: "#dataviz 47 000 debris orbit Source"



# **Create Topics**
We select "english" as the main language for our documents. If you want a multilingual model that supports 50+ languages, select "multilingual" instead.

### Load model

In [10]:
# Reload the previously saved topic model from the results directory.
topic_model = BERTopic.load(RESULTS_PATH / "topic_model")

### Get topics and probabilities from reloaded model

In [11]:
# Assign a topic (and its probabilities) to every document with the reloaded model.
# The embeddings loaded from disk are passed along with the documents -
# presumably so the text is not re-encoded; confirm against the BERTopic version in use.
predictions, probabilities = topic_model.transform(docs, embeddings)

In [12]:
# Distinct topic labels that were assigned to the documents, in sorted order.
np.unique(predictions)

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17

### Topics list

In [13]:
# Topic ids known to the reloaded model (the keys of the dict returned by get_topics()).
topic_model.get_topics().keys()

dict_keys([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 

## Topics importance
We rank the topics according to the number of "relevant" documents for them.

First, we use the built-in function provided in BERTopic.

help(BERTopic)

In [15]:
# First rows of the topic frequency table. In the recorded output the first
# row is topic -1, which later cells skip by slicing the table from row 1 on.
topic_model.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
0,-1,2923
1,186,153
2,96,136
3,112,132
4,22,129
5,329,117
6,249,111
7,107,98
8,85,74
10,263,71


In [16]:
# Last rows of the topic frequency table.
topic_model.get_topic_freq().tail(11)

Unnamed: 0,Topic,Count
313,169,10
312,205,10
311,192,10
310,60,10
309,6,10
308,18,10
307,46,10
306,36,10
305,41,10
304,327,10


## Relevant tweets per topic
We define a tweet as being relevant for a given topic if:
- The tweet is labeled with that topic, and
- That topic has a probability higher than a given threshold for this tweet (here we consider a threshold of 0.25)

Here we randomly display up to M relevant tweets for one of the top-K topics (the one selected by `topic_rank`).

Now, we define **relevant documents given a topic** as being those for which that **topic has a probability above a user-defined threshold**.

In [17]:
top_K = 10  # how many topics to take from the frequency table (after its first row)
M = 10  # maximum number of tweets printed for the selected topic
# The first row of the frequency table is skipped (topic -1 in the recorded run).
topics = list(topic_model.get_topic_freq().Topic)[1 : top_K + 1]
topic_rank = 0  # position inside `topics` of the topic to display

clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
# NOTE(review): `_select_clusters` is an underscore-prefixed (private) method, and
# indexing its result by topic id assumes topic ids line up with the selected
# clusters - confirm against the hdbscan / BERTopic versions in use.
clusters = tree._select_clusters()
cluster = clusters[topics[topic_rank]]
c_exemplars = topic_model.get_most_relevant_documents(cluster, tree)

# The topic header is shown whether or not any exemplar tweet was found.
print(f"Topic {topics[topic_rank]}:\n{topic_model.get_topic(topics[topic_rank])}\n")

if len(c_exemplars) == 0:
    print("No tweet related to current topic with significant probability.")
else:
    # Draw at most M distinct exemplars; the draw is unseeded, so it changes between runs.
    n = min(len(c_exemplars), M)
    samples = np.random.choice(c_exemplars, size=n, replace=False)

    for idx in samples:
        print(f"Tweet ID: {idx}\n{docs[idx]}\nLink: {df.iloc[idx, 0]}\n")
    print(
        "\n**************************************************************************************************************************************\n"
    )

Topic 186:
[('2021', 0.11518344111464827), ('trends', 0.06973633657430857), ('trends 2021', 0.04791319519051584), ('2021 ai', 0.041789472437149734), ('companies', 0.03782389258944172), ('watch 2021', 0.03740412879855314), ('watch', 0.03687575794795414), ('iot', 0.031124820300891674), ('10', 0.029940307679780117), ('fintech', 0.027130500269414805), ('ai', 0.02533194981053806), ('leading tech', 0.024722970994140298), ('choose great', 0.024722970994140298), ('great wine', 0.024722970994140298), ('tech trends', 0.024722970994140298), ('intelligence movies', 0.024588839428707168), ('wine', 0.024252606974425835), ('leading', 0.024165918247990883), ('usa', 0.02414878068708062), ('movies', 0.024140236513879545), ('story', 0.02374750132534524), ('choose', 0.023238947782268815), ('artificial intelligence', 0.02319284370426363), ('intelligence', 0.02276603359764348), ('artificial', 0.02269160719355901), ('big data', 0.022121650224670896), ('list', 0.022062304794297955), ('50', 0.02188370781343243

## Export all results

In [24]:
# Every topic of the frequency table except its first row (topic -1 in the
# recorded run).
topics_ = list(topic_model.get_topic_freq().Topic)[1:]

N_KEYWORDS = 20  # number of keyword columns exported per topic
colnames = ["tweetLink", "text", "topic", "topic_count"]
keys = ["topic_keyword_" + str(i) for i in range(1, N_KEYWORDS + 1)]

clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
# NOTE(review): `_select_clusters` is an underscore-prefixed (private) method, and
# indexing its result by topic id assumes topic ids line up with the selected
# clusters - confirm against the hdbscan / BERTopic versions in use.
clusters = tree._select_clusters()

# The exported tweet columns are addressed by name instead of by column position,
# so a change in the CSV column order cannot silently export the wrong data.
tweet_links = df["tweetLink"]
tweet_texts = df["text"]
predicted_topics = np.asarray(predictions)


def _append_rows(row_positions, topic, topic_count, topic_keywords, columns, keyword_rows):
    """Append one export row per entry of ``row_positions``.

    Parameters
    ----------
    row_positions : iterable of int
        Row positions in ``df`` (0-based) of the tweets to export.
    topic : int
        Topic id written to the ``topic`` column.
    topic_count : int
        Number of documents predicted for ``topic``; written to ``topic_count``.
    topic_keywords : list
        Leading keyword entries of the topic, one per keyword column.
    columns : dict of str -> list
        Per-column buffers keyed by ``colnames``; extended in place.
    keyword_rows : list
        Buffer of keyword rows; extended in place.
    """
    for idx in row_positions:
        columns["tweetLink"].append(tweet_links.iloc[idx])
        columns["text"].append(tweet_texts.iloc[idx])
        columns["topic"].append(topic)
        columns["topic_count"].append(topic_count)
        keyword_rows.append(topic_keywords)


def _to_export_frame(columns, keyword_rows):
    """Build the export table: metadata columns, then keyword columns.

    The table is sorted by ``topic_count`` (largest first) and re-indexed from 0.
    """
    meta_df = pd.DataFrame(columns, columns=colnames)
    # NOTE(review): this expects every topic to provide at least N_KEYWORDS
    # keyword entries - confirm for models trained with fewer words per topic.
    keywords_df = pd.DataFrame(keyword_rows, columns=keys)
    return (
        pd.concat([meta_df, keywords_df], axis=1)
        .sort_values(["topic_count"], ascending=False)
        .reset_index(drop=True)
    )


exemplar_columns = {name: [] for name in colnames}
exemplar_keywords = []
all_columns = {name: [] for name in colnames}
all_keywords = []

for topic in topics_:
    exemplar_positions = topic_model.get_most_relevant_documents(clusters[topic], tree)
    member_positions = (predicted_topics == topic).nonzero()[0]
    topic_count = len(member_positions)
    # Identical for every row of the topic, so it is looked up once per topic.
    topic_keywords = topic_model.get_topic(topic)[:N_KEYWORDS]

    _append_rows(
        exemplar_positions, topic, topic_count, topic_keywords,
        exemplar_columns, exemplar_keywords,
    )
    _append_rows(
        member_positions, topic, topic_count, topic_keywords,
        all_columns, all_keywords,
    )

# `exemplars_df` holds only the exemplar tweets of each topic; `export_df` holds
# every tweet whose predicted topic is one of `topics_`.
exemplars_df = _to_export_frame(exemplar_columns, exemplar_keywords)
export_df = _to_export_frame(all_columns, all_keywords)

In [25]:
# Size and first rows of the exemplar-only table built in the previous cell.
print(f"Output dataframe shape: {exemplars_df.shape}\n")
exemplars_df.head()

Output dataframe shape: (4038, 24)



Unnamed: 0,tweetLink,text,topic,topic_count,topic_keyword_1,topic_keyword_2,topic_keyword_3,topic_keyword_4,topic_keyword_5,topic_keyword_6,topic_keyword_7,topic_keyword_8,topic_keyword_9,topic_keyword_10,topic_keyword_11,topic_keyword_12,topic_keyword_13,topic_keyword_14,topic_keyword_15,topic_keyword_16,topic_keyword_17,topic_keyword_18,topic_keyword_19,topic_keyword_20
0,https://twitter.com/KayFButterfield/status/139...,RT @SethBergeson:📣STARTING NOW! #SmartToyAward...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
1,https://twitter.com/terence_mills/status/13908...,RT @albertogaruccio: How much product room wil...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
2,https://twitter.com/Fisher85M/status/139837012...,RT @ipfconline1:Here is a list of the top paym...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
3,https://twitter.com/alvinfoo/status/1391524882...,RT @alvinfoo: Industry composition of US stock...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
4,https://twitter.com/TamaraMcCleary/status/1394...,#IoT News’ list of innovative companies to wat...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"


In [26]:
# Size and first rows of the table holding every tweet assigned to an exported topic.
print(f"Output dataframe shape: {export_df.shape}\n")
export_df.head()

Output dataframe shape: (8079, 24)



Unnamed: 0,tweetLink,text,topic,topic_count,topic_keyword_1,topic_keyword_2,topic_keyword_3,topic_keyword_4,topic_keyword_5,topic_keyword_6,topic_keyword_7,topic_keyword_8,topic_keyword_9,topic_keyword_10,topic_keyword_11,topic_keyword_12,topic_keyword_13,topic_keyword_14,topic_keyword_15,topic_keyword_16,topic_keyword_17,topic_keyword_18,topic_keyword_19,topic_keyword_20
0,https://twitter.com/YvesMulkers/status/1396799...,"RT @CREWcrew:Mitch McConnell got $250,000 from...",186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
1,https://twitter.com/freddy1876/status/13921717...,PyCon: PyCon 2021 Welcomes 8 Early-Stage Compa...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
2,https://twitter.com/rschmelzer/status/13925902...,RT @cognilytica: Navy will push ahead with Pro...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
3,https://twitter.com/rschmelzer/status/13925387...,RT @cognilytica: Pentagon leaders emphasize ro...,186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
4,https://twitter.com/DeepLearn007/status/139247...,"RT @lyakovet: ""Where is #AI Going in 2022""\n\n...",186,153,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"


In [41]:
def _attach_tweet_metrics(topic_rows):
    """Left-join the per-tweet columns of ``df`` onto ``topic_rows``.

    Rows are matched on ``tweetLink`` and ``text``.

    Raises
    ------
    AssertionError
        If the merge changes the number of rows, which would duplicate tweets
        in the exported files.
    """
    merged = topic_rows.merge(df, how="left", on=["tweetLink", "text"])
    # A left merge never drops rows; it only adds rows when one key pair matches
    # several rows of `df`. Fail loudly instead of exporting duplicated tweets.
    assert len(merged) == len(topic_rows), (
        f"merge changed the row count from {len(topic_rows)} to {len(merged)}: "
        "`df` contains duplicated (tweetLink, text) pairs"
    )
    return merged


def _order_for_export(frame, columns):
    """Keep ``columns`` only, sort by ``topic_count`` (largest first), re-index from 0."""
    return (
        frame[columns]
        .sort_values(["topic_count"], ascending=False)
        .reset_index(drop=True)
    )


results_exemplars_df = _attach_tweet_metrics(exemplars_df)
results_df = _attach_tweet_metrics(export_df)

# Final column order: tweet, engagement metrics, topic, author, then keyword columns.
cols = [
    "tweetLink",
    "text",
    "retweet_count",
    "favorite_count",
    "reply_count",
    "quote_count",
    "topic",
    "topic_count",
    "profileUrl",
    "name",
]
cols = cols + [c for c in results_df.columns if "keyword" in c]

results_exemplars_df = _order_for_export(results_exemplars_df, cols)
results_df = _order_for_export(results_df, cols)

In [43]:
# Size and first rows of the exemplar-only table after the tweet metrics were merged in.
print(f"Output dataframe shape: {results_exemplars_df.shape}\n")
results_exemplars_df.head()

Output dataframe shape: (4038, 30)



Unnamed: 0,tweetLink,text,retweet_count,favorite_count,reply_count,quote_count,topic,topic_count,profileUrl,name,topic_keyword_1,topic_keyword_2,topic_keyword_3,topic_keyword_4,topic_keyword_5,topic_keyword_6,topic_keyword_7,topic_keyword_8,topic_keyword_9,topic_keyword_10,topic_keyword_11,topic_keyword_12,topic_keyword_13,topic_keyword_14,topic_keyword_15,topic_keyword_16,topic_keyword_17,topic_keyword_18,topic_keyword_19,topic_keyword_20
0,https://twitter.com/KayFButterfield/status/139...,RT @SethBergeson:📣STARTING NOW! #SmartToyAward...,2,0,0,0,186,153,https://twitter.com/KayFButterfield,KayFirth-Butterfield,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
1,https://twitter.com/pierrepinna/status/1395448...,RT @DeepLearn007:Leading Tech Trends 2021\n\nh...,126,0,0,0,186,153,https://twitter.com/pierrepinna,Pinna Pierre,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
2,https://twitter.com/terence_mills/status/13904...,RT @stratorob: Fintech: A Major Driving Force ...,14,0,0,0,186,153,https://twitter.com/terence_mills,Terence Mills 特伦斯米尔斯,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
3,https://twitter.com/DeepLearn007/status/139688...,RT @DeepLearn007:Leading Tech Trends 2021\n\nh...,126,0,0,0,186,153,https://twitter.com/DeepLearn007,AI,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
4,https://twitter.com/data_nerd/status/139585064...,Top Companies 2021: The 50 best workplaces to ...,0,0,0,0,186,153,https://twitter.com/data_nerd,Carla Gentry,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"


In [44]:
# Size and first rows of the full table after the tweet metrics were merged in.
print(f"Output dataframe shape: {results_df.shape}\n")
results_df.head()

Output dataframe shape: (8079, 30)



Unnamed: 0,tweetLink,text,retweet_count,favorite_count,reply_count,quote_count,topic,topic_count,profileUrl,name,topic_keyword_1,topic_keyword_2,topic_keyword_3,topic_keyword_4,topic_keyword_5,topic_keyword_6,topic_keyword_7,topic_keyword_8,topic_keyword_9,topic_keyword_10,topic_keyword_11,topic_keyword_12,topic_keyword_13,topic_keyword_14,topic_keyword_15,topic_keyword_16,topic_keyword_17,topic_keyword_18,topic_keyword_19,topic_keyword_20
0,https://twitter.com/YvesMulkers/status/1396799...,"RT @CREWcrew:Mitch McConnell got $250,000 from...",1010,0,0,0,186,153,https://twitter.com/YvesMulkers,Yves Mulkers,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
1,https://twitter.com/data_nerd/status/139693655...,Madrona Venture Group’s Steve Singh on key tre...,0,0,0,0,186,153,https://twitter.com/data_nerd,Carla Gentry,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
2,https://twitter.com/TamaraMcCleary/status/1395...,2021 Internet Of Things 50: The Bright Lights ...,10,11,0,0,186,153,https://twitter.com/TamaraMcCleary,Tamara McCleary,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
3,https://twitter.com/data_nerd/status/139585064...,Top Companies 2021: The 50 best workplaces to ...,0,0,0,0,186,153,https://twitter.com/data_nerd,Carla Gentry,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"
4,https://twitter.com/Ronald_vanLoon/status/1396...,Whoever Leads In #ArtificialIntelligence In 20...,7,5,0,1,186,153,https://twitter.com/Ronald_vanLoon,Ronald van Loon,"(2021, 0.11518344111464827)","(trends, 0.06973633657430857)","(trends 2021, 0.04791319519051584)","(2021 ai, 0.041789472437149734)","(companies, 0.03782389258944172)","(watch 2021, 0.03740412879855314)","(watch, 0.03687575794795414)","(iot, 0.031124820300891674)","(10, 0.029940307679780117)","(fintech, 0.027130500269414805)","(ai, 0.02533194981053806)","(leading tech, 0.024722970994140298)","(choose great, 0.024722970994140298)","(great wine, 0.024722970994140298)","(tech trends, 0.024722970994140298)","(intelligence movies, 0.024588839428707168)","(wine, 0.024252606974425835)","(leading, 0.024165918247990883)","(usa, 0.02414878068708062)","(movies, 0.024140236513879545)"


In [45]:
# Number of exemplar rows exported per topic.
results_exemplars_df.topic.value_counts()

186    42
85     35
96     34
112    32
128    27
       ..
218    10
216    10
214    10
127    10
335    10
Name: topic, Length: 336, dtype: int64

In [46]:
# Number of rows exported per topic in the full table.
results_df.topic.value_counts()

186    153
96     136
112    132
22     129
329    117
      ... 
292     10
74      10
15      10
184     10
304     10
Name: topic, Length: 336, dtype: int64

### Save the results file

In [47]:
# Write the exemplar-only table to the results directory, leaving out the row index.
exemplars_csv = RESULTS_PATH / "Topic Extraction - exemplars only.csv"
results_exemplars_df.to_csv(exemplars_csv, index=False)

In [48]:
# Write the full table to the results directory, leaving out the row index.
all_results_csv = RESULTS_PATH / "Topic Extraction - all.csv"
results_df.to_csv(all_results_csv, index=False)