In [4]:
# Libraries
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import re
from io import StringIO
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import spacy
from keybert import KeyBERT
import pickle
import seaborn as sns

In [5]:
# Fetch the NLTK resources this notebook uses: the English stopword list
# (read through nltk.corpus.stopwords) and the 'punkt' tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource — confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Tokens to strip from every document. 'nan' is the string pandas yields for an
# empty cell that has been cast to str.
custom_words_to_remove = ['nan']


def preprocess_text(text):
    """Tokenize ``text`` and drop every token listed in ``custom_words_to_remove``.

    Only the custom words are removed (exact, case-sensitive match); general
    English stopwords are NOT filtered here. The remaining tokens are joined
    back with single spaces, so punctuation comes out as separate,
    space-delimited tokens.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in custom_words_to_remove:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [7]:
# Preprocess the Dataset

# Path to the tab-delimited export.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
file_path = r'C:\Users\alber\Desktop\BERTopic\MyData-M6 N2780.txt'

# Read only the columns that are needed: 'PY' (year), 'TI' (title), 'AB' (abstract)
df = pd.read_csv(file_path, sep='\t', usecols=['PY', 'TI', 'AB'])

# Convert 'PY' to datetime, assuming 'PY' is the year; unparseable values become NaT.
# NOTE(review): if 'PY' ever contains empty cells pandas reads the column as float —
# confirm the years still match format='%Y' in that case.
df['timestamp'] = pd.to_datetime(df['PY'], format='%Y', errors='coerce')

# Drop rows without a usable timestamp BEFORE tokenizing, so no work is spent on
# documents that are discarded anyway. The explicit copy keeps the column
# assignments below from targeting a view of the unfiltered frame.
df = df.dropna(subset=['timestamp']).copy()

# Combine title and abstract into one text column. Missing cells are replaced by an
# empty string rather than being cast to the literal text "nan".
df['text'] = df['TI'].fillna('').astype(str) + " " + df['AB'].fillna('').astype(str)

# Apply text preprocessing
df['clean_text'] = df['text'].apply(preprocess_text)

# Timestamps aligned row-for-row with df['clean_text'], for topics-over-time modeling
timestamps = df['timestamp'].tolist()

In [8]:
# Spot-check the first few cleaned documents
print(df['clean_text'].head())

0    No Uber , No Tourists ? Public Attitudes towar...
1    Promises and paradoxes of the sharing economy ...
2    Mapping the Sharing Economy in China Harnessin...
3    Sharing economy as a field : Revisiting debate...
4    Sharing Economy : For or against Sustainable D...
Name: clean_text, dtype: object


In [9]:
# Report how many documents are left in the 'clean_text' column
print(f"Total number of rows in 'clean_text': {len(df['clean_text'])}")

Total number of rows in 'clean_text': 2780


In [10]:
# Encode every cleaned document once, up front, so the vectors can be passed to
# BERTopic's fit_transform instead of being computed during fitting.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = embedding_model.encode(
    df['clean_text'].tolist(),
    show_progress_bar=True,
)

Batches: 100%|██████████| 87/87 [04:17<00:00,  2.96s/it]


In [11]:
# Dimensionality reduction applied to the embeddings before clustering:
# cosine metric, 10 output components, and a fixed random_state so repeated runs
# use the same seed.
umap_model = UMAP(
    n_neighbors=20,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [12]:
# Density-based clustering of the reduced embeddings.
# prediction_data=True is kept on so the fitted clusterer retains the extra
# data HDBSCAN generates for predicting on new points.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    gen_min_span_tree=True,
    prediction_data=True,
)

In [13]:
# Stop words for the topic representations: NLTK's English list plus a few
# web/boilerplate tokens. The list is stored under its own name so that the
# imported `stopwords` corpus reader is not rebound to a list — rebinding it makes
# this cell raise AttributeError ('list' object has no attribute 'words') the
# second time it is run.
stop_words = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com', 'thereof']

# Count unigrams and bigrams, ignoring the stop words above
vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=1, ngram_range=(1, 2))

In [14]:
# c-TF-IDF weighting model for the topic terms, with the reduce_frequent_words option enabled
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [15]:
# Alternative topic representations. Each model below is registered under its own
# key in `representation_model` at the end of this cell.

# KeyBERT-inspired keyword representation
keybert_model = KeyBERTInspired()

# Part-of-Speech based representation, using the spaCy pipeline "en_core_web_sm"
pos_model = PartOfSpeech("en_core_web_sm")

# Maximal Marginal Relevance representation (diversity=0.3)
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Prompt for the optional GPT-3.5 labelling model. It is only referenced by the
# commented-out OpenAI lines below, so it is currently unused.
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
# Disabled OpenAI representation. Enabling it also requires `import openai`, which
# the import cell does not contain; load the API key from the environment rather
# than hardcoding it here.
#client = openai.OpenAI(api_key="sk-...")
#openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models, keyed by the name under which each is registered
representation_model = {
    "KeyBERT": keybert_model,
    # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model,
    "POS": pos_model
}

In [16]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=5,   # reduce the discovered topics down to 5 after clustering
    verbose=True,
)

# Fit on the cleaned documents, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(
    df['clean_text'].tolist(),
    embeddings=embeddings,
)

2024-07-25 17:28:12,909 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-25 17:28:33,498 - BERTopic - Dimensionality - Completed ✓
2024-07-25 17:28:33,500 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-25 17:28:33,605 - BERTopic - Cluster - Completed ✓
2024-07-25 17:28:33,606 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-25 17:28:40,990 - BERTopic - Representation - Completed ✓
2024-07-25 17:28:40,993 - BERTopic - Topic reduction - Reducing number of topics
2024-07-25 17:28:46,403 - BERTopic - Topic reduction - Reduced number of topics from 7 to 5


In [17]:
# Calculate the topics over time (the plot itself is produced in a later cell).
# The documents are passed as a plain list — the same form given to fit_transform —
# rather than as a pandas Series carrying its own index.
topics_over_time = topic_model.topics_over_time(
    df['clean_text'].to_list(),
    timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)

14it [00:09,  1.47it/s]


In [18]:
# Bar chart of the highest-scoring terms per topic (at most the 10 largest topics)
topic_model.visualize_barchart(top_n_topics=10)

In [20]:
# Term-rank plot: term scores by rank, for every topic
topic_model.visualize_term_rank()

In [21]:
# Interactive overview of the fitted topics
topic_model.visualize_topics()

In [26]:
# Topic similarity heatmap.
# NOTE(review): custom_labels=True relies on labels registered through
# topic_model.set_topic_labels(...), which this notebook only calls in its last
# cell — run that cell before this one.
topic_model.visualize_heatmap(n_clusters=3, custom_labels=True)

In [27]:
# Visualize the topics over time, from the table computed by topics_over_time().
# NOTE(review): custom_labels=True relies on labels registered through
# topic_model.set_topic_labels(...), which this notebook only calls in its last
# cell — run that cell before this one.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, custom_labels=True)

In [119]:
# Save the topics_over_time DataFrame as a CSV file in the current working
# directory (the row index is not written)
topics_over_time.to_csv("topics_over_time.csv", index=False)

In [28]:
# Preview of topics: one row per topic, including its document count and each
# registered representation
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,459,-1_economy_sharing_sharing economy_research,-1_economy_sharing_sharing economy_research,"[economy, sharing, sharing economy, research, ...","[entrepreneurship, sharing economy, business, ...","[economy, sharing, sharing economy, research, ...","[economy, sharing, research, digital, gig, pla...",[Analysis of the evolution of the sharing econ...
1,0,1201,0_airbnb_peer_accommodation_study,Topic 1 - Peer to Peer Accomodation,"[airbnb, peer, accommodation, study, sharing, ...","[peer accommodation, p2p accommodation, touris...","[airbnb, peer, accommodation, study, sharing, ...","[peer, accommodation, study, sharing, tourism,...",[Creating customer value in the sharing econom...
2,1,523,1_ride_sharing_car_mobility,Topic 2 - Mobility & Vehicle Sharing,"[ride, sharing, car, mobility, hailing, ride h...","[car sharing, carsharing, ride sharing, ridesh...","[ride, sharing, car, mobility, hailing, ride h...","[ride, sharing, car, mobility, hailing, servic...",[Understanding the intention to use bike-shari...
3,2,516,2_economy_sharing_sharing economy_business,Topic 3 - Sharing Economy - General,"[economy, sharing, sharing economy, business, ...","[sharing economy, circular economy, sustainabl...","[economy, sharing, sharing economy, business, ...","[economy, sharing, business, development, sust...",[Sharing Model in Circular Economy towards Rat...
4,3,81,3_fashion_luxury_renting_clothing,Topic 4 - Fashion and Luxury Renting,"[fashion, luxury, renting, clothing, consumpti...","[fashion consumption, fashion renting, online ...","[fashion, luxury, renting, clothing, consumpti...","[fashion, luxury, renting, clothing, consumpti...",[Investigating consumer attitudes and intentio...


In [48]:
# Export the full c-TF-IDF matrix: one column per vocabulary term of the fitted
# vectorizer. The row index is not written to the file.
c_tf_idf = topic_model.c_tf_idf_
c_tf_idf_df = pd.DataFrame(
    c_tf_idf.toarray(),  # densify the sparse matrix for the DataFrame
    columns=topic_model.vectorizer_model.get_feature_names_out(),
)
c_tf_idf_df.to_csv('wos_c_tf_idf_scores.csv', index=False)

# Export the top words of every topic in long form: one (Topic, Word, Score) row
# per word. get_topics() maps each topic id to a list of (word, score) pairs.
topic_words = topic_model.get_topics()
all_topic_words = [
    (topic_num, word, score)
    for topic_num, word_list in topic_words.items()
    for word, score in word_list
]
df_topic_words = pd.DataFrame(all_topic_words, columns=['Topic', 'Word', 'Score'])
df_topic_words.to_csv('wos_topic_words_scores.csv', index=False)

In [25]:
# Export the topic-term (c-TF-IDF) matrix with the vocabulary as column headers.
# The result gets its own name: assigning it to `df` would replace the dataset
# frame, and every cell that reads df['clean_text'] would then fail when re-run.
topic_term_matrix = topic_model.c_tf_idf_
words = topic_model.vectorizer_model.get_feature_names_out()
topic_term_df = pd.DataFrame(topic_term_matrix.toarray(), columns=words)  # toarray() densifies the sparse matrix
# Save to CSV file for easy viewing and sharing (row index is written as the first column)
topic_term_df.to_csv('trial_topic_term_matrix.csv', index=True)

In [29]:
# Visualize the topic hierarchy.
# NOTE(review): custom_labels=True relies on labels registered through
# topic_model.set_topic_labels(...), which this notebook only calls in its last
# cell — run that cell before this one.
topic_model.visualize_hierarchy(top_n_topics=10, custom_labels=True)  # Adjust `top_n_topics` as needed

In [25]:
# Human-readable labels keyed by topic id (0-3). The outlier topic -1 is not
# given a label here.
new_topic_names = {
    0: "Topic 1 - Peer to Peer Accomodation",
    1: "Topic 2 - Mobility & Vehicle Sharing",
    2: "Topic 3 - Sharing Economy - General",
    3: "Topic 4 - Fashion and Luxury Renting",
}

# Register the labels on the fitted model.
# NOTE(review): the visualization cells above pass custom_labels=True, so this
# cell has to run before them; in a top-to-bottom run it currently executes last.
topic_model.set_topic_labels(new_topic_names)