# Strandbox

## Installation

Before installing any packages, enable the Colab GPU via: Runtime > Change runtime type > T4 GPU

In [None]:
%%capture
# %%capture silences the pip output of this cell.
# BERTopic is installed from the tip of its master branch (not a pinned release).
!pip install git+https://github.com/MaartenGr/BERTopic.git@master
# GPU libraries (CUDA 11 builds) resolved through NVIDIA's extra package index.
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
# NOTE(review): the find-links URL below is the aarch64 wheel index — confirm it matches the runtime's CPU architecture.
!pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64
# safetensors is the serialization format used when the topic model is saved;
# adjustText is used by the advanced visualization further down.
!pip install safetensors
!pip install datasets
!pip install datashader
!pip install adjustText

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

## Convert Text into Bert Chunks

Most transformer-based models accept at most 512 tokens per input, so we split each document into chunks that stay below 512 tokens.

### Setup

In [None]:
import pandas as pd
import json
from tqdm import tqdm

def create_chunks(all_text, max_len=512, tokenize_fn=None):
  """Greedily pack consecutive text pieces into chunks below a token budget.

  Pieces are concatenated in their original order (no separator is inserted)
  for as long as the running token count stays strictly below ``max_len``.
  When the next piece would reach the budget, the current chunk is closed and
  the piece starts a new one. A single piece that is longer than the budget
  is not split; it becomes a chunk of its own.

  Args:
    all_text: Iterable of strings (e.g. sentences) in document order.
    max_len: Token budget per chunk (defaults to 512).
    tokenize_fn: Callable that takes a string and returns a mapping with an
      ``'input_ids'`` sequence. Defaults to the notebook-level ``tokenizer``.

  Returns:
    List of non-empty chunk strings, including the final partially filled one.
  """
  if tokenize_fn is None:
    # Resolved at call time, so the tokenizer cell only has to run before the
    # first call, not before this definition.
    tokenize_fn = tokenizer
  # NOTE(review): confirm that the embedding model used later accepts inputs of
  # max_len tokens; longer inputs may be truncated when they are encoded.

  all_chunks = []
  curr_chunk = ''
  curr_len = 0
  for text in tqdm(all_text):
    n_tokens = len(tokenize_fn(text)['input_ids'])
    if curr_len + n_tokens < max_len:
      curr_chunk += text
      curr_len += n_tokens
    else:
      # Close the running chunk; skip it when nothing has been collected yet
      # (happens when the very first piece is already over the budget).
      if curr_chunk:
        all_chunks.append(curr_chunk)
      curr_chunk = text
      curr_len = n_tokens
  # Flush the last chunk; without this the tail of every document is dropped.
  if curr_chunk:
    all_chunks.append(curr_chunk)
  return all_chunks




In [None]:
import os

from transformers import AutoTokenizer


# Tokenizer of the sentence-embedding model; create_chunks uses it to count
# the tokens of each text piece.
# NOTE(review): truncation / padding / max_length / return_tensors are usually
# per-call options — confirm they have any effect when given to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', truncation=True,
                                          padding=True,max_length=512,return_tensors='pt')

### Processing

Here we process the text extracted from scientific articles. The input is a JSON file prepared by the pipeline introduced in [ArenaBox](https://github.com/arenabox/ArenaBox). We use the JSON file of the EIST journal as the running example.

Sample format of json file looks as follows:


```
"1": {
        "file_name": "-It-s-not-talked-about---The-risk-of-failure-_2020_Environmental-Innovation-",
        "doi": "10.1016/j.eist.2020.02.008",
        "title": "\u201cIt's not talked about\u201d: The risk of failure in practice in sustainability experiments",
        "abstract": "Scholars of sustainability transition have given much attention to local experiments in .....",
        "text": {
            "Introduction":...,
            ..
            "Conclusion": ...,
        },
        "location": "UK",
        "raw_text": " A transition away from the use of fossil fuels ...",
        "year": "2020"
    },
    "2": {
      ....
    }

```



In [None]:
import json

# Folder that holds the EIST corpus; the files created below are written
# next to the input file in this same folder.
base_path = 'PATH/TO/EIST/FOLDER'

# Read the corpus as UTF-8 regardless of the platform default. The with-block
# closes the file on exit, so no explicit close() is needed.
with open(f'{base_path}/EIST_PDFS_TM.json', 'r', encoding='utf-8') as f:
  eist = json.load(f)

In [None]:
# Map the prefixed document id ('eist-<key>') to the article title.
# A comprehension keeps the loop variables out of the notebook namespace;
# the original loop rebound the built-in `id` for every later cell.
id2title = {f'eist-{doc_id}': doc['title'] for doc_id, doc in eist.items()}

In [None]:
# Split every article on '.' into sentence-like pieces, pack the pieces into
# token-bounded chunks and store them on the article under 'raw_text_chunks'.
for doc in eist.values():
  sentences = doc['raw_text'].split('.')
  doc['raw_text_chunks'] = create_chunks(sentences)

In [None]:
# Reverse lookup from chunk text to its prefixed document id. Identical chunk
# strings collapse into one key and keep the id of the last document seen.
text2id = {
    chunk: f'eist-{doc_key}'
    for doc_key, doc in eist.items()
    for chunk in doc['raw_text_chunks']
}

In [None]:
# Unique chunk texts in insertion order; this list is the corpus used below.
data = list(text2id)

In [None]:
# Persist the list of chunk texts as {"text": [...]} in the corpus folder.
chunks_path = f'{base_path}/only_text.json'
with open(chunks_path, 'w+') as f:
  f.write(json.dumps({'text': data}, indent=4))

## Data Preparation

Here we convert the chunked data into embeddings for the topic model. Several files are created along the way; instead of recomputing them every time we run topic modelling, we save them once and reload them whenever they are needed.

### Setup

In [None]:
from sentence_transformers import SentenceTransformer

def get_embeddings(embedding_model, data):
  """Encode ``data`` with ``embedding_model`` and return what the model yields.

  The work is delegated to ``embedding_model.encode`` with its progress bar
  switched on.
  """
  return embedding_model.encode(data, show_progress_bar=True)

### Create and Save Embeddings files

In [None]:
import collections
from tqdm import tqdm
import pickle
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic.
# The word splitter gets its own name so that it does not overwrite the
# notebook-level `tokenizer` that create_chunks relies on for token counts.
word_counts = collections.Counter()
word_tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(data):
  word_counts.update(word_tokenizer(doc))

# Keep only the words that occur at least 15 times across all chunks.
vocab = [word for word, frequency in word_counts.items() if frequency >= 15]
print(f'Vocabulary size: {len(vocab)}')

# Despite the .txt suffix this file is a pickle; it is read back with pickle.load.
with open(f'{base_path}/vocab.txt', 'wb') as fp:
    pickle.dump(vocab, fp)

In [None]:
import numpy as np

# Encode every chunk with the sentence-embedding model and cache the result
# on disk so that it can be loaded instead of recomputed.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = get_embeddings(embedding_model, data)

embeddings_file = f'{base_path}/embeddings.npy'
np.save(embeddings_file, embeddings)

In [None]:
from cuml.manifold import UMAP

# Fit a 5-component cosine UMAP on the embeddings and cache the projection;
# the topic-modelling class loads this file and clusters on it.
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    random_state=42,
    metric="cosine",
    verbose=True,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

np.save(f'{base_path}/umap_embeddings.npy', reduced_embeddings)

In [None]:
from cuml.manifold import UMAP

# Fit a second, 2-component cosine UMAP on the same embeddings and cache it;
# this projection supplies the x/y coordinates of the document scatter plot.
umap_model = UMAP(
    n_components=2,
    n_neighbors=15,
    random_state=42,
    metric="cosine",
    verbose=True,
)
reduced_embeddings_2d = umap_model.fit_transform(embeddings)

np.save(f'{base_path}/umap_2d_embeddings.npy', reduced_embeddings_2d)

## Topic Modelling

### Setup

In [None]:
import pandas as pd
import json
from tqdm import tqdm


def get_data(data_path, typ='json'):
  """Load text chunks from a plain-text or JSON file.

  Args:
    data_path: Path of the file to read (decoded as UTF-8).
    typ: ``'txt'`` reads the file as plain text; any other value reads JSON.

  Returns:
    A list of chunk strings.
    * txt: newlines are removed, the text is split on '.' and packed with
      ``create_chunks``.
    * JSON with a top-level ``'text'`` key: that value is returned unchanged
      (this is the layout of the ``only_text.json`` file written earlier).
    * any other JSON object: the ``'text'`` entry of every value is collected,
      joined with spaces, split on '.' and packed with ``create_chunks``.
  """
  if typ == 'txt':
    with open(data_path, 'r', encoding="utf-8") as f:
      data = f.read().replace('\n', '').rstrip()
    return create_chunks(data.split('.'))

  # Same explicit encoding as the txt branch; the with-block closes the file.
  with open(data_path, 'r', encoding="utf-8") as f:
    data = json.load(f)

  if 'text' in data:
    return data['text']

  all_text = []
  for user_data in data.values():
    text = user_data['text']
    if isinstance(text, str):
      # extend() on a plain string would add it character by character.
      all_text.append(text)
    else:
      # NOTE(review): presumably a list of strings — confirm against the
      # producer of this JSON layout.
      all_text.extend(text)
  return create_chunks(' '.join(all_text).split('.'))

In [None]:
import itertools
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
import textwrap

def advanced_visualization(topic_model, reduced_embeddings, data, max_topic=50):
  """Scatter-plot documents in 2-D, coloured by topic, with labels at the topic centroids.

  Args:
    topic_model: Fitted model exposing ``topics_`` (one topic id per document)
      and, optionally, ``custom_labels_``.
    reduced_embeddings: Array read as ``[:, 0]`` (x) and ``[:, 1]`` (y), one
      row per document.
    data: The documents, in the same order; their character length sets the
      marker size.
    max_topic: Highest topic id that receives a text label (default 50).

  Outlier documents (topic -1) and points outside the (-10, 10) window on
  either axis are left out of the plot. The figure is shown, nothing is returned.
  """
  # Define colors for the visualization to iterate over
  palette = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000']
  colors = itertools.cycle(palette)
  color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}

  # Prepare dataframe and ignore outliers
  df = pd.DataFrame({"x": reduced_embeddings[:, 0], "y": reduced_embeddings[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
  df["Length"] = [len(doc) for doc in data]
  in_window = (df.y > -10) & (df.y < 10) & (df.x < 10) & (df.x > -10)
  # .copy() so the column assignment below acts on an independent frame
  # instead of a slice of the unfiltered one.
  df = df.loc[(df.Topic != "-1") & in_window, :].copy()
  df["Topic"] = df["Topic"].astype("category")

  # Get centroids of clusters. The categories are derived after filtering, so
  # every category is observed; observed=True only makes that explicit.
  mean_df = df.groupby("Topic", observed=True).mean().reset_index()
  mean_df.Topic = mean_df.Topic.astype(int)
  mean_df = mean_df.sort_values("Topic")

  plt.figure(figsize=(20, 15))
  sns.scatterplot(data=df, x='x', y='y', c=df['Topic'].map(color_key), alpha=0.4, sizes=(0.6, 10), size="Length")

  # Fall back to the plain topic id when no custom labels were set, so the
  # plot also works when the Llama labelling step was skipped.
  custom_labels = getattr(topic_model, "custom_labels_", None)

  # Annotate topics up to max_topic
  texts, xs, ys = [], [], []
  for _, row in mean_df.iterrows():
    topic = int(row["Topic"])
    if topic > max_topic:
      continue
    # NOTE(review): custom_labels_ is indexed directly by topic id, as before —
    # confirm the list is not shifted by an outlier (-1) entry at position 0.
    label = custom_labels[topic] if custom_labels else str(topic)
    name = textwrap.fill(label, 20)

    xs.append(row["x"])
    ys.append(row["y"])
    texts.append(plt.text(row["x"], row["y"], name, size=10, ha="center", color=color_key[str(topic)],
                          path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]
                          ))

  # Adjust annotations such that they do not overlap
  adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
  plt.axis('off')
  plt.legend('', frameon=False)
  plt.show()

In [None]:
## ATTENTION !!!!


## Run this cell only if you want to label topic using Large Language Model Llama.

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline

# System prompt: the instruction block placed at the start of every labelling request.
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
# Example prompt: one worked request/answer pair demonstrating the output we are looking for.
# NOTE(review): "the word food" looks like a typo for "the worst food"; it is left
# untouched here because the text is part of the prompt sent to the model.
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

# Main prompt: the actual request, with [DOCUMENTS] and [KEYWORDS] as placeholder tags
# (presumably filled in per topic by BERTopic's TextGeneration — confirm).
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""
# Full template: system block + worked example + request.
prompt = system_prompt + example_prompt + main_prompt




# Load the Llama-2 chat checkpoint and its tokenizer.
# The tokenizer gets a dedicated name so that it does not replace the
# notebook-level `tokenizer` that create_chunks uses for token counting.
LLAMA_MODEL_ID = "TheBloke/Llama-2-13B-chat-GPTQ"

llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID)

model = AutoModelForCausalLM.from_pretrained(LLAMA_MODEL_ID, device_map='auto')

# Text-generation pipeline used for topic labelling.
# NOTE(review): temperature usually only takes effect when sampling is
# enabled (do_sample=True) — confirm the intended decoding mode.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=llama_tokenizer,
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

def get_labels(topic_model):
  """Extract one clean label per topic from the 'Llama2' representation.

  For every entry of ``topic_model.get_topic_info()['Llama2']`` the first
  element is split into lines. The label is the first line that still has
  content after surrounding whitespace and a leading ``'Label:'`` prefix are
  removed. An entry without any such line yields an empty string.

  Args:
    topic_model: Object whose ``get_topic_info()`` result can be indexed with
      ``'Llama2'`` and iterated; each item is a sequence whose first element
      is the generated text.

  Returns:
    List of label strings, one per entry, in the same order.
  """
  prefix = 'Label:'
  labels = []
  for representation in topic_model.get_topic_info()['Llama2']:
    label = ''
    for line in representation[0].split('\n'):
      candidate = line.strip()
      if candidate.startswith(prefix):
        # Drop the prefix and the blank that normally follows it.
        candidate = candidate[len(prefix):].strip()
      if candidate:
        label = candidate
        break
    labels.append(label)
  return labels

In [None]:

import os
import pickle

import numpy as np
from bertopic import BERTopic
from bertopic.cluster import BaseCluster
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech, TextGeneration
from cuml.cluster import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

class Dimensionality:
  """Stand-in dimensionality-reduction model that serves pre-calculated embeddings.

  It offers the ``fit`` / ``transform`` pair of a reducer but never computes
  anything: ``transform`` always hands back the array given at construction.
  """
  def __init__(self, reduced_embeddings):
    # Pre-calculated reduced embeddings, returned as is by transform().
    self.reduced_embeddings = reduced_embeddings

  def fit(self, X, y=None):
    """No-op fit; returns ``self``.

    ``y`` is accepted (and ignored) so that callers which pass target labels
    as ``fit(X, y=...)`` can use this class without hitting a TypeError.
    """
    return self

  def transform(self, X):
    """Return the stored reduced embeddings; ``X`` is ignored."""
    return self.reduced_embeddings


class TopicModelling:
  """Build, save and reload a BERTopic model from artefacts cached under ``base_path``.

  The folder is expected to contain the files produced earlier in this
  notebook: ``only_text.json`` (chunks), ``embeddings.npy``,
  ``umap_embeddings.npy``, ``umap_2d_embeddings.npy`` and ``vocab.txt``
  (a pickled word list).

  Args:
    base_path: Folder holding the cached artefacts; the model is saved here too.
    embedding_model: Name passed to ``SentenceTransformer``.
    min_cluster_size: ``min_cluster_size`` given to HDBSCAN (default 30).
    min_samples: ``min_samples`` given to HDBSCAN (default 30).
    stopword_languages: Language name(s) passed to ``stopwords.words``.
      ``None`` (default) keeps the original call without arguments.
      NOTE(review): the argument-free call presumably returns the stop words
      of every language in the corpus — confirm this is intended for
      English-language articles.
  """
  def __init__(self, base_path, embedding_model='all-MiniLM-L6-v2',
               min_cluster_size=30, min_samples=30, stopword_languages=None):
    self.base_path = base_path
    self.embedding_model_name = embedding_model
    self.min_cluster_size = min_cluster_size
    self.min_samples = min_samples
    self.stopword_languages = stopword_languages
    self.prepare_paths()

  def save_model(self):
    """Save the fitted model to ``<base_path>/final`` in safetensors format.

    Raises:
      AttributeError: If no model has been built or loaded yet.
    """
    if getattr(self, 'topic_model', None) is None:
      raise AttributeError('No topic model available: call get_topic_model() or load_model() first.')
    save_path = os.path.join(self.base_path, 'final')
    self.topic_model.save(save_path, serialization="safetensors",
                          save_ctfidf=True, save_embedding_model=self.embedding_model_name)

  def load_model(self, model_name='final'):
    """Load a saved model from ``<base_path>/<model_name>`` plus the cached embeddings and data."""
    self.topic_model = BERTopic.load(os.path.join(self.base_path, model_name))
    self.load_embeddings()
    self.load_data()
    print('Model Loaded Successfully')
    return self.topic_model

  def prepare_paths(self):
    """Derive the paths of all cached artefacts from ``base_path``."""
    self.data_path = os.path.join(self.base_path, 'only_text.json')
    self.embeddings_path = os.path.join(self.base_path, 'embeddings.npy')
    self.umap_embeddings_path = os.path.join(self.base_path, 'umap_embeddings.npy')
    self.umap_embeddings_path_2d = os.path.join(self.base_path, 'umap_2d_embeddings.npy')
    self.vocab_path = os.path.join(self.base_path, 'vocab.txt')

  def load_data(self):
    """Read the text chunks into ``self.data`` via ``get_data``."""
    print('Fetching data....')
    self.data = get_data(self.data_path)

  def load_embeddings(self):
    """Load the full, reduced and 2-D reduced embeddings from their .npy files."""
    print('Fetching Embeddings...')
    self.embeddings = np.load(self.embeddings_path)
    self.reduced_embeddings = np.load(self.umap_embeddings_path)
    self.reduced_embeddings_2d = np.load(self.umap_embeddings_path_2d)

  def load_vocab(self):
    """Load the pickled vocabulary list into ``self.vocab``."""
    print('Fetching vocab...')
    # pickle.load executes arbitrary code from the file: only load a
    # vocab file that this notebook wrote itself.
    with open(self.vocab_path, 'rb') as fp:
      self.vocab = pickle.load(fp)

  def load_models(self):
    """Create the sub-models handed to BERTopic and cluster the reduced embeddings.

    Requires ``load_embeddings`` and ``load_vocab`` to have run first.
    """
    print('Loading Models...')
    self.embedding_model = SentenceTransformer(self.embedding_model_name)
    # Serve the pre-calculated reduced embeddings instead of refitting UMAP.
    self.umap_model = Dimensionality(self.reduced_embeddings)
    # BERTopic receives a BaseCluster instance; the cluster labels themselves
    # are computed right here and passed to fit() through `y`.
    self.hdbscan_model = BaseCluster()
    # Find clusters of semantically similar documents
    clusterer = HDBSCAN(min_samples=self.min_samples, gen_min_span_tree=True, prediction_data=False,
                        min_cluster_size=self.min_cluster_size, verbose=True)
    self.clusters = clusterer.fit(self.reduced_embeddings).labels_

    if self.stopword_languages:
      sw = stopwords.words(self.stopword_languages)
    else:
      sw = stopwords.words()
    self.vectorizer_model = CountVectorizer(vocabulary=self.vocab, stop_words=sw)
    keybert_model = KeyBERTInspired()

    # Part-of-Speech
    pos_model = PartOfSpeech("en_core_web_sm")

    # MMR
    mmr_model = MaximalMarginalRelevance(diversity=0.3)

    #Uncomment following line for using llama model for labelling
    #llama2 = TextGeneration(pipe, prompt=prompt)

    # All representation models
    self.representation_model = {
        "KeyBERT": keybert_model,
        # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
        "MMR": mmr_model,
        "POS": pos_model,
        # "Llama2": llama2, # Uncomment for using Llama
    }

  def get_topic_model(self):
    """Load every cached artefact, fit BERTopic on it and return the fitted model."""
    self.load_data()
    self.load_embeddings()
    self.load_vocab()
    self.load_models()
    print('Modelling...')
    self.topic_model = BERTopic(
            embedding_model=self.embedding_model,
            umap_model=self.umap_model,
            hdbscan_model=self.hdbscan_model,
            vectorizer_model=self.vectorizer_model,
            representation_model=self.representation_model,
            verbose=True
    ).fit(self.data, embeddings=self.embeddings, y=self.clusters)

    #Uncomment for using llama labels
    #llama2_labels = [label[0][0].split("\n")[0] for label in self.topic_model.get_topics(full=True)["Llama2"].values()]
    #self.topic_model.set_topic_labels(llama2_labels)

    return self.topic_model



### Modelling

In [None]:

# Build the topic model from the artefacts cached under base_path.
tp = TopicModelling(base_path=base_path)
topic_model = tp.get_topic_model()

# Display the topic overview returned by BERTopic.
topic_model.get_topic_info()

# See https://github.com/MaartenGr/BERTopic for the other functionality BERTopic offers.

#### Optional

In [None]:
# Save the fitted model to '<base_path>/final' (safetensors serialization, c-TF-IDF included).
tp.save_model()

In [None]:
# Load the saved model from '<base_path>/final' together with the cached embeddings and chunk data.
tp = TopicModelling(base_path=base_path)
topic_model = tp.load_model()

In [None]:
# Labels generated by Llama are sometimes malformed; the following code cleans them up.
# Requires the 'Llama2' entry in get_topic_info(), i.e. the Llama representation must have been enabled.
llama2_labels = get_labels(topic_model)
topic_model.set_topic_labels(llama2_labels)

### Visualization

In [None]:
topic_model.visualize_hierarchy(custom_labels=False)  # Set custom_labels=True if the Llama model was used for labelling

In [None]:
topic_model.visualize_documents(tp.data, custom_labels=False) # Set custom_labels=True if the Llama model was used for labelling

In [None]:
# Run only if the Llama model was used for labelling: the plot reads the model's custom topic labels.
advanced_visualization(topic_model, tp.reduced_embeddings_2d, tp.data)