# Topic Modeling

Using BERTopic

## Set up environment

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install bertopic[flair]

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/amicus-iv

Mounted at /content/gdrive
/content/gdrive/My Drive/amicus-iv


You'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- select GPU from the Hardware Accelerator drop-down

In [None]:
import pandas as pd
import numpy as np
from html import unescape

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

from transformers import AutoTokenizer
from datasets import load_dataset, load_metric, Dataset

from huggingface_hub import notebook_login

from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Data

BERTopic function takes a list of documents, so we need to set this up ourselves. 

### Option 0: Read in text from drive

I have saved a file on Google Drive called "data/amicus_text_512.csv", which contains the result of following the steps of Option 1 below. Since this produces the same results each time, we don't need to keep re-running it.

In [None]:
# Load the cached table of text chunks (the file written by the "save" cell of Option 1).
df = pd.read_csv('data/amicus_text_512.csv')
# Preview the first rows.
df.head(5)

Unnamed: 0,case,id,brief,brief_party,text
0,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,abortion battle conflict enumerated right life...
1,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,"annual difficult case, ( based numbers since 1..."
2,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,", believe state's interest protecting human li..."
3,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,younger best facilities. least two definitions...
4,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,"facilities. reasons, studies may apparently di..."


### Option 1: Tokenize text then decode back to original text

In [None]:
# Configure git to use the "store" credential helper before logging in.
!git config --global credential.helper store
# get access token on Huggingface website > settings > access token (make sure it's a write token)
# Prompts for the token; the dataset loaded in the next cell is private.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


Read in HF dataset

In [None]:
ds_path = 'repro-rights-amicus-briefs/repro-rights-amicus'
# use_auth_token must be true bc this is a private dataset
ds = load_dataset(ds_path, use_auth_token=True)

# Decode HTML entities (e.g. "&amp;" -> "&"), one batch of documents at a time.
ds = ds.map(
    lambda batch: {'text': [unescape(doc) for doc in batch['text']]}, batched=True
)

# Remove stop words for topic modeling.
# A frozenset gives the same constant-time membership test the Counter was
# being used for, without the meaningless counts.
stop_words = stopwords.words("english")
stopwords_dict = frozenset(stop_words)


def remove_stop_words(text):
    """Return `text` with English stop words removed.

    The NLTK stop-word list is lowercase, so each word is lower-cased for the
    comparison only. Without that, capitalised stop words ("The", "And") slip
    through here and are then lower-cased into the corpus by the uncased
    tokenizer used later in this notebook. Words that are kept retain their
    original casing. Words are split on whitespace, so a stop word with
    punctuation attached (e.g. "the,") is not removed.

    NOTE: a cached data/amicus_text_512.csv produced before this fix must be
    regenerated by re-running Option 1 to pick up the change.
    """
    return ' '.join(word for word in text.split() if word.lower() not in stopwords_dict)


ds = ds.map(lambda example: {'text': remove_stop_words(example['text'])}, batched=False)

Tokenize

In [None]:
# Name of the pretrained tokenizer to load.
model_tokenizer = 'bert-base-uncased'

#instantiate tokenizer
# `tokenizer` is used as a global by tokenize_and_split and decode_chunks below.
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer)

# split documents into text of size 512 tokens
def tokenize_and_split(examples):
    """Tokenize a batch of documents into overlapping fixed-length windows.

    Each document in ``examples["text"]`` is cut into windows of at most 512
    tokens (stride 128 between consecutive windows), padded to the full length.
    One document can produce several windows, so every original column is
    repeated to line up row-for-row with the windows.

    Uses the module-level ``tokenizer``.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        stride=128,
        truncation=True,
        padding='max_length',
        return_overflowing_tokens=True,
    )
    # For each window: the position, within this batch, of its source document.
    window_to_doc = encoded.pop("overflow_to_sample_mapping")
    # Copy the source document's columns onto each of its windows.
    encoded.update(
        {
            column: [values[doc] for doc in window_to_doc]
            for column, values in examples.items()
        }
    )
    return encoded

# tokenize
# Run tokenize_and_split over the dataset in batches of 100 examples.
tokenized_ds = ds.map(tokenize_and_split, batched = True, batch_size = 100)

# decode tokenized text back to original text 
def decode_chunks(example):
    """Decode each window of token ids in a batch back into plain text.

    The decoded strings are stored under the new ``'text_chunk'`` key and the
    same (mutated) batch is returned. Special tokens are skipped while
    decoding. Uses the module-level ``tokenizer``.
    """
    example['text_chunk'] = tokenizer.batch_decode(
        example['input_ids'],
        clean_up_tokenization_spaces=True,
        skip_special_tokens=True,
    )
    return example

# decode
# Run decode_chunks over the tokenized dataset in batches of 100 rows, adding 'text_chunk'.
tokenized_ds = tokenized_ds.map(decode_chunks, batched=True, batch_size=100)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/65 [00:00<?, ?ba/s]

Put document chunks into a list (since bertopic model only takes lists)

In [None]:
# new way using decoded tokenized text
def _gather(column):
    """Return `column` from the train, valid and test splits as one list, in that order."""
    return (
        tokenized_ds['train'][column]
        + tokenized_ds['valid'][column]
        + tokenized_ds['test'][column]
    )


# Every list is built in the same split order, so position i in each of them
# refers to the same text chunk.
sequences = _gather('text_chunk')
case = _gather('case')
brief_ids = _gather('id')
brief_names = _gather('brief')
brief_party = _gather('brief_party')

# check we have the results we expect
print(type(sequences))
print(type(sequences[0]))
print(len(sequences))

<class 'list'>
<class 'str'>
11501


In [None]:
# save
# One row per text chunk; the key order below is the column order of the CSV.
chunk_columns = {
    'case': case,
    'id': brief_ids,
    'brief': brief_names,
    'brief_party': brief_party,
    'text': sequences,
}
df = pd.DataFrame(chunk_columns)
df.to_csv('data/amicus_text_512.csv', index=False)

### Option 2: split text by words 

Define a function to split text into chunks of 512 words. Since we aren't using Hugging Face pipelines, we have to make this rough cut and accept that we're introducing inefficiencies into our process. 

In [None]:
def split_text(text, n):
    """Break `text` into consecutive pieces of at most `n` whitespace-separated words.

    Returns a list of strings. Every piece holds `n` words except possibly the
    last one. Runs of whitespace collapse to single spaces, and text without
    any words yields an empty list.
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), n):
        pieces.append(' '.join(words[start:start + n]))
    return pieces

In [None]:
n = 512
# Chunk each document, give every chunk its own row (`explode` repeats the
# other columns), then let the chunk column replace the full text.
# NOTE(review): this needs a `df` with a `txt_short` column; the frames built
# in Option 0 / Option 1 above do not have one -- confirm where it comes from.
df_512 = (
    df.assign(
        txt_split=lambda frame: frame.apply(
            lambda row: split_text(row['txt_short'], n), axis=1
        )
    )
    .explode('txt_split')
    .drop(columns='txt_short')
    .rename(columns={'txt_split': 'text'})
)
len(df_512)

11804

In [None]:
df_512.head(1)

Unnamed: 0,case,brief,id,text
0,Anders v Floyd,Anders v Floyd - amicus brief for appellant (o...,861815186515,many roe v wade killings are murder the eviden...


Make a list of documents -- do not shuffle! 

In [None]:
list_512 = list(df_512['text'])

# Part 1: Use fine-tuned transformer

Flair allows you to choose almost any 🤗 transformers model. Select any public model from the HF model hub and pass it to BERTopic.

In [None]:
model_checkpoint = 'repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus'

## Training

So, we can use our fine-tuned model here! Here, we use bert-base-uncased finetuned on our reproductive rights amicus.

Note that you have to make the model public in order to do this. 

**Only do this once! Skip to 'load saved model' section if this has already been completed**

Takes 9 minutes.

In [None]:
# init embeddings and model
# Document embeddings come from the transformer named by `model_checkpoint`.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)
bbu_ft_tm = BERTopic(
    embedding_model=bbu_ft_embed,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
11501it [08:11, 23.40it/s]
2022-03-21 19:16:12,594 - BERTopic - Transformed documents to Embeddings
2022-03-21 19:16:45,045 - BERTopic - Reduced dimensionality with UMAP
2022-03-21 19:16:52,868 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Fit the model to our data (9 mins)

In [None]:
# fit model
# BERTopic documents its input as a list of strings, so pass a plain list
# instead of the pandas Series. `bbu_ft_topics` holds one topic id per
# document, in the row order of `df`.
bbu_ft_topics, bbu_ft_probs = bbu_ft_tm.fit_transform(df['text'].tolist())

11501it [08:44, 21.92it/s]
2022-03-21 19:31:17,761 - BERTopic - Transformed documents to Embeddings
2022-03-21 19:31:31,987 - BERTopic - Reduced dimensionality with UMAP
2022-03-21 19:31:40,939 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Save model

In [None]:
bbu_ft_tm.save('bbu_rramicus')

## Extract Topics

Only do this once; once saved, skip to next section.

In [None]:
bbu_ft_freq = bbu_ft_tm.get_topic_info()
bbu_ft_freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,5933,-1_abortion_court_state_right
1,0,378,0_respectfully_counsel_submitted_attorney
2,1,243,1_zone_buffer_content_speech
3,2,204,2_casey_undue_burden_regulations
4,3,178,3_injunction_speech_picketing_public
5,4,178,4_intact_tr_dr_chasen
6,5,166,5_adolescents_adolescent_study_psychological
7,6,139,6_conclusion_reasons_judgment_foregoing
8,7,138,7_title_planning_program_services
9,8,123,8_clinic_1993_violence_rescue


Save files -- make sure to change file names! 

In [None]:
# full list of topics + associated words
bbu_ft_freq.to_csv('output/topics_bbu_rramicus.csv', index=False)

# classification by paragraph
# Attach the assigned topic id to every row of `df`, then look up the topic
# name; the left join keeps every row of `df`.
topic_id = bbu_ft_freq[['Topic', 'Name']]
output_df = (
    df.assign(topic_id=bbu_ft_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv('output/topic_classification_bbu_rramicus.csv', index=False)

Get similar topics to a word

In [None]:
similar_topics, similarity = bbu_ft_tm.find_topics("physician", top_n=5)
print(similar_topics)
bbu_ft_tm.get_topic(similar_topics[1])

[121, 83, 144, 55, 131]


[('462', 0.02527786069940472),
 ('physician', 0.022643759939560082),
 ('akron', 0.022515536622137674),
 ('compelling', 0.017801932667464677),
 ('state', 0.013705446020272091),
 ('minor', 0.01338338044260382),
 ('decision', 0.01295395091522947),
 ('city', 0.012829252289062735),
 ('patient', 0.01267554983802088),
 ('medical', 0.012218936239160334)]

In [None]:
representative_docs = bbu_ft_tm.get_representative_docs(85)
representative_docs

["statute purpose evading judicial review. 2 b. section 6 ( 1 ) illinois abortion law, amended 1979, prescribed standard care doctor performing abortion “ after fetus known viable ” ( 84 - 1379 j. s. app. 6 ). november 16, 1979, four days statute became effective, district court preliminarily enjoined enforcement, finding incorporated unconstitutional definition viability ( id. 3 ). september 1983, illinois legislature amended definition viability, providing fetus viable “ when, medical judgment attending physician based particular facts case him, reasonable likelihood sustained survival fetus outside womb, without artificial support ” ( id. 7, 58 ). october 14, 1983, district court sustained amended definition * 13 viability, held section 6 ( 1 ) thus constitutional, lifted preliminary injunction ( 579 f. supp. 466, 469 ). june 30, 1984, instant appeal pending seventh circuit, illinois legislature amended section 6 ( 1 ) ( see 84 - 1379 j. s. app. 7, 57 - 58 ). court appeals recognize

## Load saved model

Run this code if the previous sections (training, extract topics) have already been run once

In [None]:
# read in model
bbu_ft_tm = BERTopic.load("bbu_rramicus")
# frequency of each topic
bbu_ft_freq = bbu_ft_tm.get_topic_info()
# full topics: mapping of topic id -> top words with their scores
bbu_ft_topic_words = bbu_ft_tm.get_topics()
# Per-document topic assignments, in the row order of `df`.
# `bbu_ft_topics` has to stay a list with one topic id per document, because
# the "Topics per class" cell passes it to `topics_per_class`; binding it to
# the `get_topics()` dict would break that call. The assignments are read back
# from the classification file written in the "Extract Topics" section.
bbu_ft_topics = pd.read_csv('output/topic_classification_bbu_rramicus.csv')['topic_id'].tolist()

## Visualize Topics

In [None]:
bbu_ft_tm.visualize_topics()



## Topics per class

We can divide up the topics into those that appear in one class vs the other (fem briefs and opp briefs)

In [None]:
# Build the inputs from `df` so this cell works whichever data option was run
# (`sequences` and `brief_party` only exist after the Option 1 cells).
docs = df['text'].tolist()
doc_classes = df['brief_party'].tolist()

# `topics_per_class` needs one topic id per document. `bbu_ft_topics` may
# instead hold the topic -> words dict returned by `get_topics()`; in that
# case fall back to the assignments saved in the "Extract Topics" section.
if isinstance(bbu_ft_topics, dict):
    doc_topics = pd.read_csv('output/topic_classification_bbu_rramicus.csv')['topic_id'].tolist()
else:
    doc_topics = list(bbu_ft_topics)
assert len(docs) == len(doc_topics) == len(doc_classes), \
    "documents, topics and classes must line up row-for-row"

topics_per_class = bbu_ft_tm.topics_per_class(docs, doc_topics, doc_classes)
topics_per_class.head(10)

Unnamed: 0,Topic,Words,Frequency,Class
0,-1,"the, of, to, in, abortion",3693,0
1,0,"speech, or, injunction, hobbs, the",991,0
2,1,"ul0, b0, of, at, in",1005,0
3,2,"casey, undue, burden, at, regulations",240,0
4,3,"title, program, funds, regulations, 1008",139,0
5,4,"murder, supreme, unborn, constitution, person",273,0
6,5,"texas, hb2, clinics, women, http",48,0
7,6,"minor, parental, notification, parents, decision",68,0
8,7,"my, me, was, baby, had",47,0
9,8,"roe, constitutional, court, wade, this",69,0


In [None]:
#fem_brief_bbu_topics = topics_per_class[topics_per_class['Class']==1].drop(['Class'],axis=1,inplace=False)
bbu_ft_tm.visualize_topics_per_class(topics_per_class, top_n_topics=5, normalize_frequency=True)

## Reduce n topics

This is a manual decision

In [None]:
#new_topics, new_probs = topic_model.reduce_topics(list_512, topics, probs, nr_topics=60)

## Topic hierarchy

Another way to visually examine how topics are related to one another. Just from looking at this, I think it would make more sense to topic model pro-women and pro-opp briefs separately, since they often use similar language/topics but are articulating very different points on them! 

In [None]:
# `topic_model` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the fitted full-corpus model is `bbu_ft_tm`.
bbu_ft_tm.visualize_hierarchy(top_n_topics=25)

## Topic Similarity

Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other.

In [None]:
# `topic_model` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the fitted full-corpus model is `bbu_ft_tm`.
bbu_ft_tm.visualize_heatmap(n_clusters=10, width=1000, height=1000)

# Part 2: Seed topics (*skip*)

https://maartengr.github.io/BERTopic/api/bertopic.html

In [None]:
# Each inner list is one group of seed words handed to BERTopic via `seed_topic_list`.
seed_topic_list = [['physician', 'doctor', 'medical professional', 'medical expert'], ['women', 'mother']]
# No `embedding_model` is passed here, unlike the fine-tuned model in Part 1.
seed_topic_model = BERTopic(language = 'english', calculate_probabilities=True, verbose=True,
                            seed_topic_list = seed_topic_list)
# NOTE(review): `sequences` is only created by the Option 1 cells; it is undefined
# if the data was loaded through Option 0.
seed_topics, seed_probs = seed_topic_model.fit_transform(sequences)

Batches:   0%|          | 0/498 [00:00<?, ?it/s]

2022-03-16 17:40:37,439 - BERTopic - Transformed documents to Embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2022-03-16 17:41:03,827 - BERTopic - Reduced dimensionality with UMAP
2022-03-16 17:41:24,975 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
seed_freq = seed_topic_model.get_topic_info()
seed_freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,5744,-1_the_to_of_and
1,0,473,0_parental_minors_parents_minor
2,1,372,1_court_federal_courts_legislative
3,2,303,2_mortality_pregnancy_complications_deaths
4,3,265,3_hill_zone_buffer_speech
5,4,224,4_roe_privacy_wade_right
6,5,200,5_louisiana_620_privileges_admitting
7,6,180,6_psychological_women_study_mental
8,7,179,7_children_parents_family_parental
9,8,171,8_human_life_being_we


In [None]:
seed_women_similar_topics, seed_women_similarity = seed_topic_model.find_topics("women's rights", top_n=5)
print(seed_women_similar_topics)
seed_topic_model.get_topic(seed_women_similar_topics[0])

[165, 139, 86, 91, 127]


[('women', 0.014136297774918507),
 ('laws', 0.011999050580616942),
 ('illegal', 0.010237065081556607),
 ('enforcement', 0.009734576408527344),
 ('dying', 0.0094023498417061),
 ('abortions', 0.009385982433003582),
 ('rape', 0.008878073201327344),
 ('incest', 0.008369399996816313),
 ('prosecution', 0.0077237846543999214),
 ('criminal', 0.007508319106164667)]

In [None]:
seed_phys_similar_topics, seed_phys_similarity = seed_topic_model.find_topics("doctor", top_n=5)
print(seed_phys_similar_topics)
seed_topic_model.get_topic(seed_phys_similar_topics[1])

[164, 66, 77, 93, 11]


[('hospital', 0.02913038745081937),
 ('credentialing', 0.025738828794159216),
 ('privileges', 0.022390621401369354),
 ('hospitals', 0.019021824083630595),
 ('care', 0.016055733923885766),
 ('staff', 0.015923603500972786),
 ('physician', 0.015268006792430806),
 ('ms', 0.015124700368570216),
 ('admitting', 0.012263187376225206),
 ('physicians', 0.011998829970855503)]

# Part 3: Split into fem and opp

In this section, we fit models for fem and opp briefs separately in order to get more specific topic information. 

In [None]:
# split data
# `brief_party` is the class flag: rows equal to 0 form the opp set, rows equal to 1 the fem set.
party = df['brief_party']
opp_df = df.loc[party.eq(0)]
fem_df = df.loc[party.eq(1)]

## Fem topic model

Init existing topic model again so we don't over-write existing model.

In [None]:
fem_tm = BERTopic.load("bbu_rramicus")

Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# `fem_df` is a filtered slice of `df`, so its 'text' Series still carries the
# original, non-contiguous row labels. Pass a plain list -- the input type
# BERTopic documents -- so documents are handled purely by position.
# `fem_topics` holds one topic id per row of `fem_df`, in order.
fem_topics, fem_probs = fem_tm.fit_transform(fem_df['text'].tolist())

5227it [05:18, 16.42it/s]


In [None]:
fem_topic_info = fem_tm.get_topic_info()
fem_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,2210,-1_court_abortion_state_right
1,0,607,0_texas_women_abortion_2015
2,1,130,1_counsel_respectfully_street_submitted
3,2,92,2_clinic_1993_dr_violence
4,3,82,3_would_could_able_abortion


In [None]:
fem_tm.visualize_topics()

Find topics

In [None]:
# Look up the five topics closest to the query term, with their similarity scores.
similar_topics, similarity = fem_tm.find_topics("medical", top_n=5)
print(similar_topics)
print(similarity)
# Print the top words of each returned topic, in the order returned.
for topic in similar_topics:
    print(fem_tm.get_topic(topic))

[57, 46, -1, 35, 59]
[0.9860490372255521, 0.9815115778404839, 0.9811166197960979, 0.9800830365274396, 0.9793343559125567]
[('act', 0.01543954118686647), ('emergency', 0.014231235818460135), ('medical', 0.013855040367193103), ('physician', 0.01369661624339168), ('patient', 0.013609853728641), ('hampshire', 0.012316739126711787), ('health', 0.012138903189156435), ('abortion', 0.01207191176826808), ('physicians', 0.011911218625010789), ('judge', 0.011798782385998659)]
[('organization', 0.03879029805705921), ('rights', 0.028612205730736546), ('reproductive', 0.02238318008521494), ('national', 0.021545567586049752), ('women', 0.021178794951134908), ('civil', 0.01938167511672356), ('legal', 0.01754986814847482), ('education', 0.016915347950587963), ('health', 0.01645128531672538), ('advocacy', 0.015872087366163745)]
[('court', 0.009003044817547996), ('abortion', 0.0078714103351809), ('state', 0.007542976329646191), ('right', 0.006502928377333628), ('health', 0.006451944211089597), ('women', 

Save

In [None]:
# full list of topics + associated words
fem_topic_info.to_csv('output/fem_topics_bbu_rramicus.csv', index=False)

# classification by paragraph
# Attach the assigned topic id to every row of `fem_df`, then look up the
# topic name; the left join keeps every row of `fem_df`.
topic_id = fem_topic_info[['Topic', 'Name']]
output_df = (
    fem_df.assign(topic_id=fem_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv('output/fem_topic_classification_bbu_rramicus.csv', index=False)

In [None]:
fem_tm.save('fem_bbu_rramicus')

## Opp topic model

Init existing topic model again so we don't over-write existing model.

In [None]:
opp_tm = BERTopic.load("bbu_rramicus")

Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# `opp_df` is a filtered slice of `df`, so its 'text' Series still carries the
# original, non-contiguous row labels. Pass a plain list -- the input type
# BERTopic documents -- so documents are handled purely by position.
# `opp_topics` holds one topic id per row of `opp_df`, in order.
opp_topics, opp_probs = opp_tm.fit_transform(opp_df['text'].tolist())

6274it [05:33, 18.83it/s]


In [None]:
opp_topic_info = opp_tm.get_topic_info()
opp_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,3024,-1_abortion_court_state_right
1,0,189,0_respectfully_counsel_submitted_attorney
2,1,141,1_hobbs_extortion_act_property
3,2,138,2_life_child_unborn_human
4,3,131,3_any_supreme_unborn_evidence


In [None]:
#opp_tm.visualize_topics(top_n_topics=50)
opp_tm.visualize_topics()

In [None]:
# full list of topics + associated words
opp_topic_info.to_csv('output/opp_topics_bbu_rramicus.csv', index=False)

# classification by paragraph
# Attach the assigned topic id to every row of `opp_df`, then look up the
# topic name; the left join keeps every row of `opp_df`.
topic_id = opp_topic_info[['Topic', 'Name']]
output_df = (
    opp_df.assign(topic_id=opp_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv('output/opp_topic_classification_bbu_rramicus.csv', index=False)

In [None]:
opp_tm.save('opp_bbu_rramicus')