# Topic Modeling

Using BERTopic

## Set up environment

you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- select GPU from the Hardware Accelerator drop-down

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install bertopic[flair]

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/amicus-iv

Mounted at /content/gdrive
/content/gdrive/My Drive/amicus-iv


Saving locations -- change these for different models!

In [None]:
model_folder = 'topic-modeling/models/bert-base-uncased-RRamicus/'
output_folder = 'topic-modeling/output/bert-base-uncased-RRamicus/'

Model checkpoint for generating document embeddings

In [None]:
model_checkpoint = 'repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus'

Import packages

In [None]:
import pandas as pd
import numpy as np
from html import unescape
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

#from transformers import AutoTokenizer
#from datasets import load_dataset, load_metric, Dataset

from huggingface_hub import notebook_login

from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings

from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
from typing import List
import hdbscan
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Define similarity function

We want to group topics based on how similar they are. This is an adaptation of BERTopic's `visualize_topics()` function.

In [None]:
def get_similar_topics(topic_model,
                       topics: List[int] = None,
                       top_n_topics: int = None,
                       width: int = 650,
                       height: int = 650):
    """Project topics to 2D from their c-TF-IDF rows and cluster them with HDBSCAN.

    Parameters
    ----------
    topic_model
        Fitted topic model. Must provide ``get_topics()``, ``get_topic()``,
        ``get_topic_freq()``, ``topic_sizes`` and ``c_tf_idf``.
    topics : list of int, optional
        Topic ids to include. Takes precedence over ``top_n_topics``.
    top_n_topics : int, optional
        Take this many entries of ``get_topic_freq()``, skipping its first row.
        Only used when ``topics`` is None.
    width, height : int
        Unused; kept so the signature stays unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per selected topic except topic -1 (BERTopic's outlier topic),
        with columns ``x``, ``y`` (2D UMAP coordinates), ``Label`` (HDBSCAN
        cluster label), ``Topic``, ``Words`` (first five topic words joined
        by " | ") and ``Size``.
    """
    # Resolve the topic ids once, in sorted order, so every per-topic sequence
    # below (sizes, words, c-TF-IDF rows) is aligned on the same ordering.
    if topics is not None:
        topic_list = sorted(topics)
    elif top_n_topics is not None:
        topic_list = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topic_list = sorted(topic_model.get_topics().keys())

    # Extract topic words and their frequencies
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # seed
    np.random.seed(11)

    # Embed c-TF-IDF into 2D; row positions are looked up in the sorted list of all topic ids
    all_topics = sorted(topic_model.get_topics().keys())
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

    # cluster based on above
    labels = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=3).fit_predict(embeddings)

    # Leave topic -1 out of the result by id rather than by position: it is only
    # the first row when it was actually part of the selection.
    keep = np.array([topic != -1 for topic in topic_list], dtype=bool)
    df = pd.DataFrame({"x": embeddings[keep, 0],
                       "y": embeddings[keep, 1],
                       "Label": labels[keep],
                       "Topic": [topic for topic, kept in zip(topic_list, keep) if kept],
                       "Words": [text for text, kept in zip(words, keep) if kept],
                       "Size": [size for size, kept in zip(frequencies, keep) if kept]})
    return df

## Data

BERTopic function takes a list of documents, so we need to set this up ourselves. 

## Option 0: Read in text from drive

I have saved a file on Google Drive called "data/amicus_clean_512_no_stride_vars.csv", which contains the result of following the steps of Option 1 below. Since this produces the same results each time, we don't need to keep re-running it.

In [None]:
df = pd.read_csv('data/amicus_clean_512_no_stride_vars.csv')
df.head(1)

Unnamed: 0,case,id,brief,brief_party,forgau,antauo,chcauf,lifauo,govauf,govauo,...,legauo,relauf,relauo,lg_bauf,forgauf,aclauf,aclauo,sgauf,sgauo,text
0,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,abortion battle conflict enumerated right life...


Remove general noise words

In [None]:
# Tokens stripped from every paragraph before topic modeling: URL and citation
# fragments, short stray letter pairs, court/party vocabulary and ordinals.
# 'ninth' replaces the misspelled 'nineth', which never matched the text.
rmv_list = ['ii', 'https', 'al', 'et', 'per', 'www', 'llp', 'id', 'nos', 'pdf', 'http',
            'ul', 'fi', 'ri', 'sb', 'ql', 'li', 'fs',
            'circuit', 'district', 'supra', 'supp', 'decisis', 'amici', 'curiae', 'court', 'courts', 'supreme', 'appeals',
            'appeal', 'appellants', 'appellant', 'appellee', 'appellees',
            'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']

# A set gives constant-time lookups per token; `rmv_list` itself stays a list
# because later cells reuse it.
noise_words = set(rmv_list)
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in noise_words))

# Legauo and Legauf

## Legauo

In [None]:
# rows flagged legauo == 1, trimmed to the columns used below
legauo_df = df.loc[df['legauo'] == 1, ['case', 'id', 'brief', 'legauo', 'text']]
print("number of briefs:", legauo_df['id'].nunique())
print("number of paragraphs:", len(legauo_df))

number of briefs: 98
number of paragraphs: 760


Remove noise

In [None]:
#rmv_list = []
#legauo_df['text'] = legauo_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
legauo_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Downloading:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
legauo_topics, legauo_probs = legauo_tm.fit_transform(legauo_df['text'].tolist())

760it [00:34, 22.08it/s]
2022-04-11 17:16:00,203 - BERTopic - Transformed documents to Embeddings
2022-04-11 17:16:12,735 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 17:16:12,979 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
legauo_topic_info = legauo_tm.get_topic_info()
legauo_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,176,-1_abortion_right_state_states
1,0,30,0_respectfully_submitted_conclusion_counsel
2,1,24,1_abortion_privacy_pain_human
4,4,19,4_commerce_interstate_federal_act
5,5,19,5_undue_burden_prong_casey


In [None]:
len(legauo_topic_info)

64

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
legauo_full_topics = legauo_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(legauo_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency
topic_ct = legauo_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'legauo_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every row with its topic id and topic name
topic_id = legauo_topic_info[['Topic', 'Name']]
output_df = (
    legauo_df
    .assign(topic_id=legauo_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'legauo_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
legauo_embed = get_similar_topics(legauo_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
legauo_topic_df = pd.read_csv(output_folder + 'legauo_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
legauo_embed = (
    legauo_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(legauo_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
legauo_embed.to_csv(output_folder + 'legauo_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
legauo_tm.save(model_folder + 'legauo_bbu_rramicus')

## Legauf

In [None]:
# rows flagged legauf == 1, trimmed to the columns used below
legauf_df = df.loc[df['legauf'] == 1, ['case', 'id', 'brief', 'legauf', 'text']]
print("number of briefs:", legauf_df['id'].nunique())
print("number of paragraphs:", len(legauf_df))

number of briefs: 74
number of paragraphs: 599


Remove noise

In [None]:
#rmv_list = []
#legauf_df['text'] = legauf_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
legauf_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
legauf_topics, legauf_probs = legauf_tm.fit_transform(legauf_df['text'].tolist())

599it [00:29, 20.22it/s]
2022-04-11 17:17:08,990 - BERTopic - Transformed documents to Embeddings
2022-04-11 17:17:14,404 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 17:17:14,561 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
legauf_topic_info = legauf_tm.get_topic_info()
legauf_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,133,-1_state_right_abortion_law
1,0,24,0_roe_right_interest_womans
2,1,23,1_street_counsel_respectfully_submitted
3,2,18,2_act_necessary_emergency_app
4,3,17,3_petitioners_zone_buffer_facility


In [None]:
len(legauf_topic_info)

56

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
legauf_full_topics = legauf_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(legauf_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency
topic_ct = legauf_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'legauf_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every row with its topic id and topic name
topic_id = legauf_topic_info[['Topic', 'Name']]
output_df = (
    legauf_df
    .assign(topic_id=legauf_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'legauf_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
legauf_embed = get_similar_topics(legauf_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
legauf_topic_df = pd.read_csv(output_folder + 'legauf_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
legauf_embed = (
    legauf_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(legauf_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
legauf_embed.to_csv(output_folder + 'legauf_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
legauf_tm.save(model_folder + 'legauf_bbu_rramicus')

# Medauo and Medauf

## Medauo

In [None]:
# rows flagged medauo == 1, trimmed to the columns used below
medauo_df = df.loc[df['medauo'] == 1, ['case', 'id', 'brief', 'medauo', 'medauf', 'text']]
print("number of briefs:", medauo_df['id'].nunique())
print("number of paragraphs:", len(medauo_df))

number of briefs: 37
number of paragraphs: 337


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
# drop every whitespace-separated token that appears in rmv_list
medauo_df['text'] = [
    ' '.join(token for token in doc.split() if token not in rmv_list)
    for doc in medauo_df['text']
]

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
medauo_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
medauo_topics, medauo_probs = medauo_tm.fit_transform(medauo_df['text'].tolist())

337it [00:16, 19.93it/s]
2022-04-11 17:17:54,449 - BERTopic - Transformed documents to Embeddings
2022-04-11 17:17:58,469 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 17:17:58,539 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
medauo_topic_info = medauo_tm.get_topic_info()
medauo_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,74,-1_abortion_medical_mortality_women
1,0,18,0_right_life_law_rights
2,1,17,1_conclusion_respectfully_counsel_submitted
3,2,16,2_health_abortion_medical_consent
4,3,16,3_abortion_jan_visited_available


In [None]:
len(medauo_topic_info)

31

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
medauo_full_topics = medauo_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(medauo_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency
topic_ct = medauo_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'medauo_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every row with its topic id and topic name
topic_id = medauo_topic_info[['Topic', 'Name']]
output_df = (
    medauo_df
    .assign(topic_id=medauo_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'medauo_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
medauo_embed = get_similar_topics(medauo_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
medauo_topic_df = pd.read_csv(output_folder + 'medauo_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
medauo_embed = (
    medauo_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(medauo_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
medauo_embed.to_csv(output_folder + 'medauo_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
medauo_tm.save(model_folder + 'medauo_bbu_rramicus')

## Medauf

In [None]:
# rows flagged medauf == 1, trimmed to the columns used below
medauf_df = df.loc[df['medauf'] == 1, ['case', 'id', 'brief', 'medauo', 'medauf', 'text']]
print("number of briefs:", medauf_df['id'].nunique())
print("number of paragraphs:", len(medauf_df))

number of briefs: 92
number of paragraphs: 806


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
# drop every whitespace-separated token that appears in rmv_list
medauf_df['text'] = [
    ' '.join(token for token in doc.split() if token not in rmv_list)
    for doc in medauf_df['text']
]

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
medauf_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
medauf_topics, medauf_probs = medauf_tm.fit_transform(medauf_df['text'].tolist())

806it [00:38, 21.12it/s]
2022-04-11 17:18:54,985 - BERTopic - Transformed documents to Embeddings
2022-04-11 17:19:01,142 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 17:19:01,409 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
medauf_topic_info = medauf_tm.get_topic_info()
medauf_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,172,-1_abortion_health_women_medical
1,0,57,0_respectfully_counsel_conclusion_submitted
2,1,56,1_texas_women_abortion_care
3,2,36,2_amendment_enterprise_federal_hyde
4,3,28,3_roe_life_constitutional_health


In [None]:
len(medauf_topic_info)

65

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
medauf_full_topics = medauf_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(medauf_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency -- taken from the medauf model's own topic table
# (this cell previously merged the medauo counts)
topic_ct = medauf_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'medauf_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every medauf row with its topic id and
# topic name (this cell previously exported the medauo rows and assignments)
topic_id = medauf_topic_info[['Topic', 'Name']]
output_df = (
    medauf_df
    .assign(topic_id=medauf_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'medauf_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
medauf_embed = get_similar_topics(medauf_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
medauf_topic_df = pd.read_csv(output_folder + 'medauf_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
medauf_embed = (
    medauf_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(medauf_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
medauf_embed.to_csv(output_folder + 'medauf_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
medauf_tm.save(model_folder + 'medauf_bbu_rramicus')

# Mpeauo and Mpeauf

## Mpeauo

In [None]:
# rows flagged mpeauo == 1, trimmed to the columns used below
mpeauo_df = df.loc[df['mpeauo'] == 1, ['case', 'id', 'brief', 'mpeauo', 'text']]
print("number of briefs:", mpeauo_df['id'].nunique())
print("number of paragraphs:", len(mpeauo_df))

number of briefs: 19
number of paragraphs: 158


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
# drop every whitespace-separated token that appears in rmv_list
mpeauo_df['text'] = [
    ' '.join(token for token in doc.split() if token not in rmv_list)
    for doc in mpeauo_df['text']
]

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# document embedder built from the fine-tuned checkpoint
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# seeded UMAP reducing the embeddings to three components
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# topic model wired to the embedder and the UMAP instance above
mpeauo_tm = BERTopic(
    embedding_model=bbu_ft_embed,
    umap_model=umap_model,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
mpeauo_topics, mpeauo_probs = mpeauo_tm.fit_transform(mpeauo_df['text'].tolist())

158it [00:12, 12.41it/s]
2022-04-11 18:07:39,463 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:07:42,184 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:07:42,258 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
mpeauo_topic_info = mpeauo_tm.get_topic_info()
mpeauo_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,26,-1_abortion_care_health_medical
1,0,13,0_statute_children_unborn_state
2,1,12,1_abortion_interest_health_statute
3,2,12,2_abortion_child_women_medical
4,3,10,3_autonomy_conclusion_medical_respectfully


In [None]:
len(mpeauo_topic_info)

19

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
mpeauo_full_topics = mpeauo_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(mpeauo_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency
topic_ct = mpeauo_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'mpeauo_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every row with its topic id and topic name
topic_id = mpeauo_topic_info[['Topic', 'Name']]
output_df = (
    mpeauo_df
    .assign(topic_id=mpeauo_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'mpeauo_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
mpeauo_embed = get_similar_topics(mpeauo_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
mpeauo_topic_df = pd.read_csv(output_folder + 'mpeauo_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
mpeauo_embed = (
    mpeauo_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(mpeauo_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
mpeauo_embed.to_csv(output_folder + 'mpeauo_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
mpeauo_tm.save(model_folder + 'mpeauo_bbu_rramicus')

## Mpeauf

In [None]:
# rows flagged mpeauf == 1, trimmed to the columns used below
mpeauf_df = df.loc[df['mpeauf'] == 1, ['case', 'id', 'brief', 'mpeauf', 'text']]
print("number of briefs:", mpeauf_df['id'].nunique())
print("number of paragraphs:", len(mpeauf_df))

number of briefs: 53
number of paragraphs: 457


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
# drop every whitespace-separated token that appears in rmv_list
mpeauf_df['text'] = [
    ' '.join(token for token in doc.split() if token not in rmv_list)
    for doc in mpeauf_df['text']
]

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
mpeauf_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model -- pass a plain list of strings (the documented input type) rather
# than a Series that still carries the row labels of the filtered frame
mpeauf_topics, mpeauf_probs = mpeauf_tm.fit_transform(mpeauf_df['text'].tolist())

457it [00:21, 21.64it/s]
2022-04-11 17:42:53,128 - BERTopic - Transformed documents to Embeddings
2022-04-11 17:42:58,013 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 17:42:58,104 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
mpeauf_topic_info = mpeauf_tm.get_topic_info()
mpeauf_topic_info.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,109,-1_abortion_health_medical_women
1,0,35,0_texas_care_privileges_abortion
2,1,28,1_act_acts_enterprise_commerce
3,2,20,2_abortions_abortion_women_trimester
4,3,18,3_section_physician_abortion_minor


In [None]:
len(mpeauf_topic_info)

37

Save

In [None]:
# topic id -> list of (word, score) pairs for every topic in the fitted model
mpeauf_full_topics = mpeauf_tm.get_topics()

# one row per topic, one column per word rank; keep the word, drop its score
topics_df = pd.DataFrame(mpeauf_full_topics,
                         index=['word' + str(rank) for rank in range(1, 11)]).T
topics_df = topics_df.applymap(lambda pair: pair[0])

# fold the ten word columns into a single comma-separated 'topic' column,
# then expose the topic id (the index) as a regular first column
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')
topics_df = topics_df.rename_axis('topic_id').reset_index()

# add count frequency -- taken from the mpeauf model's own topic table
# (this cell previously merged the medauo counts)
topic_ct = mpeauf_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# save
topics_df.to_csv(output_folder + 'mpeauf_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: tag every mpeauf row with its topic id and
# topic name (this cell previously exported the medauo rows and assignments)
topic_id = mpeauf_topic_info[['Topic', 'Name']]
output_df = (
    mpeauf_df
    .assign(topic_id=mpeauf_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)
output_df.to_csv(output_folder + 'mpeauf_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
mpeauf_embed = get_similar_topics(mpeauf_tm)

In [None]:
# per-topic word lists and counts, read back from the topics CSV
mpeauf_topic_df = pd.read_csv(output_folder + 'mpeauf_topics_clean_bbu_rramicus.csv')

# order by cluster label, attach words/counts, keep only the export columns
mpeauf_embed = (
    mpeauf_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(mpeauf_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)
mpeauf_embed.to_csv(output_folder + 'mpeauf_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
# Save under the mpeauf name, matching the other sections; the previous
# 'relauf_bbu_rramicus' target was a copy-paste slip and put this model in
# the relauf slot.
mpeauf_tm.save(model_folder + 'mpeauf_bbu_rramicus')

# Relauo and Relauf

## Relauo

In [None]:
# rows flagged relauo == 1, trimmed to the columns used below
relauo_df = df.loc[df['relauo'] == 1, ['case', 'id', 'brief', 'relauo', 'text']]
print("number of briefs:", relauo_df['id'].nunique())
print("number of paragraphs:", len(relauo_df))

number of briefs: 100
number of paragraphs: 768


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
# drop every whitespace-separated token that appears in rmv_list
relauo_df['text'] = [
    ' '.join(token for token in doc.split() if token not in rmv_list)
    for doc in relauo_df['text']
]

### Initial train + save

Init existing topic model again so we don't over-write existing model.

In [None]:
# init embeddings
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP so the fitted topics are reproducible between runs. Apart from
# random_state, these are meant to be the settings BERTopic uses when no
# umap_model is passed -- NOTE(review): confirm against the installed version.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42, low_memory=False)

# init model
relauo_tm = BERTopic(embedding_model=bbu_ft_embed, min_topic_size=3, umap_model=umap_model,
                     language='english', calculate_probabilities=True, verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the relauo model on the cleaned paragraph text. The first return value
# (relauo_topics) is attached to relauo_df as topic_id when results are saved.
relauo_docs = relauo_df['text']
relauo_topics, relauo_probs = relauo_tm.fit_transform(relauo_docs)

768it [00:35, 21.53it/s]
2022-04-11 18:09:44,530 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:09:50,148 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:09:50,383 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic overview (Topic, Count, Name) of the fitted relauo model;
# display the first five rows.
relauo_topic_info = relauo_tm.get_topic_info()
relauo_topic_info.head()

Unnamed: 0,Topic,Count,Name
0,-1,204,-1_abortion_state_right_life
1,0,55,0_abortion_roe_right_state
2,1,41,1_abortion_health_procedure_statute
3,2,33,2_speech_public_injunction_amendment
4,3,18,3_person_human_fourteenth_persons


In [None]:
# Number of rows in the topic overview (one row per topic id).
relauo_topic_info.shape[0]

59

Save

In [None]:
# --- Per-topic summary -------------------------------------------------------

# All topics of the fitted relauo model, keyed by topic id.
relauo_full_topics = relauo_tm.get_topics()

# One row per topic, one column per ranked entry (word1 ... word10).
# The fixed ten labels require every topic to carry exactly ten entries.
word_cols = [f'word{rank}' for rank in range(1, 11)]
topics_df = pd.DataFrame(relauo_full_topics, index=word_cols).transpose()

# Keep only the first element of each cell (the word itself).
topics_df = topics_df.applymap(lambda entry: entry[0])

# Collapse the word columns into one comma-separated 'topic' string per topic;
# the individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame(name='topic')

# Expose the index (the topic id) as the leading column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach how many rows fall in each topic, then discard the duplicate key.
topic_ct = relauo_topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

topics_df.to_csv(output_folder + 'relauo_topics_clean_bbu_rramicus.csv', index=False)

# --- Classification by paragraph ---------------------------------------------

# Label every relauo paragraph with its assigned topic id and readable name.
topic_id = relauo_topic_info[['Topic', 'Name']]
output_df = (
    relauo_df
    .assign(topic_id=relauo_topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

output_df.to_csv(output_folder + 'relauo_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
# Run the notebook's get_similar_topics helper on the fitted relauo model;
# the returned frame is post-processed and exported in the next cell.
relauo_embed = get_similar_topics(topic_model=relauo_tm)

In [None]:
# Reload the per-topic summary written earlier so it can be joined on topic_id.
relauo_topic_df = pd.read_csv(output_folder + 'relauo_topics_clean_bbu_rramicus.csv')

# Order by label, normalise the key/label column names, attach the topic
# summary, drop the columns that are not exported, and fix the column order.
# NOTE: this rebinds relauo_embed, so the cell expects the frame exactly as
# returned by get_similar_topics (it still has 'Label' / 'Topic' columns).
relauo_embed = (
    relauo_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(relauo_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

# Written with the default index column, as before.
relauo_embed.to_csv(output_folder + 'relauo_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
# Persist the fitted relauo model inside the configured model folder.
relauo_model_path = model_folder + 'relauo_bbu_rramicus'
relauo_tm.save(relauo_model_path)

## Relauf

In [None]:
# Keep only the rows flagged relauf == 1, then narrow to the columns used below.
relauf_df = df[df['relauf'].eq(1)]
relauf_df = relauf_df[['case', 'id', 'brief', 'relauf', 'text']]

# Quick size check: distinct brief ids and total rows (paragraphs).
n_briefs = relauf_df['id'].nunique()
n_paragraphs = len(relauf_df)
print("number of briefs:", n_briefs)
print("number of paragraphs:", n_paragraphs)

number of briefs: 22
number of paragraphs: 170


Remove noise

In [None]:
# Strip unwanted tokens from every paragraph: split on whitespace, keep the
# words that are not in rmv_list, and re-join them with single spaces.
# rmv_list is not defined in this cell; it must already exist from an earlier one.
# NOTE(review): the definition left commented out in this cell was
#   ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#    'abortions', 'clinic', 'clinics']
# -- confirm the rmv_list currently in scope is the intended one.
relauf_df['text'] = relauf_df['text'].apply(
    lambda paragraph: ' '.join(token for token in paragraph.split() if token not in rmv_list)
)

### Initial train + save

Initialize a fresh topic model here so that the previously trained model is not overwritten.

In [None]:
# Document-embedding backend built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# New BERTopic instance dedicated to the relauf subset.
# NOTE(review): no random seed is passed anywhere here, so repeated fits may
# produce different topics -- confirm whether that is acceptable.
relauf_tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the relauf model on the cleaned paragraph text and keep both return
# values (topic assignments and probabilities) for the export steps.
relauf_docs = relauf_df['text']
relauf_topics, relauf_probs = relauf_tm.fit_transform(relauf_docs)

170it [00:09, 18.67it/s]
2022-04-11 18:10:18,330 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:10:22,358 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:10:22,396 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic overview (Topic, Count, Name) of the fitted relauf model;
# display the first five rows.
relauf_topic_info = relauf_tm.get_topic_info()
relauf_topic_info.head()

Unnamed: 0,Topic,Count,Name
0,0,30,0_abortion_religious_church_moral
1,-1,29,-1_abortion_law_health_religious
2,1,16,1_rights_right_free_exercise
3,2,10,2_religious_right_stat_abortion
4,3,10,3_right_state_religious_justice


In [None]:
# Number of rows in the topic overview (one row per topic id).
relauf_topic_info.shape[0]

18

Save

In [None]:
# Export the fitted relauf model as two CSVs: a per-topic summary and a
# per-paragraph classification. Every input below must be a relauf object
# (relauf_tm / relauf_topic_info / relauf_df / relauf_topics); the previous
# version mixed in the medauo objects, which wrote medauo counts and
# classifications into the relauf files.

# full list of topics, keyed by topic id
relauf_full_topics = relauf_tm.get_topics()

# convert full topic dict to df and transpose so that each row is one topic
# (the ten fixed labels require every topic to carry exactly ten entries)
topics_df = pd.DataFrame(relauf_full_topics,
                         index=['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'word10'])\
                         .transpose()

# get just the word (first element of each cell)
topics_df = topics_df.applymap(lambda x: x[0])

# add col w/concatenated list of the topic's words, as the first column
topics_df.insert(0, 'topic', topics_df.apply(', '.join, axis=1))

# remove indiv. word columns (word1,...,word10)
topics_df.drop(list(topics_df.filter(regex = 'word')), axis = 1, inplace = True)

# convert index to a column (this is the topic id)
topics_df.insert(0, 'topic_id', topics_df.index)

# add count frequency from the relauf topic overview
topic_ct = relauf_topic_info[['Topic', 'Count']]
topics_df = topics_df.merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
topics_df.drop('Topic', axis=1, inplace=True)

# save
topics_df.to_csv(output_folder + 'relauf_topics_clean_bbu_rramicus.csv', index=False)

# classification by paragraph: attach each relauf paragraph's topic id and name
topic_id = relauf_topic_info[['Topic', 'Name']]
output_df = relauf_df.copy()
output_df['topic_id'] = relauf_topics
output_df = output_df.merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
output_df.drop('Topic',axis=1,inplace=True)
output_df.rename({'Name' : 'topic_name'},axis=1, inplace=True)
output_df.to_csv(output_folder + 'relauf_topic_clean_classification_bbu_rramicus.csv', index=False)

Next, cluster the topics using hdbscan 

In [None]:
# Run the notebook's get_similar_topics helper on the fitted relauf model;
# the returned frame is post-processed and exported in the next cell.
relauf_embed = get_similar_topics(topic_model=relauf_tm)

In [None]:
# Reload the per-topic summary written earlier so it can be joined on topic_id.
relauf_topic_df = pd.read_csv(output_folder + 'relauf_topics_clean_bbu_rramicus.csv')

# Order by label, normalise the key/label column names, attach the topic
# summary, drop the columns that are not exported, and fix the column order.
# NOTE: this rebinds relauf_embed, so the cell expects the frame exactly as
# returned by get_similar_topics (it still has 'Label' / 'Topic' columns).
relauf_embed = (
    relauf_embed
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(relauf_topic_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

# Written with the default index column, as before.
relauf_embed.to_csv(output_folder + 'relauf_topics_clean_labels_bbu_rramicus.csv')

Save model

In [None]:
# Persist the fitted relauf model inside the configured model folder.
relauf_model_path = model_folder + 'relauf_bbu_rramicus'
relauf_tm.save(relauf_model_path)