# Topic Modeling

Using BERTopic

## Set up environment

You'll need to enable a GPU for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install bertopic[flair]
#!pip install sentence_transformers
#!pip install bertopic

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 31.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 69.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/amicus-iv

Mounted at /content/gdrive
/content/gdrive/My Drive/amicus-iv


Saving locations -- change these for different models!

In [3]:
# Destination folders (relative to the working directory) for saved models and CSV outputs.
# Both end with '/' because the save cells append file names by plain string concatenation.
model_folder = 'topic-modeling/models/bert-base-uncased-RRamicus/'
output_folder = 'topic-modeling/output/bert-base-uncased-RRamicus/'

Model checkpoint for generating document embeddings

In [4]:
# Checkpoint identifier handed to TransformerDocumentEmbeddings when each topic model is initialised.
model_checkpoint = 'repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus'

Import packages

In [5]:
import pandas as pd
import numpy as np
from html import unescape
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

#from transformers import AutoTokenizer
#from datasets import load_dataset, load_metric, Dataset

from huggingface_hub import notebook_login

from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings

from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
from typing import List
import hdbscan
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Define similarity function

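We want to group topics based on how similar they are. This is an adaptation of BERTopic's `visualize_topics()` function: instead of drawing a plot, it returns the 2D topic coordinates together with an HDBSCAN cluster label for each topic.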

In [6]:
def get_similar_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650):
    """Embed topics in 2D from their c-TF-IDF vectors and cluster them with HDBSCAN.

    Args:
        topic_model: fitted topic model; this function reads ``get_topic_freq()``,
            ``get_topics()``, ``get_topic()``, ``topic_sizes`` and ``c_tf_idf`` from it.
        topics: explicit topic ids to include (any order). Takes precedence over
            ``top_n_topics``.
        top_n_topics: if given (and ``topics`` is not), use the first ``top_n_topics``
            non-outlier topics as listed by ``get_topic_freq()``
            (assumed to be ordered most-frequent first -- TODO confirm).
        width: unused; kept so existing calls keep working.
        height: unused; kept so existing calls keep working.

    Returns:
        pandas.DataFrame with columns ``x``, ``y``, ``Label``, ``Topic``, ``Words`` and
        ``Size``; one row per selected topic, sorted by topic id, with the outlier
        topic ``-1`` removed.
    """
    # Select topics. A single sorted list is used for every per-topic sequence below so that
    # coordinates, labels, words and sizes can never get out of step with each other.
    if topics is not None:
        topic_list = sorted(topics)
    elif top_n_topics is not None:
        # Skip the outlier topic by value rather than by position.
        ranked = [topic for topic in topic_model.get_topic_freq().Topic.to_list() if topic != -1]
        topic_list = sorted(ranked[:top_n_topics])
    else:
        topic_list = sorted(topic_model.get_topics().keys())

    # Topic sizes and the first ten words of each topic, joined for display.
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:10]]) for topic in topic_list]

    # Seed NumPy's global RNG (note: this affects the whole session, not just this call).
    np.random.seed(11)

    # Pick the c-TF-IDF rows of the selected topics; rows are looked up by each topic's
    # position in the sorted list of all topic ids.
    all_topics = sorted(topic_model.get_topics().keys())
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

    # Cluster the 2D topic coordinates.
    labels = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=3).fit_predict(embeddings)

    df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1], 'Label': labels,
                       "Topic": topic_list, "Words": words, "Size": frequencies})

    # The outlier topic takes part in the embedding and clustering above but is not reported.
    # Filtering by id (instead of dropping the first row) keeps every real topic when the
    # model has no -1 topic or when an explicit selection was passed.
    return df[df["Topic"] != -1].reset_index(drop=True)

## Data

BERTopic function takes a list of documents, so we need to set this up ourselves. 

### Read in text from drive

I have saved a file on Google Drive called "data/amicus_clean_512_no_stride_vars.csv", which contains the result of following the steps of option 1 below. Since this produces the same results each time, we don't need to keep re-running it.

In [7]:
# Load the saved corpus; the path is relative to the working directory.
df = pd.read_csv('data/amicus_clean_512_no_stride_vars.csv')
# Display a single row as a quick check of the available columns.
df.head(1)

Unnamed: 0,case,id,brief,brief_party,forgau,antauo,chcauf,lifauo,govauf,govauo,...,legauo,relauf,relauo,lg_bauf,forgauf,aclauf,aclauo,sgauf,sgauo,text
0,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,abortion battle conflict enumerated right life...


Remove general noise words

In [8]:
# Tokens removed from every paragraph before topic modelling: URL/citation fragments,
# stray letter pairs, court/procedural vocabulary and ordinals.
# 'ninth' is the correct spelling; the original misspelling 'nineth' is kept as well so
# that nothing that used to be removed is now retained.
rmv_list = ['ii', 'https', 'al', 'et', 'per', 'www', 'llp', 'id', 'nos', 'pdf', 'http',
            'ul', 'fi', 'ri', 'sb', 'ql', 'li', 'fs',
            'circuit', 'district', 'supra', 'supp', 'decisis', 'amici', 'curiae', 'court', 'courts', 'supreme', 'appeals',
            'appeal', 'appellants', 'appellant', 'appellee', 'appellees',
            'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'nineth', 'tenth',
            'eleventh', 'twelfth']

# Set membership keeps the per-word check constant-time; matching is exact and
# case-sensitive on whitespace-separated tokens.
noise_words = set(rmv_list)
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in noise_words]))

# Legauo and Legauf

## Legauo

In [None]:
# Keep only the rows flagged legauo == 1, together with the identifying columns and the text.
var_df = df.loc[df['legauo'] == 1, ['case', 'id', 'brief', 'legauo', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 19
number of paragraphs: 158


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [None]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP configuration.
# NOTE(review): this object is not passed to BERTopic below, so the model falls back to its
# own reducer -- confirm whether the seeded one should be used for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

158it [00:12, 12.41it/s]
2022-04-11 18:07:39,463 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:07:42,184 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:07:42,258 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

Create output files

In [None]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the legauo results: topic table, paragraph classification, fitted model, topic clusters.
# The file-name prefix must match the variable analysed in this section.
topics_df.to_csv(f'{output_folder}legauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}legauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}legauo_bbu_rramicus')
# The cluster table is written with its row index as the first CSV column.
embed.to_csv(f'{output_folder}legauo_topics_clean_labels_bbu_rramicus.csv', index=True)

## Legauf

In [None]:
# Keep only the rows flagged legauf == 1, together with the identifying columns and the text.
var_df = df.loc[df['legauf'] == 1, ['case', 'id', 'brief', 'legauf', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 19
number of paragraphs: 158


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [None]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP configuration.
# NOTE(review): this object is not passed to BERTopic below, so the model falls back to its
# own reducer -- confirm whether the seeded one should be used for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

158it [00:12, 12.41it/s]
2022-04-11 18:07:39,463 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:07:42,184 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:07:42,258 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

Create output files

In [None]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the legauf results: topic table, paragraph classification, fitted model, topic clusters.
# The prefix must match the variable analysed in this section. This cell previously wrote to
# the 'medauo_*' names, which are the same files and model path the Medauo section saves to.
topics_df.to_csv(output_folder + 'legauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(output_folder + 'legauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(model_folder + 'legauf_bbu_rramicus')
# Like the other sections, the cluster table is written with its row index.
embed.to_csv(output_folder + 'legauf_topics_clean_labels_bbu_rramicus.csv')

# Medauo and Medauf

## Medauo

In [None]:
# Keep only the rows flagged medauo == 1, together with the identifying columns and the text.
var_df = df.loc[df['medauo'] == 1, ['case', 'id', 'brief', 'medauo', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 19
number of paragraphs: 158


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [None]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP configuration.
# NOTE(review): this object is not passed to BERTopic below, so the model falls back to its
# own reducer -- confirm whether the seeded one should be used for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

158it [00:12, 12.41it/s]
2022-04-11 18:07:39,463 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:07:42,184 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:07:42,258 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

Create output files

In [None]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the medauo results: topic table, paragraph classification, fitted model, topic clusters.
# The file-name prefix must match the variable analysed in this section.
topics_df.to_csv(f'{output_folder}medauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}medauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}medauo_bbu_rramicus')
# The cluster table is written with its row index as the first CSV column.
embed.to_csv(f'{output_folder}medauo_topics_clean_labels_bbu_rramicus.csv', index=True)

## Medauf

In [9]:
# Keep only the rows flagged medauf == 1, together with the identifying columns and the text.
var_df = df.loc[df['medauf'] == 1, ['case', 'id', 'brief', 'medauf', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 92
number of paragraphs: 810


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [10]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP configuration.
# NOTE(review): this object is not passed to BERTopic below, so the model falls back to its
# own reducer -- confirm whether the seeded one should be used for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Downloading:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [11]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

810it [00:37, 21.54it/s]
2022-04-14 18:06:18,351 - BERTopic - Transformed documents to Embeddings
2022-04-14 18:06:28,552 - BERTopic - Reduced dimensionality with UMAP
2022-04-14 18:06:28,773 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [12]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

67

Create output files

In [13]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [14]:
# Persist the medauf results: topic table, paragraph classification, fitted model, topic clusters.
# The file-name prefix must match the variable analysed in this section.
topics_df.to_csv(f'{output_folder}medauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}medauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}medauf_bbu_rramicus')
# The cluster table is written with its row index as the first CSV column.
embed.to_csv(f'{output_folder}medauf_topics_clean_labels_bbu_rramicus.csv', index=True)

# Mpeauo and Mpeauf

## Mpeauo

In [None]:
# Keep only the rows flagged mpeauo == 1, together with the identifying columns and the text.
var_df = df.loc[df['mpeauo'] == 1, ['case', 'id', 'brief', 'mpeauo', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 19
number of paragraphs: 158


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [None]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Seeded UMAP configuration.
# NOTE(review): this object is not passed to BERTopic below, so the model falls back to its
# own reducer -- confirm whether the seeded one should be used for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

158it [00:12, 12.41it/s]
2022-04-11 18:07:39,463 - BERTopic - Transformed documents to Embeddings
2022-04-11 18:07:42,184 - BERTopic - Reduced dimensionality with UMAP
2022-04-11 18:07:42,258 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

Create output files

In [None]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the mpeauo results: topic table, paragraph classification, fitted model, topic clusters.
# The file-name prefix must match the variable analysed in this section.
topics_df.to_csv(f'{output_folder}mpeauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}mpeauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}mpeauo_bbu_rramicus')
# The cluster table is written with its row index as the first CSV column.
embed.to_csv(f'{output_folder}mpeauo_topics_clean_labels_bbu_rramicus.csv', index=True)

## Mpeauf

In [15]:
# Keep only the rows flagged mpeauf == 1, together with the identifying columns and the text.
var_df = df.loc[df['mpeauf'] == 1, ['case', 'id', 'brief', 'mpeauf', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 53
number of paragraphs: 459


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [16]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Custom UMAP reducer (currently disabled; BERTopic uses its own reducer instead):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [17]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

459it [00:22, 20.60it/s]
2022-04-14 18:07:33,189 - BERTopic - Transformed documents to Embeddings
2022-04-14 18:07:36,810 - BERTopic - Reduced dimensionality with UMAP
2022-04-14 18:07:36,888 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [18]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

40

Create output files

In [19]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [20]:
# Persist the mpeauf results: topic table, paragraph classification, fitted model, topic clusters.
# The file-name prefix must match the variable analysed in this section.
topics_df.to_csv(f'{output_folder}mpeauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}mpeauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}mpeauf_bbu_rramicus')
# The cluster table is written with its row index as the first CSV column.
embed.to_csv(f'{output_folder}mpeauf_topics_clean_labels_bbu_rramicus.csv', index=True)

# Relauo and Relauf

## Relauo

In [None]:
# Keep only the rows flagged relauo == 1, together with the identifying columns and the text.
var_df = df.loc[df['relauo'] == 1, ['case', 'id', 'brief', 'relauo', 'text']]
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", var_df.shape[0])

number of briefs: 100
number of paragraphs: 768


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialise a new topic model here, so that fitting does not modify a model trained earlier.

In [None]:
# Document embedder built from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Custom UMAP reducer (currently disabled; BERTopic uses its own reducer instead):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New, unfitted topic model (min_topic_size=3, probabilities computed, verbose logging).
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on this subset's text; `topics` is later stored as one topic id
# per row of var_df, `probs` is not used again in the cells shown here.
topics, probs = tm.fit_transform(var_df['text'])

459it [00:32, 14.32it/s]
2022-04-14 15:29:33,525 - BERTopic - Transformed documents to Embeddings
2022-04-14 15:29:41,884 - BERTopic - Reduced dimensionality with UMAP
2022-04-14 15:29:42,005 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; the cell displays its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

31

Create output files

In [None]:
# Topic id -> ranked (word, score) pairs for every topic of the fitted model.
full_topics = tm.get_topics()

# One row per topic and one column per rank; keep each word and discard its score.
word_cols = ['word{}'.format(rank) for rank in range(1, 11)]
ranked_words = pd.DataFrame(full_topics, index=word_cols).T.applymap(lambda pair: pair[0])

# Collapse the ten words into one comma-separated description and expose the topic id as a column.
topics_df = ranked_words.apply(', '.join, axis=1).to_frame('topic')
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: every row of var_df gets its topic id and the topic's name.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Cluster the topics themselves, then order by cluster label and add the topic descriptions.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the three result tables and the fitted model.
# The dataset prefix is hard-coded in all four names below -- change every one
# of them when reusing this cell for a different subset.
topics_csv = output_folder + 'relauo_topics_clean_bbu_rramicus.csv'
classification_csv = output_folder + 'relauo_topic_clean_classification_bbu_rramicus.csv'
model_path = model_folder + 'relauo_bbu_rramicus'
labels_csv = output_folder + 'relauo_topics_clean_labels_bbu_rramicus.csv'

topics_df.to_csv(topics_csv, index=False)
output_df.to_csv(classification_csv, index=False)
tm.save(model_path)
# NOTE(review): unlike the two CSVs above, this file is written WITH the row
# index -- confirm that is intended before changing it.
embed.to_csv(labels_csv)

## Relauf

In [None]:
# Rows flagged relauf == 1, restricted to the columns kept for this section.
subset_cols = ['case', 'id', 'brief', 'relauf', 'text']
var_df = df.loc[df['relauf'] == 1, subset_cols]

# Size of the subset: distinct brief ids and total rows (paragraphs).
print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 22
number of paragraphs: 170


Remove noise

In [None]:
# Disabled step: drop every whitespace-separated token that appears in the
# hand-picked `rmv_list` from each paragraph's text in `var_df`. It is
# commented out, so the text is passed to the model unchanged.
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a fresh topic model here so that fitting it does not overwrite a previously trained model.

In [None]:
# Document-embedding backend built from the checkpoint named in `model_checkpoint`.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Custom UMAP configuration -- currently disabled, so no `umap_model` is
# handed to BERTopic below.
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# Fresh, unfitted topic model that uses the embedding backend above.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    #umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# Fit the topic model on the `text` column of the current subset and keep
# both return values: the per-document topic assignments and probabilities.
paragraph_texts = var_df['text']
topics, probs = tm.fit_transform(paragraph_texts)

459it [00:32, 14.32it/s]
2022-04-14 15:29:33,525 - BERTopic - Transformed documents to Embeddings
2022-04-14 15:29:41,884 - BERTopic - Reduced dimensionality with UMAP
2022-04-14 15:29:42,005 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Summary table of the fitted topics; the cell output is its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

31

Create output files

In [None]:
# ---------------------------------------------------------------------------
# 1) Topic table: one row per topic id with its words joined into one string.
# ---------------------------------------------------------------------------
# `tm.get_topics()` is treated as a mapping of topic id -> sequence of entries
# whose first element is the word (the original code read `x[0]` from each).
full_topics = tm.get_topics()

# Build the table straight from the mapping. This avoids the fixed
# word1..word10 layout, so it works for any number of words per topic, and it
# needs no element-wise `applymap` pass or temporary word columns.
topics_df = pd.DataFrame(
    [
        (topic_num, ', '.join(entry[0] for entry in word_entries))
        for topic_num, word_entries in full_topics.items()
    ],
    columns=['topic_id', 'topic'],
)

# Attach the `Count` column reported by `topic_info` for each topic id.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# ---------------------------------------------------------------------------
# 2) Paragraph-level classification: the input rows plus topic id and name.
# ---------------------------------------------------------------------------
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df
    .assign(topic_id=topics)  # `topics` is positionally aligned with var_df's rows
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# ---------------------------------------------------------------------------
# 3) Topic-level grouping produced by the notebook helper `get_similar_topics`.
# ---------------------------------------------------------------------------
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
)

# Keep only the columns that go into the saved labels file; any other columns
# returned by the helper (e.g. 'Words', 'Size') are discarded by this selection.
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]

Save results

In [None]:
# Persist the three result tables and the fitted model.
# The dataset prefix is hard-coded in all four names below -- change every one
# of them when reusing this cell for a different subset.
topics_csv = output_folder + 'relauf_topics_clean_bbu_rramicus.csv'
classification_csv = output_folder + 'relauf_topic_clean_classification_bbu_rramicus.csv'
model_path = model_folder + 'relauf_bbu_rramicus'
labels_csv = output_folder + 'relauf_topics_clean_labels_bbu_rramicus.csv'

topics_df.to_csv(topics_csv, index=False)
output_df.to_csv(classification_csv, index=False)
tm.save(model_path)
# NOTE(review): unlike the two CSVs above, this file is written WITH the row
# index -- confirm that is intended before changing it.
embed.to_csv(labels_csv)