# Topic Modeling

Using BERTopic

## Set up environment

You'll need to enable a GPU for this notebook:

- Navigate to Edit → Notebook settings
- Select GPU from the Hardware accelerator drop-down

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install bertopic[flair]
#!pip install sentence_transformers
#!pip install bertopic

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/amicus-iv

Mounted at /content/gdrive
/content/gdrive/My Drive/amicus-iv


Saving locations -- change these for different models!

In [None]:
# Destinations (relative to the working directory) for this run's artefacts.
# Both end in '/' because later cells append file names by string concatenation.
output_folder = 'topic-modeling/output/bert-base-uncased-RRamicus/'  # CSV outputs
model_folder = 'topic-modeling/models/bert-base-uncased-RRamicus/'   # saved topic models

Model checkpoint for generating document embeddings

In [None]:
# Checkpoint identifier handed to TransformerDocumentEmbeddings in every
# section below to embed the documents.
model_checkpoint = 'repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus'

Import packages

In [None]:
import pandas as pd
import numpy as np
from html import unescape
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

#from transformers import AutoTokenizer
#from datasets import load_dataset, load_metric, Dataset

from huggingface_hub import notebook_login

from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings

from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
from typing import List
import hdbscan
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Define similarity function

We want to group topics based on how similar they are. This is an adaptation of BERTopic's `visualize_topics()` function that returns the underlying table instead of drawing a figure.

In [None]:
def get_similar_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650):
    """Embed topics in 2-D and group similar topics with HDBSCAN.

    The c-TF-IDF rows of the selected topics are min-max scaled, reduced to two
    dimensions with UMAP and clustered with HDBSCAN. Nothing is plotted; the
    table of coordinates and cluster labels is returned instead.

    Args:
        topic_model: Fitted topic model exposing ``get_topics()``,
            ``get_topic()``, ``get_topic_freq()``, ``topic_sizes`` and
            ``c_tf_idf``.
        topics: Explicit topic ids to use. Takes precedence over
            ``top_n_topics``. The order given does not matter.
        top_n_topics: If ``topics`` is None, use this many topics from the head
            of ``get_topic_freq()``, skipping the outlier topic -1.
        width: Unused; kept so the signature matches ``visualize_topics``.
        height: Unused; kept so the signature matches ``visualize_topics``.

    Returns:
        pandas.DataFrame with columns ``x``, ``y``, ``Label``, ``Topic``,
        ``Words`` and ``Size``: one row per selected topic, sorted by topic id,
        excluding the outlier topic -1.
    """
    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        # Skip the outlier topic explicitly instead of assuming it is the
        # first entry of the frequency table.
        ranked = topic_model.get_topic_freq().Topic.to_list()
        topics = [topic for topic in ranked if topic != -1][:top_n_topics]
    else:
        topics = list(topic_model.get_topics().keys())

    # Extract topic words and their frequencies (all in sorted topic order)
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:10]]) for topic in topic_list]

    # Seed NumPy's global RNG (note: this affects the caller's RNG state too)
    np.random.seed(11)

    # Embed c-TF-IDF into 2D. Rows are picked in the same sorted order as
    # topic_list so that every embedding lines up with its Topic/Words/Size.
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topic_list])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

    # Cluster the 2-D topic embeddings
    labels = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=3).fit_predict(embeddings)

    # Assemble the result. The outlier topic (-1) takes part in the projection
    # and clustering above but is left out of the returned table; it is removed
    # by id rather than by position so no real topic is dropped when -1 is absent.
    result = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1], 'Label': labels,
                           "Topic": topic_list, "Words": words, "Size": frequencies})
    return result[result["Topic"] != -1].reset_index(drop=True)

## Data

BERTopic's `fit_transform` takes a list of documents, so we need to build that list ourselves.

### Read in text from drive

I have saved a file on Google Drive called "data/amicus_clean_512_no_stride_vars.csv", which contains the result of following the steps of option 1 below. Since this produces the same results each time, we don't need to keep re-running it.

In [None]:
# Load the pre-processed amicus-brief text (path is relative to the working
# directory) and preview the first row.
df = pd.read_csv('data/amicus_clean_512_no_stride_vars.csv')
df.iloc[:1]

Unnamed: 0,case,id,brief,brief_party,forgau,antauo,chcauf,lifauo,govauf,govauo,...,legauo,relauf,relauo,lg_bauf,forgauf,aclauf,aclauo,sgauf,sgauo,text
0,Rust v Sullivan,861819857503,"Rust v Sullivan. Amici Brief for Respondent, b...",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,abortion battle conflict enumerated right life...


Remove general noise words

In [None]:
#df_clean = df.copy()
#df_clean['text'] = df_clean['text'].str.replace('[{}]'.format(string.punctuation), '')

# Tokens dropped from every document: URL/citation debris, stray letter pairs,
# court-related boilerplate and ordinals.
# 'ninth' is the correct spelling; the misspelt 'nineth' is kept so that
# anything it matched before is still removed.
rmv_list = ['ii', 'https', 'al', 'et', 'per', 'www', 'llp', 'id', 'nos', 'pdf', 'http',
            'ul', 'fi', 'ri', 'sb', 'ql', 'li', 'fs',
            'circuit', 'district', 'supra', 'supp', 'decisis', 'amici', 'curiae', 'court', 'courts', 'supreme', 'appeals',
            'appeal', 'appellants', 'appellant', 'appellee', 'appellees',
            'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'nineth', 'tenth', 'eleventh', 'twelfth']

# A set gives constant-time membership tests for the per-token filter below.
noise_words = set(rmv_list)

# Rebuild each document from its whitespace-separated tokens, minus the noise words.
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in noise_words]))

# Govauf and Govauo

## Govauo

In [None]:
# Rows of df whose govauo flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['govauo'] == 1, ['case', 'id', 'brief', 'govauo', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 65
number of paragraphs: 503


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Optional custom UMAP (disabled here, so no umap_model is passed to BERTopic):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    # umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Downloading:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

503it [00:23, 21.79it/s]
2022-04-15 13:54:57,831 - BERTopic - Transformed documents to Embeddings
2022-04-15 13:55:07,602 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 13:55:07,703 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

52

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'govauo' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}govauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}govauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}govauo_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}govauo_topics_clean_labels_bbu_rramicus.csv')

## Govauf

In [None]:
# Rows of df whose govauf flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['govauf'] == 1, ['case', 'id', 'brief', 'govauf', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 48
number of paragraphs: 345


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Optional custom UMAP (disabled here, so no umap_model is passed to BERTopic):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    # umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

345it [00:17, 19.62it/s]
2022-04-15 13:56:29,422 - BERTopic - Transformed documents to Embeddings
2022-04-15 13:56:33,113 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 13:56:33,165 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

28

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'govauf' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}govauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}govauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}govauf_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}govauf_topics_clean_labels_bbu_rramicus.csv')

# Racauo and Racauf

## Racauo

In [None]:
# Rows of df whose racauo flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['racauo'] == 1, ['case', 'id', 'brief', 'racauo', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 5
number of paragraphs: 28


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Optional custom UMAP (disabled here, so no umap_model is passed to BERTopic):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    # umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

28it [00:01, 14.08it/s]
2022-04-15 14:30:47,447 - BERTopic - Transformed documents to Embeddings
2022-04-15 14:30:52,339 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 14:30:52,352 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

4

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'racauo' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}racauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}racauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}racauo_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}racauo_topics_clean_labels_bbu_rramicus.csv')

## Racauf

In [None]:
# Rows of df whose racauf flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['racauf'] == 1, ['case', 'id', 'brief', 'racauf', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 11
number of paragraphs: 91


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Optional custom UMAP (disabled here, so no umap_model is passed to BERTopic):
#umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0,
#                  metric='cosine', random_state=42, low_memory=False)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    # umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

91it [00:05, 16.35it/s]
2022-04-15 13:59:40,549 - BERTopic - Transformed documents to Embeddings
2022-04-15 13:59:43,208 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 13:59:43,226 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

6

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'racauf' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}racauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}racauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}racauf_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}racauf_topics_clean_labels_bbu_rramicus.csv')

# LGBauf

In [None]:
# Rows of df whose lg_bauf flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['lg_bauf'] == 1, ['case', 'id', 'brief', 'lg_bauf', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 2
number of paragraphs: 20


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# NOTE(review): this UMAP instance is built but never handed to BERTopic below
# (the umap_model argument is commented out) -- confirm which was intended.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    # umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

20it [00:01, 11.87it/s]
2022-04-15 14:57:43,397 - BERTopic - Transformed documents to Embeddings
2022-04-15 14:57:48,823 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 14:57:48,838 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

3

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# The topic-clustering step is disabled for this section by wrapping it in a
# string literal; as the cell's last expression, the string is what the cell
# displays.
# NOTE(review): presumably disabled because this subset yields too few topics
# for get_similar_topics -- confirm.
'''# cluster resulting topics using hdbscan
embed = get_similar_topics(tm)

# create output
embed = embed.sort_values('Label')
embed.rename({'Topic':'topic_id', 'Label':'label'}, axis=1, inplace=True)
embed = embed.merge(topics_df, how='left', on = 'topic_id')
embed.drop(['Words', 'Size'], axis=1, inplace=True)
embed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]'''

"# cluster resulting topics using hdbscan\nembed = get_similar_topics(tm)\n\n# create output\nembed = embed.sort_values('Label')\nembed.rename({'Topic':'topic_id', 'Label':'label'}, axis=1, inplace=True)\nembed = embed.merge(topics_df, how='left', on = 'topic_id')\nembed.drop(['Words', 'Size'], axis=1, inplace=True)\nembed = embed[['topic_id', 'label', 'topic', 'Count', 'x', 'y']]"

Save results

In [None]:
# Persist this section's results. The 'lg_bauf' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}lg_bauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}lg_bauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}lg_bauf_bbu_rramicus')
# The cluster-label table is not written here: the step that builds `embed`
# is disabled in the cell above.
#embed.to_csv(output_folder + 'lg_bauf_topics_clean_labels_bbu_rramicus.csv')

# Sg, Sgauf & Sgauo

## Sgauf

In [None]:
# Rows of df whose sgauf flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['sgauf'] == 1, ['case', 'id', 'brief', 'sgauf', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 7
number of paragraphs: 54


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Custom UMAP with a fixed random_state; passed to BERTopic below.
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

54it [00:03, 17.03it/s]
2022-04-15 14:01:32,973 - BERTopic - Transformed documents to Embeddings
2022-04-15 14:01:33,977 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 14:01:33,994 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

8

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'sgauf' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}sgauf_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}sgauf_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}sgauf_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}sgauf_topics_clean_labels_bbu_rramicus.csv')

## Sgauo

In [None]:
# Rows of df whose sgauo flag equals 1, restricted to the identifying
# columns, the flag itself and the text.
var_df = df.loc[df['sgauo'] == 1, ['case', 'id', 'brief', 'sgauo', 'text']]

print("number of briefs:", var_df['id'].nunique())
print("number of paragraphs:", len(var_df))

number of briefs: 11
number of paragraphs: 96


Remove noise

In [None]:
#rmv_list = ['abortion', 'women', 'health', 'respectfully', 'conclusion', 'counsel', 'state',
#            'abortions', 'clinic', 'clinics']
#var_df['text'] = var_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (rmv_list)]))

### Initial train + save

Initialize a new topic model so that we don't overwrite an existing one.

In [None]:
# Document embeddings come from the checkpoint named in model_checkpoint.
bbu_ft_embed = TransformerDocumentEmbeddings(model_checkpoint)

# Custom UMAP with a fixed random_state; passed to BERTopic below. This
# section uses n_neighbors=2 and n_components=2.
umap_model = UMAP(
    n_neighbors=2,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    low_memory=False,
)

# New topic model for this subset of documents.
tm = BERTopic(
    embedding_model=bbu_ft_embed,
    min_topic_size=3,
    umap_model=umap_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

Some weights of BertModel were not initialized from the model checkpoint at repro-rights-amicus-briefs/bert-base-uncased-finetuned-RRamicus and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fit the model on only the docs of interest (5 min)

In [None]:
# fit model
# fit_transform is documented to take a list of strings. var_df['text'] is a
# Series that still carries the filtered frame's non-contiguous index, so hand
# over a plain list to keep every step purely positional.
topics, probs = tm.fit_transform(var_df['text'].tolist())

96it [00:06, 15.37it/s]
2022-04-15 14:55:06,267 - BERTopic - Transformed documents to Embeddings
2022-04-15 14:55:08,363 - BERTopic - Reduced dimensionality with UMAP
2022-04-15 14:55:08,398 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
# Per-topic summary table from the fitted model; display its number of rows.
topic_info = tm.get_topic_info()
topic_info.shape[0]

15

Create output files

In [None]:
# Topic representations from the fitted model, keyed by topic id; the first
# element of each entry is the word.
full_topics = tm.get_topics()

# One row per topic with its ten ranked entries, reduced to just the word.
topics_df = (
    pd.DataFrame(full_topics, index=[f'word{rank}' for rank in range(1, 11)])
    .transpose()
    .applymap(lambda pair: pair[0])
)

# Join the ten words into a single comma-separated 'topic' string; the
# individual word columns are not carried forward.
topics_df = topics_df.apply(', '.join, axis=1).to_frame('topic')

# Expose the index (the topic id) as the first column.
topics_df.insert(0, 'topic_id', topics_df.index)

# Attach each topic's Count from topic_info.
topic_ct = topic_info[['Topic', 'Count']]
topics_df = (
    topics_df
    .merge(topic_ct, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
)

# Classification by paragraph: one row per input document with its assigned
# topic id and that topic's Name from topic_info.
topic_id = topic_info[['Topic', 'Name']]
output_df = (
    var_df.assign(topic_id=topics)
    .merge(topic_id, how='left', left_on='topic_id', right_on='Topic')
    .drop(columns='Topic')
    .rename(columns={'Name': 'topic_name'})
)

# Group similar topics (see get_similar_topics), order them by cluster label
# and add the topic string and count for each.
embed = (
    get_similar_topics(tm)
    .sort_values('Label')
    .rename(columns={'Topic': 'topic_id', 'Label': 'label'})
    .merge(topics_df, how='left', on='topic_id')
    .drop(columns=['Words', 'Size'])
    .loc[:, ['topic_id', 'label', 'topic', 'Count', 'x', 'y']]
)

Save results

In [None]:
# Persist this section's results. The 'sgauo' prefix in each file name must be
# changed when this cell is copied to another section.
topics_df.to_csv(f'{output_folder}sgauo_topics_clean_bbu_rramicus.csv', index=False)
output_df.to_csv(f'{output_folder}sgauo_topic_clean_classification_bbu_rramicus.csv', index=False)
tm.save(f'{model_folder}sgauo_bbu_rramicus')
# NOTE(review): unlike the two CSVs above, this one is written with its index
# column (no index=False) -- confirm that is intended.
embed.to_csv(f'{output_folder}sgauo_topics_clean_labels_bbu_rramicus.csv')