In [None]:
!pip install bertopic
!python --version

In [1]:
import os
from bertopic import BERTopic
from tqdm import tqdm
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datetime import datetime

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Boulanger\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Boulanger\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
base_dir = "C:/Users/Boulanger/ownCloud/Langfristvorhaben/Legal-Theory-Graph/Data"
corpus_dir = os.path.join(base_dir, "FULLTEXTS/JLS/jls-txt")
d2y = pd.read_csv('jls-doi-to-year.csv')
stopwords=set(stopwords.words('english'))

In [18]:
# Load the articles
articles = []
timestamps = []
not_found = []
for filename in tqdm(os.listdir(corpus_dir)):
    if filename.endswith(".txt"):
        with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as f:
            article = f.read()
            tmp_article = []
            doi = filename.strip(".txt").replace("_","/")
            year = d2y.loc[d2y['DOI'] == doi, 'year']
            # only use articles for which we have a year
            if not year.empty:
                timestamp = datetime.strptime(str(year.values[0]), '%Y').date()
                timestamps.append(timestamp)
                for word in str(article).split():
                    if word.lower() not in stopwords:
                        tmp_article.append(word)
                articles.append(' '.join(tmp_article))
            else:
                not_found.append(doi)

earliest_year = min(timestamps).strftime('%Y')
latest_year = max(timestamps).strftime('%Y')

print(f"Corpus has {len(articles)} articles from {earliest_year} to {latest_year}. Date information is missing for {len(not_found)} articles.")

100%|██████████| 1651/1651 [00:05<00:00, 280.61it/s]

Corpus has 1136 articles from 1974 to 2021. Date information is missing for  514 articles.





In [16]:
# topics for semi-supervised training
import yaml
data_dir = os.path.join(base_dir, "APPDATA/R/analyses/jls")
with open(os.path.join(data_dir, "topics.yml")) as f:
    topics = yaml.safe_load(f)

topics = list(topics.keys())

In [20]:
model = BERTopic(verbose=True)
topics, probs = model.fit_transform(articles)
topics_over_time = model.topics_over_time(articles, timestamps)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

2023-04-06 15:46:10,198 - BERTopic - Transformed documents to Embeddings
2023-04-06 15:46:13,330 - BERTopic - Reduced dimensionality
2023-04-06 15:46:13,364 - BERTopic - Clustered reduced embeddings
37it [00:48,  1.31s/it]


In [21]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,372,-1_law_legal_social_see
1,0,103,0_law_legal_sociology_social
2,1,81,1_police_court_criminal_guilty
3,2,58,2_family_children_child_marriage
4,3,48,3_law_education_legal_students
5,4,47,4_women_feminist_gender_law
6,5,46,5_aid_legal_social_services
7,6,43,6_law_constitutional_global_transnational
8,7,31,7_ireland_northern_irish_political
9,8,26,8_rights_human_right_social


In [28]:
fig = model.visualize_topics_over_time(topics_over_time)
fig.write_html("topics-over-time.html")

In [27]:
fig = model.visualize_documents(articles)
fig.write_html("document-topics.html")