In [1]:
import os
import pandas as pd
import re
from bertopic import BERTopic
from nltk.tokenize import sent_tokenize

In [7]:
def chunk_document(text, max_chunk_size=1024):
    """Split ``text`` into chunks made of whole sentences.

    Sentences are packed greedily: a chunk is closed as soon as adding the
    next sentence would push it past ``max_chunk_size`` whitespace-separated
    words.

    Args:
        text: The document to split. ``None``, empty, or whitespace-only
            input yields an empty list.
        max_chunk_size: Maximum number of whitespace-separated words per
            chunk. A single sentence longer than this is never split; it is
            emitted as its own (oversized) chunk.

    Returns:
        A list of non-empty chunk strings, in document order. Sentences
        inside a chunk are joined with a single space.
    """
    if not text:
        return []

    # A sentence boundary is any whitespace run that follows '.', '!' or '?'.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current_chunk = [], []
    current_length = 0

    for sentence in sentences:
        token_count = len(sentence.split())
        if token_count == 0:
            # Empty pieces (e.g. after trailing whitespace) carry no content
            # and would otherwise add dangling spaces or empty chunks.
            continue

        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would emit a spurious empty chunk.
        if current_chunk and current_length + token_count > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [], 0

        current_chunk.append(sentence)
        current_length += token_count

    # Append the last, partially filled chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [8]:
DIR = '../TextExtractor/files'


def _read_text(path):
    """Return the UTF-8 decoded contents of ``path``, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `file_contents` (and therefore of `docs`) stable from run to run.
txt_filenames = sorted(name for name in os.listdir(DIR) if name.endswith('.txt'))
file_contents = [_read_text(os.path.join(DIR, name)) for name in txt_filenames]

# Flatten the per-file chunk lists into one list of chunks across all files.
docs = [chunk for content in file_contents for chunk in chunk_document(content)]


In [9]:
# Total number of chunks collected across all input files.
len(docs)

2407

In [10]:
# Inspect the first chunk as a sanity check on the chunking step.
docs[0]

"FEDERAL ENERGY REGULATORY COMMISSION WASHINGTON, DC 20426 August 8, 2019 OFFICE OF ENERGY PROJECTS Project No. 3273-024New York Chittenden Falls Hydropower Project Chittenden Falls Hydropower, Inc. Via Electronic Mail Mark Boumansour Gravity Renewables, Inc. mark@gravityrenewables.com Celeste N. Fay Gravity Renewables, Inc. celeste@gravityrenewables.com Reference: Deficiency of License Application and Additional Information Request Dear Mr. Boumansour or Ms. Fay: Your application for a new license for the Chittenden Falls Hydropower Project (Chittenden Falls Project or project) filed on May 31, 2019, fails to conform to the requirements of the Commission's regulations. A list of deficiencies is enclosed as Schedule A. Under 18 C.F.R. section 16.9(b)(2) of the Commissions regulations, you have 90 days from the date of this letter to correct the deficiencies in the application. Additionally, based on staffs review of the application, additional information is needed before we can conduc

In [19]:
from sentence_transformers import SentenceTransformer

# NOTE(review): the recorded output of this cell reports that
# 'facebook/bart-base' is not a packaged sentence-transformers model, so the
# library wraps it with mean pooling. Confirm that this is the intended
# embedding model.
embedding_model = SentenceTransformer('facebook/bart-base')

# Embed every chunk once up front; the resulting array is handed to BERTopic
# later instead of letting it compute embeddings itself.
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
    normalize_embeddings=True,
)

No sentence-transformers model found with name facebook/bart-base. Creating a new one with mean pooling.


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

In [91]:
from bertopic import BERTopic
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Components are declared in the order the pipeline applies them:
# dimensionality reduction -> clustering -> tokenisation -> c-TF-IDF weighting.

# Fixed random_state so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=10,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# English stop words are dropped; unigrams and bigrams are both considered.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=3,
    ngram_range=(1, 2),
)

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
)

# The precomputed `embeddings` are passed in alongside the raw chunks.
topics, probs = topic_model.fit_transform(docs, embeddings)

In [92]:
# Per-topic summary table: topic id, chunk count, generated name, top terms
# and representative chunks.
# NOTE(review): in BERTopic, topic -1 is the outlier bucket; verify against the docs.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,372,-1_deficiencies_requested_electronic_contact,"[deficiencies, requested, electronic, contact,...",[FEDERAL ENERGY REGULATORY COMMISSION WASHINGT...
1,0,442,0_effects_species_flow_potential,"[effects, species, flow, potential, fish, meas...",[Section 4.51(e)(4)(ii) and (iii) of our regul...
2,1,239,1_shpo_historic_resolve_hpmp,"[shpo, historic, resolve, hpmp, properties, to...",[Section 4.81(d)(1) require that the map(s) sh...
3,2,102,2_extent possible_project features_dimensions_...,"[extent possible, project features, dimensions...",[FEDERAL ENERGY REGULATORY COMMISSION Washingt...
4,3,73,3_existing andor_andor proposed_address teleph...,"[existing andor, andor proposed, address telep...",[FEDERAL ENERGY REGULATORY COMMISSION Washingt...
...,...,...,...,...,...
90,89,6,89_eap_noncompliance_312_drill,"[eap, noncompliance, 312, drill, 2008 letter, ...",[FEDERAL ENERGY REGULATORY COMMISSION Office o...
91,90,6,90_study plans_apea_pad_preapplication,"[study plans, apea, pad, preapplication, study...",[FEDERAL ENERGY REGULATORY COMMISSION WASHINGT...
92,91,6,91_means_sec_governing_small hydroelectric,"[means, sec, governing, small hydroelectric, p...","[(2) The exact name, business address, and tel..."
93,92,5,92_jersey_missed_cape_energy project,"[jersey, missed, cape, energy project, natural...",[FEDERAL ENERGY REGULATORY COMMISSION Washingt...


In [93]:
# Render BERTopic's bar-chart view of the fitted topics.
topic_model.visualize_barchart()

In [75]:
# Render BERTopic's topic-overview visualisation.
# NOTE(review): this cell's execution count (75) predates the fit cell (91),
# so any saved output may come from an older model; re-run after fitting.
topic_model.visualize_topics()