In [1]:
import os
import fitz # PyMuPDF
import nltk
import string
import sklearn
from umap import UMAP
from bertopic import BERTopic
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer

In [2]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


print(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /home/birdjj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/birdjj/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/birdjj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

'''
May be beneficial if team wants all authors completed
'''
def find_files_in_directory(folder_path):
    print(folder_path)
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    return files

# Call the function to get a list of file names
file_names = find_files_in_directory('aekowals')
print(len(file_names))

aekowals


FileNotFoundError: [Errno 2] No such file or directory: 'aekowals'

In [4]:
#function reads PDF files and turns text to lowercase
def read_pdf(file_path):
    print(file_path)
    text = ""
    with fitz.open(file_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += page.get_text("text").lower()
            
    tokens = word_tokenize(text)            
    return text

In [5]:
def preprocess_text(text):

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))    
    processed = []

    for token in word_tokenize(text):
        token = token.lower()
        lem_token = lemmatizer.lemmatize(token)
        
        if len(lem_token) > 4 and lem_token.isalpha() and lem_token not in string.punctuation and lem_token not in stop_words:
            processed.append(lem_token)

    #print(processed)

    return " ".join(processed)

In [89]:
def create_docs(file_names, dir_path):
    print(type(file_names))
    
    docs = []
    for file in file_names:
        #print('file', file)
        try:
            text = read_pdf(os.path.join(dir_path, file))
            #print('text',text)            
            preprocessed_text = preprocess_text(text)
            docs.append(preprocessed_text)            
        except Exception as e:
            print(e)


            
    return docs

In [170]:
def BERTmodel(docs):
    num_docs = len(docs)
    print(f"Number of documents: {num_docs}")

    if num_docs < 2:
        print("Not enough documents for topic modeling.")
        return ["No topics"], [None]

    # Generate embeddings
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = sentence_model.encode(docs)

    print(f"Embedding shape: {embeddings.shape}")

    if num_docs <= 5:
        # Small dataset logic: Skip UMAP
        print("Fewer than or equal to 5 documents - skipping UMAP.")
        model = BERTopic()
        topics, probs = model.fit_transform(docs)
        print("Topics:", topics)  # Print topics here
        return topics, probs

    # For larger datasets, configure UMAP
    from umap import UMAP

    n_neighbors = min(15, max(2, num_docs - 1))  # Ensure n_neighbors < num_docs
    n_components = min(5, num_docs - 1)  # Ensure n_components < num_docs

    print(f"UMAP configured with n_neighbors={n_neighbors}, n_components={n_components}")
    umap_model = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=42)

    # Fit BERTopic with UMAP
    model = BERTopic(umap_model=umap_model, top_n_words=10)
    topics, probs = model.fit_transform(docs)
    print("Topics:", topics)  # Print topics here
    return topics, probs

In [177]:

general = r'/home/birdjj/expert_field_project/full_pdfs_by_author/'


dir_path = r'/home/birdjj/expert_field_project/full_pdfs_by_author/revathy'

directory = find_files_in_directory(dir_path)

print('dir',directory)

#docs = create_docs(directory, dir_path)

print('documents', docs)






/home/birdjj/expert_field_project/full_pdfs_by_author/revathy
dir ['00461520.2018.1432362.pdf', '978-981-15-8530-2_19.pdf', '0022487112466899.pdf', '00461520.2018.1432361?needAccess=true.pdf', '0002831217738508.pdf', 'xml.pdf', 'pmc4586172?pdf=render.pdf', 'pmc3706520?pdf=render.pdf', 'DownloadRepFile.aspx?docID=1439679&Version=1&fileExtension=.pdf', '11121_2004_Article_373225.pdf', '00461520.2023.2250879?needAccess=true.pdf']


In [178]:
num_docs = len(docs)
print(f"Number of documents: {num_docs}")

    # Generate embeddings
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sentence_model.encode(docs)

print(f"Embedding shape: {embeddings.shape}")

if num_docs <= 5:
# Small dataset logic: Skip UMAP
    print("Fewer than or equal to 5 documents - skipping UMAP.")
    model = BERTopic()
    topics, probs = model.fit_transform(docs)

    # For larger datasets, configure UMAP
from umap import UMAP

n_neighbors = min(15, max(2, num_docs - 1))  # Ensure n_neighbors < num_docs
n_components = min(5, num_docs - 1)  # Ensure n_components < num_docs

print(f"UMAP configured with n_neighbors={n_neighbors}, n_components={n_components}")
umap_model = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=42)

    # Fit BERTopic with UMAP
model = BERTopic(umap_model=umap_model, top_n_words=10)
topics, probs = model.fit_transform(docs)

Number of documents: 18


SyntaxError: 'return' outside function (3468558351.py, line 6)

In [174]:
from collections import defaultdict
general = r'/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/'

docs_dict = defaultdict(list)
processed_docs = defaultdict(list)
list_dirs = []
expert_fields = {}

for author in os.listdir(general):
    #print(f'-----{author}------')
    if author not in docs_dict:
        docs_dict[author] = []
    dir_path = fr'/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/{author}'
    list_dirs.append(dir_path)
    #print(dir_path)
    #print(dir_path)
    for file in os.listdir(dir_path):
        docs_dict[author].append(file)

'''for path in list_dirs:
    print(path)'''

for (author, files), dirs in zip(docs_dict.items(), list_dirs):
    #print(author, dirs
    if author in dirs:
        #print(f'Processing Documents for {author}')
        docs = create_docs(files, dirs)
        if len(docs)<10:
            continue
        else:
            print(f'Running BERT on {author}')
            topics, probs = BERTmodel(docs)
            print(model.get_topic_info())
                            
    

#print(len(processed_docs.values()))

<class 'list'>
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/jdos/xml.pdf'.
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/jdos/589297.pdf'.
<class 'list'>
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/minjoon/do.php?a=current&b=11&bidx=576&aidx=7125.pdf'.
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/minjoon/fulltext.html.pdf'.
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/minjoon/xml.pdf'.
<class 'list'>
Failed to open file '/nfs/turbo/si-acastel/expert_field_project/full_pdfs_by_author/revathy/xml.pdf'.
Running BERT on revathy
Number of documents: 10
Embedding shape: (10, 384)
UMAP configured with n_neighbors=9, n_components=5
Topics: [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
   Topic  Count                                   Name  \
0     -1     30  -1_health_individual_insurance_reform   

       

KeyboardInterrupt: 