In [1]:
# Import required packages
import pickle
import re
import numpy as np
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import os


In [2]:
# Open and load the pickle files
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a pickle file that comes from a trusted source.
pickle_file_path = 'file_contents_pdf.pkl'
with open(pickle_file_path, 'rb') as file:
    # Presumably a dict mapping contract file name -> extracted contract text
    # (the processing loop iterates it with .items()) — TODO confirm the value type.
    file_contents_pdf = pickle.load(file)

In [3]:
# Create the output folder for the per-contract CSV files.
# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of testing os.path.exists() first.
output_folder = 'contract_topics'
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Preprocessing function
# Lazily-built NLTK resources shared by every preprocess_text() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn raw text into a list of lower-cased, lemmatized, non-stop-word tokens.

    Steps: strip digits, replace punctuation with spaces, tokenize the
    lower-cased text, drop English stop words, lemmatize what is left.

    Args:
        text: The text to clean. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    if not text:
        return []

    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Replace punctuation with a space instead of deleting it, so that
    # "co-branding" / "and/or" / "don't" split into separate words rather than
    # fusing into "cobranding" / "andor" / "dont", which bypass stop-word removal.
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text.lower())

    # The stop-word list and lemmatizer are built once and reused across calls;
    # rebuilding them for every chunk re-reads the stop-word corpus each time.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Remove stopwords, then lemmatize the survivors
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]


In [5]:
# Split contract into sentences or chunks
def split_contract_into_chunks(text, chunk_size=100):
    """Group consecutive sentences of ``text`` into chunks of about ``chunk_size`` tokens.

    Sentences are never split. A chunk is closed as soon as adding the next
    sentence would push its token count above ``chunk_size``, so a single
    sentence longer than ``chunk_size`` becomes a chunk of its own.

    Args:
        text: The contract text. ``None`` or an empty string yields ``[]``.
        chunk_size: Maximum number of tokens (as counted by ``word_tokenize``)
            a chunk may hold before a new one is started.

    Returns:
        list[str]: The chunks, each the space-joined sentences it contains.
        No chunk is ever an empty string.
    """
    if not text:
        return []

    chunks = []
    current_chunk = []
    chunk_word_count = 0

    for sentence in sent_tokenize(text):
        words_in_sentence = len(word_tokenize(sentence))
        # Flush only when there is something to flush: without the
        # `current_chunk` check, an oversized first sentence would emit
        # an empty-string chunk before any text had been collected.
        if current_chunk and chunk_word_count + words_in_sentence > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            chunk_word_count = 0
        current_chunk.append(sentence)
        chunk_word_count += words_in_sentence

    # Emit whatever is left after the last sentence
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [6]:
# Preprocess each chunk
def preprocess_chunks(chunks):
    """Run ``preprocess_text`` over every chunk.

    Args:
        chunks: Iterable of raw text chunks.

    Returns:
        list: One ``preprocess_text`` result per chunk, in input order.
    """
    return [preprocess_text(raw_chunk) for raw_chunk in chunks]

In [7]:
# # Split and preprocess the contract
# chunks = split_contract_into_chunks(single_contract_text)
# processed_chunks = preprocess_chunks(chunks)

# # Create a dictionary and corpus for LDA
# dictionary = corpora.Dictionary(processed_chunks)
# corpus = [dictionary.doc2bow(chunk) for chunk in processed_chunks]

In [8]:
# Function to compute coherence score for different number of topics
def compute_coherence_values(dictionary, corpus, texts, start=2, limit=20, step=1):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``.

    Args:
        dictionary: Passed to the LDA model as ``id2word`` and to the coherence model.
        corpus: Passed to the LDA model as its training corpus.
        texts: Passed to the coherence model as ``texts``.
        start: First candidate number of topics.
        limit: Exclusive upper bound on the number of topics.
        step: Increment between candidates.

    Returns:
        tuple: ``(model_list, coherence_values)``, two parallel lists with one
        entry per candidate topic count.
    """
    # Training settings that are identical for every candidate model
    shared_lda_kwargs = dict(
        corpus=corpus,
        id2word=dictionary,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(num_topics=topic_count, **shared_lda_kwargs)
        model_list.append(lda)

        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [9]:
# Function to find the optimal number of topics
def get_optimal_topics(corpus, dictionary, texts, start=2, limit=10, step=1):
    """Pick the LDA model whose topic count gives the highest coherence score.

    Candidate topic counts are ``range(start, limit, step)``; the models and
    scores come from ``compute_coherence_values``.

    Args:
        corpus: Training corpus forwarded to ``compute_coherence_values``.
        dictionary: Dictionary forwarded to ``compute_coherence_values``.
        texts: Tokenized texts forwarded to ``compute_coherence_values``.
        start: First candidate number of topics.
        limit: Exclusive upper bound on the number of topics.
        step: Increment between candidates.

    Returns:
        tuple: ``(optimal_model, optimal_num_topics)``.

    Raises:
        ValueError: If ``range(start, limit, step)`` yields no candidates.
    """
    model_list, coherence_values = compute_coherence_values(
        dictionary, corpus, texts, start=start, limit=limit, step=step)

    if not coherence_values:
        raise ValueError(
            f"No candidate topic counts in range(start={start}, limit={limit}, step={step})")

    # Coherence can come back as NaN. max() over a list containing NaN is
    # order-dependent and can select the NaN entry, so rank only real scores.
    scored = [(index, score) for index, score in enumerate(coherence_values)
              if not np.isnan(score)]
    if scored:
        # max() returns the first maximal pair, i.e. the smallest topic count on ties
        optimal_index = max(scored, key=lambda pair: pair[1])[0]
    else:
        # Every score is NaN: fall back to the first candidate rather than failing
        print("Warning: all coherence scores are NaN; falling back to the first candidate.")
        optimal_index = 0

    optimal_model = model_list[optimal_index]
    optimal_num_topics = range(start, limit, step)[optimal_index]

    print(f"Optimal number of topics: {optimal_num_topics} with coherence score: {coherence_values[optimal_index]}")

    return optimal_model, optimal_num_topics

In [10]:
# Loop over each contract in the dictionary
for contract_name, contract_text in file_contents_pdf.items():
    # Destination CSV for this contract; its existence marks the contract as done
    output_path = os.path.join(output_folder, f"{contract_name}.csv")

    # Check if the CSV already exists; if yes, skip processing
    if os.path.exists(output_path):
        print(f"CSV for {contract_name} already exists. Skipping.")
        continue

    # A contract with no text cannot be modelled; skip it instead of letting
    # one bad entry abort the whole batch.
    if not contract_text:
        print(f"No text found for {contract_name}. Skipping.")
        continue

    print(f"Processing contract: {contract_name}")

    # Split the contract, dropping blank chunks so they never reach the CSV,
    # then preprocess; `chunks` and `processed_chunks` stay index-aligned.
    chunks = [chunk for chunk in split_contract_into_chunks(contract_text) if chunk.strip()]
    processed_chunks = preprocess_chunks(chunks)

    # Create dictionary and corpus for the LDA model
    dictionary = corpora.Dictionary(processed_chunks)
    if len(dictionary) == 0:
        # LDA cannot be trained on a collection with no terms
        print(f"No usable tokens found in {contract_name}. Skipping.")
        continue
    corpus = [dictionary.doc2bow(chunk) for chunk in processed_chunks]

    # Determine the optimal number of topics
    optimal_model, optimal_num_topics = get_optimal_topics(
        corpus, dictionary, processed_chunks, start=2, limit=20, step=1)

    # Label each chunk with its dominant (highest-probability) topic.
    # max() returns the first maximal entry, matching a stable descending sort.
    topic_rows = []
    for chunk, chunk_bow in zip(chunks, corpus):
        topic_distribution = optimal_model.get_document_topics(chunk_bow)
        dominant_topic = max(topic_distribution, key=lambda pair: pair[1])[0]
        topic_rows.append({
            'Contract_name': contract_name,
            'Topic_heading': f'Topic {dominant_topic}',
            'Topic_text': chunk,
        })

    # One row per chunk; column order is fixed explicitly
    df = pd.DataFrame(topic_rows, columns=['Contract_name', 'Topic_heading', 'Topic_text'])

    # Save the DataFrame to a CSV file named after the contract
    df.to_csv(output_path, index=False)

    print(f"Saved {contract_name}.csv to {output_folder}")

CSV for 2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.pdf already exists. Skipping.
CSV for ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.PDF already exists. Skipping.
CSV for ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement .PDF already exists. Skipping.


In [11]:
# # Find the optimal number of topics
# optimal_num_topics = x[np.argmax(coherence_values)]
# print(f"Optimal number of topics: {optimal_num_topics}")

In [12]:
# # Train the final LDA model with the optimal number of topics
# optimal_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=dictionary,
#                                                 num_topics=optimal_num_topics, 
#                                                 random_state=100,
#                                                 update_every=1,
#                                                 chunksize=100,
#                                                 passes=10,
#                                                 alpha='auto',
#                                                 per_word_topics=True)

In [13]:
# # Print the topics
# topics = optimal_model.print_topics(num_words=20)
# for topic in topics:
#     print(topic)

In [14]:
# # Associate each chunk with a dominant topic
# chunk_topic_mapping = []
# for i, chunk_bow in enumerate(corpus):
#     topic_distribution = optimal_model.get_document_topics(chunk_bow)
#     dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0][0]
#     chunk_topic_mapping.append((chunks[i], dominant_topic))

In [15]:
# # Group and print the chunks under each topic
# topic_chunks = defaultdict(list)
# for chunk, topic in chunk_topic_mapping:
#     topic_chunks[topic].append(chunk)

In [16]:
# # Print the text under each topic
# for topic, topic_chunk_list in topic_chunks.items():
#     print(f"\nTopic {topic}:")
#     for idx, chunk in enumerate(topic_chunk_list, 1):
#         print(f"Chunk {idx}: {chunk}\n")

In [17]:
# # Group the data by topic and store in a DataFrame
# import pandas as pd
# topic_data = {
#     'Topic_heading': [],
#     'Topic_text': []
# }

# for chunk, topic in chunk_topic_mapping:
#     topic_data['Topic_heading'].append(f'Topic {topic}')
#     topic_data['Topic_text'].append(chunk)

# # Create a pandas DataFrame
# df = pd.DataFrame(topic_data)

# # Display the DataFrame
# print(df)
