In [None]:
import json
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import tiktoken
from dotenv import load_dotenv
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import tiktoken
from urllib.parse import unquote
from urllib.request import urlopen
# Cell to import necessary libraries
import re
import os
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
nltk.download('stopwords')

from langchain.chains.openai_functions import (
    create_structured_output_runnable,
    create_structured_output_chain,
)


# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning a missing key back
# into os.environ would raise. When the key is present it is already in
# os.environ, so no re-assignment is needed.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file "
        "before running this notebook."
    )

# Shared chat model used by the extraction functions below; temperature=0 is
# passed so that sampling is as deterministic as the API allows.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


In [None]:
#################### For Non Chunked Data ############################



def read_files_from_directory(directory_path: str):
    """
    Recursively collect the contents of every ``.txt`` file under a directory.

    :param directory_path: Root directory that is walked (including sub-directories).
    :return: A dictionary mapping each file's path (as joined from the walk root)
             to the text it contains, decoded as UTF-8.
    """
    contents_by_path = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        for name in names:
            # Only plain-text files are of interest; skip everything else.
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(current_dir, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                contents_by_path[full_path] = handle.read()
    print(f"Read {len(contents_by_path)} files from directory {directory_path}")
    return contents_by_path

def split_text(text, max_tokens):
    """
    Cut a text into consecutive pieces of at most ``max_tokens`` tokens each.

    The text is tokenized with tiktoken's ``gpt2`` encoding, the token list is
    sliced into fixed-size windows, and every window is decoded back to text.

    :param text: The text to be split.
    :param max_tokens: The maximum number of tokens per chunk.
    :return: A list of text chunks, in original order (empty for empty input).
    """
    encoder = tiktoken.get_encoding('gpt2')
    token_ids = encoder.encode(text)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(encoder.decode(window))
    return pieces

def _build_extraction_prompt(title: str, chunk: str) -> list:
    """Build the system/user chat messages that ask the LLM to list people found in ``chunk``."""
    # NOTE: the example below must itself be valid JSON (commas after the
    # "aliases" arrays included); otherwise the model is shown malformed output
    # to imitate and json parsing of its reply becomes unreliable.
    system_content = f"""I want you to imagine you are reading a text page on the TV show {title} for the first time and have no prior knowledge of TV show {title}.
After reading that text -- I want you to extract a list of people that you can identify from that piece of text. I also want you to give me the smallest fragment of text where you identified that person in the given text. Note there may be multiple references to a person you identify and I want you to list all of them. I also want you to give me potential aliases for each person.
Return your answers in a JSON object. The returned JSON object should be a list of references objects. Where the reference objects are discretionary of the person and text fragments where the person was identified.

An example of a valid response fragment is as follows:

[
    {{
        "person": "Walter White",
        "aliases": [
            "Walt",
            "Walter"
       ],
        "fragments": [
            "Walter White settles into his new surroundings and takes a liking to his new lab assistant Gale Boetticher.",
            "Marie Schrader, who suggests that he ask Walt about it, due to Walt's previous association with Jesse.",
            "He tells them again that he will not allow them to kill Walt until his business with him has concluded, but gives them his blessing to instead go after the man who actually pulled the trigger on Tuco Salamanca: Hank."
        ]
    }},
    {{
        "person": "Gale Boetticher",
       "aliases": ["Gale"],
        "fragments": [
            "Walter White settles into his new surroundings and takes a liking to his new lab assistant Gale Boetticher."
        ]
    }}
]
Below is part of that wikipedia page on the TV show {title}.

"""
    return [
        {"role": "system", "content": system_content},
        {
            "role": "user",
            "content": f"Below is part of that Wikipedia page on the TV show {title}.\n\n{chunk}"
        },
    ]


def _parse_llm_json(raw):
    """Parse an LLM reply as JSON, tolerating a surrounding Markdown code fence."""
    if not isinstance(raw, str):
        raise ValueError(f"expected a text reply, got {type(raw).__name__}")
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
        cleaned = cleaned.rsplit("```", 1)[0]
    return json.loads(cleaned)


def _as_string_list(value) -> list:
    """Normalize an aliases/fragments field to a list of strings."""
    if isinstance(value, str):
        # A bare string must not be iterated character by character.
        return [value]
    if isinstance(value, (list, tuple)):
        return [entry for entry in value if isinstance(entry, str)]
    return []


def _merge_person_records(records) -> list:
    """Merge per-chunk records by person, de-duplicating while keeping first-seen order."""
    merged = {}
    for record in records:
        if not isinstance(record, dict):
            continue
        person = record.get('person')
        if not isinstance(person, str) or not person:
            continue
        entry = merged.setdefault(person, {'person': person, 'aliases': {}, 'fragments': {}})
        # Dicts are used as ordered sets so the output order is reproducible.
        entry['aliases'].update(dict.fromkeys(_as_string_list(record.get('aliases'))))
        entry['fragments'].update(dict.fromkeys(_as_string_list(record.get('fragments'))))

    return [
        {
            'person': entry['person'],
            'aliases': list(entry['aliases']),
            'fragments': list(entry['fragments']),
        }
        for entry in merged.values()
    ]


def extract_information(text: str, title: str) -> list:
    """
    Extract the people mentioned in a text using the LLM.

    The text is split into token-bounded chunks, each chunk is sent to the
    module-level ``llm`` with an extraction prompt, and the per-chunk answers
    are merged so that each person appears once with all aliases and fragments.

    A chunk whose reply cannot be parsed as JSON (or is not a list/object) is
    reported with ``print`` and skipped, so one malformed reply does not discard
    the results of the other chunks.

    :param text: The text to be processed.
    :param title: The title of the TV show.
    :return: A list of ``{"person", "aliases", "fragments"}`` dictionaries, in
             first-seen order; aliases and fragments are de-duplicated lists.
    """
    max_tokens_per_chunk = 5000 - 385  # Leave room for prompt tokens
    text_chunks = split_text(text, max_tokens_per_chunk)

    combined_results = []
    total_chunks = len(text_chunks)
    for index, chunk in enumerate(text_chunks, start=1):
        response = llm.invoke(_build_extraction_prompt(title, chunk))
        try:
            chunk_results = _parse_llm_json(response.content)
        except ValueError as exc:  # json.JSONDecodeError is a ValueError
            print(f"Skipping chunk {index}/{total_chunks} of '{title}': reply is not valid JSON ({exc})")
            continue

        if isinstance(chunk_results, dict):
            # A single record returned without the enclosing list.
            chunk_results = [chunk_results]
        if not isinstance(chunk_results, list):
            print(f"Skipping chunk {index}/{total_chunks} of '{title}': unexpected JSON type "
                  f"{type(chunk_results).__name__}")
            continue
        combined_results.extend(chunk_results)

    return _merge_person_records(combined_results)

def save_json_to_file(data: dict, file_path: str):
    """
    Save JSON-serializable data to a file, creating parent directories as needed.

    :param data: The data to be saved. Anything ``json.dump`` accepts works; the
                 callers in this notebook pass a list of record dictionaries.
    :param file_path: The path to the file where the data will be saved. A bare
                      file name (no directory part) is written to the current
                      working directory.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs("") raises FileNotFoundError, so only create a directory when
    # the path actually has a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def process_cleaned_data(directory_path: str, title: str = None, is_file: bool = False):
    """
    Run entity extraction over cleaned text and save the results as JSON.

    Results are written under ``data/Entity_json``. For a directory, the output
    mirrors the input's relative layout (``<title>/<source>/<name>.txt`` becomes
    ``data/Entity_json/<title>/<source>/<name>.json``).

    :param directory_path: The path to the directory containing cleaned text files
                           or the path to a specific file.
    :param title: The title of the TV show (if known). When omitted, it is derived
                  from the first path component below ``directory_path`` (directory
                  mode) or from the file name without its extension.
    :param is_file: A boolean indicating whether the path is a file. A path that
                    is an existing file is also treated as a file when this is False.
    """
    output_root = "data/Entity_json"

    if is_file or os.path.isfile(directory_path):
        # Process a single file
        with open(directory_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # splitext strips only the final extension, so "Dr. House.txt" keeps its
        # full stem and non-.txt inputs still get a .json output name.
        stem = os.path.splitext(os.path.basename(directory_path))[0]
        extracted_info = extract_information(content, title or stem)
        json_filepath = os.path.join(output_root, stem + ".json")
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")
        return

    # Process a directory
    files_content = read_files_from_directory(directory_path)
    if not files_content:
        print("No files found in the directory.")
        return

    for filepath, content in files_content.items():
        relative_path = os.path.relpath(filepath, directory_path)
        relative_dir, filename = os.path.split(relative_path)
        stem = os.path.splitext(filename)[0]

        # Respect an explicitly supplied title; otherwise derive it from the
        # first directory below the root, or from the file name when the file
        # sits directly in the root.
        if title:
            show_title = title
        elif relative_dir:
            show_title = relative_path.split(os.sep)[0]
        else:
            show_title = stem

        extracted_info = extract_information(content, show_title)
        # Mirror the input's sub-directory layout rather than indexing fixed
        # path positions, which failed for files fewer than two levels deep.
        json_filepath = os.path.join(output_root, relative_dir, stem + ".json")
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")



In [None]:
# Example usage
# Process a single file (left commented out; uncomment to run extraction on one
# file with an explicit show title):
# process_cleaned_data("cleaned_data/Breaking_Bad/wikipedia/sample.txt", title="Breaking Bad", is_file=True)

# Process a directory: every .txt file found under the root is sent to the LLM
# and its result is saved under data/Entity_json. No title is passed here, so
# for files laid out as <title>/<source>/<name>.txt the first path component
# below the root is used as the show title.
process_cleaned_data("data/cleaned_data")