In [None]:

import json
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import tiktoken
from dotenv import load_dotenv
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import tiktoken
from urllib.parse import unquote
from urllib.request import urlopen
# Cell to import necessary libraries
import re
import os
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
nltk.download('stopwords')

from langchain.chains.openai_functions import (
    create_structured_output_runnable,
    create_structured_output_chain,
)

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# No API key is passed to ChatOpenAI below, so it has to come from the
# OPENAI_API_KEY environment variable. Fail early with a readable message
# instead of the opaque "TypeError: str expected, not NoneType" that assigning
# a missing value back into os.environ would raise.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )
# Initialize ChatOpenAI with the model; temperature=0 is requested for the extraction calls
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


In [None]:
####################### For Chunked Data ############################

def read_files_from_directory(directory_path: str):
    """
    Recursively load every ``.txt`` file found under ``directory_path``.

    Files are opened as UTF-8 text. A one-line summary with the number of
    files read is printed before returning.

    Args:
        directory_path: Root directory to walk (sub-directories included).

    Returns:
        dict mapping each file's path (as joined from the walked directory
        and the file name) to its full text content.
    """
    contents_by_path = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        # Only plain-text files are of interest; everything else is skipped.
        text_names = (name for name in names if name.endswith(".txt"))
        for name in text_names:
            full_path = os.path.join(current_dir, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                contents_by_path[full_path] = handle.read()
    print(f"Read {len(contents_by_path)} files from directory {directory_path}")
    return contents_by_path


def extract_information(llm, text: str, title: str) -> list:
    """
    Ask the chat model to list the characters mentioned in ``text``.

    A system message (instructions plus an example of the expected JSON) and a
    user message containing ``text`` are passed to ``llm.invoke``; the reply's
    ``content`` is then parsed as JSON.

    Args:
        llm: Chat model object exposing ``invoke(messages)``; the returned
            object must have a ``content`` attribute holding the reply text.
        text: Portion of the page about the show to analyse.
        title: Name of the TV show; interpolated into both messages.

    Returns:
        The parsed JSON value (the prompt asks for a list of objects with
        ``character``, ``aliases`` and ``fragment`` keys), or an empty list
        when the reply cannot be parsed as JSON.
    """
    prompt = {
        "role": "system",
        "content": f"""
        I am analyzing the TV show '{title}' and need to extract detailed information about characters to understand plot dynamics better. Please list all identifiable characters from the text below. For each character, provide:
        - A short fragment of text that captures their essence or a significant action.
        - Potential aliases.
        Fragments should not mention lists of character names from the TV show or movie. Provide the smallest fragment with contextual meaning.
        The response should be concise and structured in JSON format to facilitate relationship analysis in future iterations.

        An example of a valid response:
        [
            {{
                "character": "Jon Snow",
                "aliases": ["Lord Snow", "The White Wolf"],
                "fragment": "Jon Snow pledges his life to the Night's Watch and refuses to leave even when tempted."
            }},
            {{
                "character": "Daenerys Targaryen",
                "aliases": ["Dany", "Khaleesi"],
                "fragment": "Daenerys sets sail for Westeros with her armies and dragons, aiming to reclaim her family's throne."
            }}
        ]
        """
    }

    user_prompt = {
        "role": "user",
        "content": f"Below is part of that Wikipedia page on the TV show {title}.\n\n{text}"
    }

    combined_prompts = [prompt, user_prompt]  # This should be a list of dictionaries

    response = llm.invoke(combined_prompts)
    raw_reply = response.content
    try:
        return json.loads(raw_reply)
    except json.JSONDecodeError as e:
        # Chat models frequently wrap JSON in a Markdown code fence
        # (```json ... ```), sometimes with chatter around it. Retry on the
        # fenced payload before giving up, so such replies are not discarded.
        fenced = None
        if isinstance(raw_reply, str):
            fenced = re.search(
                r"```(?:json)?\s*(.*?)\s*```", raw_reply, flags=re.DOTALL | re.IGNORECASE
            )
        if fenced:
            try:
                return json.loads(fenced.group(1))
            except json.JSONDecodeError:
                pass  # fall through and report the original parse error
        print(f"Error decoding JSON: {e}")
        print(f"Received malformed JSON response: {response.content}")
        return []  # Return an empty list to indicate failure


def save_json_to_file(data: dict, file_path: str):
    """
    Serialize ``data`` as indented JSON and write it to ``file_path``.

    Missing parent directories are created first. The file is written as
    UTF-8 and overwritten if it already exists.

    Args:
        data: JSON-serializable object to store (anything ``json.dump``
            accepts, not only a dict).
        file_path: Destination path; may be a bare filename in the current
            working directory.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def _extract_and_save(content: str, title: str, json_filepath: str) -> None:
    """Run character extraction on one text and write the result to ``json_filepath``."""
    print(title)
    extracted_info = extract_information(llm, content, title)
    save_json_to_file(extracted_info, json_filepath)
    print(f"Processed and saved: {json_filepath}")


def process_cleaned_data(directory_path: str, title: str = None, is_file: bool = False,
                         output_dir: str = "data/Entity_json"):
    """
    Extract character information from cleaned text and save it as JSON.

    Uses the module-level ``llm`` together with ``extract_information`` and
    ``save_json_to_file``.

    Args:
        directory_path: Path to a directory of ``.txt`` files, or to a single
            text file when ``is_file`` is True.
        title: Show title passed to the model. When omitted it is derived from
            the file name: the name without extension in single-file mode, the
            part before the first ``_`` in directory mode. When given, it is
            used for every file in directory mode as well.
        is_file: Treat ``directory_path`` as one file instead of a directory.
        output_dir: Root directory for the generated JSON files.

    Output layout:
        single file: ``<output_dir>/<name>.json``
        directory:   ``<output_dir>/<title>/<name>.json``
    """
    if is_file:
        # Process a single file
        with open(directory_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # splitext keeps dots that belong to the name itself (e.g. "Mr. Robot.txt")
        stem = os.path.splitext(os.path.basename(directory_path))[0]
        _extract_and_save(content, title or stem, os.path.join(output_dir, stem + ".json"))
        return

    # Process a directory
    files_content = read_files_from_directory(directory_path)
    if not files_content:
        print("No files found in the directory.")
        return
    for filepath, content in files_content.items():
        stem = os.path.splitext(os.path.basename(filepath))[0]
        # Honour an explicit title; otherwise derive it from the file name
        # (the part before the first underscore). A separate local is used so
        # the caller's argument is not clobbered on the first iteration.
        file_title = title or stem.split('_')[0]
        _extract_and_save(
            content, file_title, os.path.join(output_dir, file_title, stem + ".json")
        )


In [None]:

# Example usage: run character extraction over every .txt file found under
# the cleaned-data directory (directory mode, titles derived from file names).
CLEANED_DATA_DIR = "data/cleaned_data"
process_cleaned_data(directory_path=CLEANED_DATA_DIR, is_file=False)
