In [2]:
# Cell to install necessary packages
! pip install wikipedia-api requests beautifulsoup4
%pip install python-dotenv
%pip install --quiet langchain openai langchain-openai unstructured



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import os
from datetime import datetime
from dotenv import load_dotenv

In [None]:
# Function to create directories based on title and source
def create_directory_structure(base_dir, title, source):
    """Create (if needed) and return the directory ``base_dir/title/source``.

    :param base_dir: Root directory under which the data is stored.
    :param title: Name of the first-level sub-directory.
    :param source: Name of the second-level sub-directory.
    :return: The joined directory path; it exists when the call returns.
    """
    dir_path = os.path.join(base_dir, title, source)
    # exist_ok=True replaces the former exists()-then-create sequence, which
    # could fail if the directory appeared between the check and the call.
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


In [None]:
# Function to save data to a file with structured naming
def save_data(data, dir_path, title, source):
    """Write ``data`` to a timestamped UTF-8 text file inside ``dir_path``.

    The file is named ``{title}_{source}_{YYYYmmdd_HHMMSS}.txt``.

    :param data: Text to write.
    :param dir_path: Existing directory that receives the file.
    :param title: First component of the file name.
    :param source: Second component of the file name.
    :return: Path of the file that was written.
    """
    # Imported locally under a private alias: another cell of this notebook
    # runs `import datetime`, which rebinds the global name `datetime` to the
    # module and would make a global `datetime.now()` raise AttributeError.
    from datetime import datetime as _datetime

    timestamp = _datetime.now().strftime('%Y%m%d_%H%M%S')
    file_name = f"{title}_{source}_{timestamp}.txt"
    file_path = os.path.join(dir_path, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(data)
    return file_path


In [None]:
# Main function to scrape and save data
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import wikipediaapi

def scrape_and_save(title, url=None):
    """Scrape one source for ``title`` and store the result under data/scraped_data.

    With ``url`` the page is fetched through ``scrape_general`` and the URL's
    host part becomes the source name; without it ``scrape_wikipedia`` is
    called with ``title`` and the source name is "wikipedia".

    NOTE(review): ``scrape_general`` is not defined in the visible part of this
    notebook, and the ``scrape_wikipedia`` defined further down takes a URL and
    returns a ``(text, title)`` tuple - confirm which helpers this relies on.

    :param title: Title used for the directory and file names.
    :param url: Optional page URL; when falsy the Wikipedia branch is taken.
    :return: Path of the saved file, as returned by ``save_data``.
    """
    if not url:
        source, data = "wikipedia", scrape_wikipedia(title)
    else:
        # "https://host/path" -> "host/path" -> "host"
        host_and_path = url.split("//")[-1]
        source = host_and_path.split("/")[0]
        data = scrape_general(url)

    target_dir = create_directory_structure("data/scraped_data", title, source)
    return save_data(data, target_dir, title, source)


In [None]:
#################### Simple (less efficient) scraper: scrape one title and save it under data/scraped_data ############################
title = "Interstellar_(film)"
# NOTE(review): wikipedia_url is assigned but never used in this cell.
wikipedia_url = "https://en.wikipedia.org/wiki/Interstellar_(film)"
# fandom_url = "https://interstellarfilm.fandom.com/wiki/Interstellar_Wiki"

# No URL is passed, so scrape_and_save takes its Wikipedia branch.
wiki_file_path = scrape_and_save(title)
print(f"Data saved to: {wiki_file_path}")

# # Scrape and save data from Fandom
# fandom_file_path = scrape_and_save(title, fandom_url)
# print(f"Data saved to: {fandom_file_path}")

In [None]:
import requests
from bs4 import BeautifulSoup
import os

def scrape_wikipedia_content(url, depth=0, max_depth=1, iteration=1, max_iterations=20, visited=None):
    """Crawl Wikipedia starting at ``url`` and save each page's text to disk.

    Every scraped page is written to
    ``data/rough_scraped_data/<title>/<title>_Wikipedia_chunk<N>.txt`` where
    ``N`` is a page counter shared by the whole crawl. At most
    ``max_iterations`` pages are numbered, and links are followed down to
    ``max_depth`` levels below the start page.

    :param url: Page to start from.
    :param depth: Depth of ``url`` relative to the crawl's start page.
    :param max_depth: Deepest level that is still scraped.
    :param iteration: Number assigned to the first scraped page.
    :param max_iterations: Highest page number that may be used.
    :param visited: Set of URLs already scraped; updated in place.
    :return: None.
    """
    if visited is None:
        visited = set()  # Fresh set per top-level call
    _crawl_wikipedia_page(url, depth, max_depth, iteration, max_iterations, visited)


def _crawl_wikipedia_page(url, depth, max_depth, iteration, max_iterations, visited):
    """Scrape ``url`` and its links; return the next unused page number."""
    if depth > max_depth or iteration > max_iterations:
        print(f"Stopping recursion at depth {depth} and iteration {iteration}")
        return iteration

    if url in visited:
        print(f"Already visited {url}")
        return iteration
    visited.add(url)

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return iteration
    soup = BeautifulSoup(response.text, 'html.parser')

    heading = soup.find(id='firstHeading')
    if heading is None:
        # find() returns None when the element is absent (e.g. non-article page).
        print(f"Skipping {url}: no page heading found")
        return iteration
    title = heading.text.replace('/', '_')  # Sanitize title for file path

    content = soup.find(id='mw-content-text')
    if content:
        dir_path = os.path.join('data/rough_scraped_data', title)
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, f"{title}_Wikipedia_chunk{iteration}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content.get_text())
        print(f"Saved content to {file_path}")

    # The counter is threaded through the recursion via return values so that
    # sibling pages receive distinct numbers and max_iterations caps the total
    # (previously every link on a page was visited with the same number).
    next_iteration = iteration + 1
    if depth >= max_depth:
        return next_iteration

    for link in soup.find_all('a', href=True):
        if next_iteration > max_iterations:
            break
        href = link['href'].split('#', 1)[0]  # drop in-page anchors
        if href.startswith('/wiki/') and ':' not in href:
            full_url = f'https://en.wikipedia.org{href}'
            if full_url not in visited:
                print(f"Recursing into {full_url} at iteration {next_iteration}")
                next_iteration = _crawl_wikipedia_page(
                    full_url, depth + 1, max_depth, next_iteration, max_iterations, visited
                )
    return next_iteration


In [None]:

# Example usage: crawl from the Interstellar article with the default limits
# (max_depth=1, max_iterations=20); output goes under data/rough_scraped_data.
scrape_wikipedia_content('https://en.wikipedia.org/wiki/Interstellar_(film)')


In [None]:
######################### Scrape further links for fetched entities such as characters, actors, directors, etc. ##################
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

def read_all_json_files(directory_path):
    """Collect unique character names and aliases from the JSON files in a directory.

    Only files directly inside ``directory_path`` whose name ends with
    ``.json`` are read. Each file is expected to hold a list of objects with a
    ``character`` key (``person`` is accepted as a fallback, since the other
    extractor in this notebook writes that key) and an optional ``aliases``
    list. Malformed entries and empty names are skipped.

    :param directory_path: Directory containing the entity JSON files.
    :return: Set of non-empty name strings.
    """
    names = set()
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not isinstance(data, list):
            continue
        for item in data:
            if not isinstance(item, dict):
                continue
            aliases = item.get('aliases')
            if isinstance(aliases, str):
                aliases = [aliases]
            elif not isinstance(aliases, list):
                aliases = []
            candidates = [item.get('character', item.get('person'))] + aliases
            # Empty strings must not get through: a substring test against ""
            # is always true and would make every link look relevant.
            names.update(c for c in candidates if isinstance(c, str) and c.strip())
    return names

def find_relevant_links(soup, names):
    """Return hrefs of anchors whose ``title`` attribute contains one of ``names``.

    Each href is returned once, in first-seen order, so callers do not fetch
    the same page repeatedly. Empty names are ignored because an empty string
    is a substring of every title.

    :param soup: Parsed document exposing ``find_all``.
    :param names: Iterable of name strings to look for.
    :return: List of matching href values without duplicates.
    """
    wanted = [name for name in names if name]
    relevant_links = []
    seen = set()
    for link in soup.find_all('a', href=True, title=True):
        href = link['href']
        if href in seen:
            continue
        if any(name in link['title'] for name in wanted):
            seen.add(href)
            relevant_links.append(href)
    return relevant_links

def scrape_relevant_content(start_url, root_json_dir):
    """Scrape Wikipedia pages linked from ``start_url`` that mention known characters.

    The title is taken from the last path segment of ``start_url`` (underscores
    become spaces); names are read from the JSON files in
    ``root_json_dir/<title>``. Every matching article link on the start page is
    fetched and its text saved to
    ``data/targeted_scraped_data/<page title>/<page title>.txt``.

    :param start_url: URL of the Wikipedia article to start from.
    :param root_json_dir: Directory holding one sub-directory of JSON files per title.
    :return: None. Returns early when no JSON directory exists for the title.
    """
    title = unquote(start_url.split('/')[-1]).replace('_', ' ')
    json_dir_path = os.path.join(root_json_dir, title)

    if not os.path.exists(json_dir_path):
        print(f"No JSON directory found for title: {title}")
        return

    names = read_all_json_files(json_dir_path)
    response = requests.get(start_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = 'https://en.wikipedia.org'

    fetched = set()
    for href in find_relevant_links(soup, names):
        page_path = href.split('#', 1)[0]  # the same article may be linked with anchors
        # Only site-relative article links can be joined to base_url.
        if not page_path.startswith('/wiki/') or page_path in fetched:
            continue
        fetched.add(page_path)

        full_url = base_url + page_path
        try:
            response = requests.get(full_url, timeout=30)
        except requests.RequestException as exc:
            print(f"Skipping {full_url}: {exc}")
            continue
        page_soup = BeautifulSoup(response.text, 'html.parser')

        content = page_soup.find('div', id='mw-content-text')
        if content is None:
            # find() returns None when the page has no article body.
            print(f"Skipping {full_url}: no article content found")
            continue

        # '/' inside a page name would otherwise create nested directories.
        page_title = unquote(page_path[len('/wiki/'):]).replace('_', ' ').replace('/', '_')
        dir_path = os.path.join('data/targeted_scraped_data', page_title)
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, f"{page_title}.txt")

        # Extract and save only the textual content from the article
        text_content = content.get_text(separator='\n', strip=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text_content)
        print(f"Saved content related to {page_title} to {file_path}")


In [None]:

# Example usage: names are read from data/Entity_json/Breaking Bad
# (the title is derived from the URL's last path segment).
scrape_relevant_content('https://en.wikipedia.org/wiki/Breaking_Bad', 'data/Entity_json')


In [None]:
#################### Updated tool for scraping ############################
import requests
from bs4 import BeautifulSoup
import os
import datetime

def fetch_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    :param url: Address to fetch.
    :param timeout: Seconds to wait for the server before giving up; without a
        timeout a stalled connection would block the notebook indefinitely.
    :return: The decoded response body (returned regardless of HTTP status).
    :raises requests.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

def parse_page(html, base_url):
    """Extract the article text, title-related links and title from page HTML.

    :param html: HTML source of the page.
    :param base_url: URL the HTML came from. Not used by the current
        implementation; kept so existing callers keep working.
    :return: ``(text, links, title_text)`` where ``text`` is the text of the
        ``mw-content-text`` div ("" if absent), ``links`` are the de-duplicated
        hrefs of anchors whose text contains the page title (``/wiki/...``
        hrefs made absolute), and ``title_text`` is the first ``<h1>`` text with
        '/' replaced by '_' ("untitled" when the page has no usable ``<h1>``).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title for the directory name; find() returns None when the
    # page has no <h1>, which previously raised AttributeError.
    title_tag = soup.find('h1')
    title_text = title_tag.text.replace('/', '_') if title_tag is not None else ""

    # Extract all text from the main content
    content = soup.find('div', {'id': 'mw-content-text'})
    text = content.get_text() if content else ""

    # Find links within the page whose text includes the main title. An empty
    # title is skipped because "" is contained in every link text.
    links = []
    if title_text:
        seen = set()
        for a in soup.find_all('a', href=True):
            if not (a.text and title_text in a.text):
                continue
            href = a['href']
            # Convert relative article links to absolute
            link = 'https://en.wikipedia.org' + href if href.startswith('/wiki/') else href
            if link not in seen:
                seen.add(link)
                links.append(link)

    return text, links, title_text or "untitled"
    

def scrape_wikipedia(start_url):
    """Scrape an article plus up to ten title-related linked pages and save the text.

    The combined text is written to
    ``data/scraped_data/<title>/<title>_<timestamp>_Wikipedia.txt``.

    :param start_url: URL of the article to start from.
    :return: ``(text, title)`` - the combined text and the page title.
    """
    # Local aliased import: the notebook binds the global name `datetime` both
    # to the module and (in another cell) to the class, depending on which
    # cell ran last, so the global name cannot be relied on here.
    import datetime as _dt

    html = fetch_page(start_url)
    text, links, title = parse_page(html, start_url)

    # For simplicity, just fetch text from the first few relevant links
    for link in links[:10]:  # Limit to first 10 links to avoid too many requests
        try:
            html = fetch_page(link)
        except requests.RequestException as exc:
            # One unreachable or malformed link should not discard everything
            # collected so far.
            print(f"Skipping {link}: {exc}")
            continue
        page_text, _, _ = parse_page(html, link)  # We don't follow further links here
        text += "\n\n" + page_text

    base_dir = os.path.join('data/scraped_data', title)
    os.makedirs(base_dir, exist_ok=True)  # Create directory if it does not exist

    # Format current datetime for the filename
    now = _dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = f"{title}_{now}_Wikipedia.txt"
    filepath = os.path.join(base_dir, filename)

    # Save the content to a text file
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(text)

    return text, title


In [None]:

# Example usage: scrape one article (plus related links) and report the
# number of characters collected.
url = 'https://en.wikipedia.org/wiki/Scent_of_a_Woman_(1992_film)'
collected_text, title = scrape_wikipedia(url)
print(len(collected_text))
print(f"Data collected for {title} and saved to the respective directory.")


In [6]:
# Cell to import necessary libraries
import re
import os
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
nltk.download('stopwords')
from langchain.chat_models import ChatOpenAI
from langchain.chains.openai_functions import (
    create_structured_output_runnable,
    create_structured_output_chain,
)




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ATIFHANIF/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
##################### For cleaning scraped data and dividing it into chunks on the basis of tokenization ############################
import os
import re
from nltk.tokenize import word_tokenize

def normalize_text(text):
    """Lower-case ``text``, drop punctuation and collapse whitespace.

    Whitespace is normalised last: removing punctuation first can leave runs
    of spaces behind (e.g. "a - b"), which the previous order did not clean up.

    :param text: Raw text.
    :return: Lower-cased text containing only word characters separated by
        single spaces, without leading or trailing whitespace.
    """
    text = text.lower()
    # Remove special characters (anything that is neither word char nor space)
    text = re.sub(r'[^\w\s]', '', text)
    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip()

def chunk_text(text, chunk_size=8000, min_last_chunk=1000):
    """Split ``text`` into chunks of at most ``chunk_size`` tokens.

    :param text: Text to split; tokenised with ``word_tokenize``.
    :param chunk_size: Number of tokens per chunk.
    :param min_last_chunk: If the final chunk has fewer tokens than this (and
        is not the only chunk) it is appended to the previous chunk.
    :return: List of chunk strings, tokens joined by single spaces.
    """
    words = word_tokenize(text)
    word_chunks = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]
    # Handle the last chunk; merge with previous if too small. The size is
    # taken from the token slice itself instead of re-tokenising the joined
    # string, which repeated the tokenisation work just to count words.
    if len(word_chunks) > 1 and len(word_chunks[-1]) < min_last_chunk:
        tail = word_chunks.pop()
        word_chunks[-1] = word_chunks[-1] + tail
    return [' '.join(chunk) for chunk in word_chunks]

def save_chunks(chunks, base_dir, filename):
    """Write each chunk to ``base_dir/<filename>_chunk<N>.txt`` (N starts at 1).

    :param chunks: Sequence of text chunks.
    :param base_dir: Output directory; created when missing.
    :param filename: Prefix shared by all chunk files.
    """
    os.makedirs(base_dir, exist_ok=True)
    for number, chunk in enumerate(chunks, start=1):
        target = os.path.join(base_dir, f"{filename}_chunk{number}.txt")
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(chunk)

def clean_data(scraped_dir, cleaned_dir):
    """Clean and chunk every scraped ``.txt`` file, one output directory per title.

    For each directory ``scraped_dir/<title>`` the ``.txt`` files directly
    inside it are normalised, split into chunks and written to
    ``cleaned_dir/<title>/<name>_chunk<N>.txt``. Files that already have chunk
    files in the output directory are skipped.

    NOTE(review): a later cell defines another ``clean_data(text)``; whichever
    cell ran last decides which function this name refers to.

    :param scraped_dir: Directory containing one sub-directory per title.
    :param cleaned_dir: Root directory for the cleaned chunks.
    """
    for title in os.listdir(scraped_dir):
        title_path = os.path.join(scraped_dir, title)
        # Stray files next to the title directories (e.g. .DS_Store) cannot be
        # listed and must not produce output directories.
        if not os.path.isdir(title_path):
            continue

        cleaned_title_path = os.path.join(cleaned_dir, title)
        os.makedirs(cleaned_title_path, exist_ok=True)

        for file in os.listdir(title_path):
            file_path = os.path.join(title_path, file)
            # Sub-directories and non-text files cannot be opened as text.
            if not (file.endswith('.txt') and os.path.isfile(file_path)):
                continue

            stem = file.replace('.txt', '')
            # Check if already cleaned. Matching on "<stem>_chunk" avoids
            # treating "A_2_chunk1.txt" as the output of "A.txt".
            chunk_prefix = f"{stem}_chunk"
            if any(f.startswith(chunk_prefix) for f in os.listdir(cleaned_title_path)):
                print(f"Already cleaned data present for {file_path}")
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Clean the text
            cleaned_text = normalize_text(text)
            chunks = chunk_text(cleaned_text)

            # Save cleaned chunks
            save_chunks(chunks, cleaned_title_path, stem)
            print(f"Data cleaned and saved for {file_path}")



In [None]:

# Example usage: clean everything under data/scraped_data into data/cleaned_data.
# NOTE(review): this needs the two-argument clean_data; a later cell rebinds the
# name to a one-argument function.
scraped_dir = 'data/scraped_data'
cleaned_dir = 'data/cleaned_data'
clean_data(scraped_dir, cleaned_dir)

In [None]:
# Cell to define the text normalization function
def normalize_text(text):
    """Return ``text`` lower-cased, without punctuation, with single spaces.

    Steps, in order: lower-case, drop every character that is neither a word
    character nor whitespace, collapse whitespace runs to one space and trim.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


In [None]:
# Cell to define the function to remove unnecessary characters
def remove_unnecessary_characters(text):
    """Strip URLs, HTML tags and non-alphanumeric characters from ``text``.

    The patterns are applied one after another; whitespace is left untouched.
    """
    removal_patterns = (
        r'http\S+',           # URLs
        r'<.*?>',             # HTML tags
        r'[^a-zA-Z0-9\s]',    # remaining special characters
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text


In [None]:
# Cell to define the context-based cleaning function using LangChain and OpenAI API
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.openai_functions import create_structured_output_runnable
import tiktoken
# Module-level chat model (temperature=0) that context_based_cleaning invokes.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0,)

def context_based_cleaning(input_text: str, context: str) -> str:
    """
    Clean ``input_text`` with the module-level ``llm``, guided by ``context``.

    The text is tokenised with tiktoken's 'gpt2' encoding, cut into pieces of
    at most 4615 tokens, and each piece is sent to the model together with a
    system prompt that embeds ``context``. The replies are joined with newlines.

    :param input_text: The text to be cleaned.
    :param context: The context to guide the cleaning process.
    :return: The cleaned text ("" when ``input_text`` yields no tokens).
    """
    tokens_per_piece = 5000 - 385  # Leaving room for prompt tokens

    encoder = tiktoken.get_encoding('gpt2')
    token_ids = encoder.encode(input_text)

    # The system prompt only depends on `context`, so it is built once.
    system_content = f"""You are a text cleaning assistant. Your task is to clean the input text based on the provided context.
    ## Context
    {context}
    ## Instructions
    - Remove irrelevant information.
    - Correct grammatical errors.
    - Ensure the text is clear and concise.
    - Maintain the original meaning as much as possible.
    ## Output
    Provide the cleaned version of the text in the same language as the input."""

    cleaned_parts = []
    for start in range(0, len(token_ids), tokens_per_piece):
        # Decode the token slice back to text before sending it to the model
        piece = encoder.decode(token_ids[start:start + tokens_per_piece])
        reply = llm.invoke([
            {"role": "system", "content": system_content},
            {"role": "user", "content": f"Clean the following text based on the context:\n\n{piece}"},
        ])
        cleaned_parts.append(reply.content)

    # Concatenate the cleaned pieces to get the full cleaned text
    return "\n".join(cleaned_parts)


# Example usage
# input_text = "Ths is a smple textt with somee speling mistkes and extraneous infformation dsfsdda44 54454 sfdfad.;;';."
# context = "This text is a part of a formal document and should be cleaned accordingly."

# cleaned_text = context_based_cleaning(input_text, context)

In [None]:
# Cell to define the main cleaning function
def clean_data(text):
    """Clean one text: strip URLs/HTML/special characters, then normalise it.

    URL and HTML-tag removal runs before ``normalize_text``: normalisation
    deletes punctuation such as '<', '>' and '://', after which the tag pattern
    can no longer match and tag names would stay in the text.

    NOTE(review): this definition shares its name with the earlier
    ``clean_data(scraped_dir, cleaned_dir)``; the cell run last wins.

    :param text: Raw scraped text.
    :return: Cleaned, lower-cased text with single spaces.
    """
    text = remove_unnecessary_characters(text)
    text = normalize_text(text)
    # Optional LLM-based pass (disabled):
    # context = "This text is a part of a formal document and should be cleaned accordingly.You are a helpful assistant. Clean and standardize the following text to make it more readable and consistent."
    # text = context_based_cleaning(text,context)
    return text


In [None]:
# Cell to create directories for cleaned data based on title and source
def create_cleaned_directory_structure(base_dir, title, source):
    """Create (if needed) and return ``base_dir/title/source`` for cleaned data.

    :param base_dir: Root directory for cleaned data.
    :param title: First-level sub-directory name.
    :param source: Second-level sub-directory name.
    :return: The joined directory path; it exists when the call returns.
    """
    dir_path = os.path.join(base_dir, title, source)
    # exist_ok=True avoids the race between an exists() check and creation.
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


In [None]:
# Cell to clean the scraped data and save it in the cleaned_data directory
def clean_and_save_scraped_data(scraped_dir, cleaned_dir):
    """Clean every ``.txt`` file below ``scraped_dir`` and mirror it into ``cleaned_dir``.

    The output directory is built from the first two directory levels of the
    file's location relative to ``scraped_dir`` (title and source). Files that
    sit directly in ``scraped_dir`` or only one level deep are supported too;
    previously the file name itself was taken as the source directory, or the
    title/source unpacking failed.

    :param scraped_dir: Root directory of the scraped text files.
    :param cleaned_dir: Root directory that receives the cleaned files.
    """
    for root, dirs, files in os.walk(scraped_dir):
        txt_files = [name for name in files if name.endswith(".txt")]
        if not txt_files:
            continue

        # Directory components only (never the file name), at most title/source.
        rel_dir = os.path.relpath(root, scraped_dir)
        dir_parts = [] if rel_dir == os.curdir else rel_dir.split(os.sep)[:2]
        cleaned_dir_path = os.path.join(cleaned_dir, *dir_parts)
        os.makedirs(cleaned_dir_path, exist_ok=True)

        for file in txt_files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = f.read()

            cleaned_data = clean_data(data)

            cleaned_file_path = os.path.join(cleaned_dir_path, file)
            with open(cleaned_file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_data)

            print(f"Cleaned data saved to: {cleaned_file_path}")


In [None]:

# Clean all scraped files and write the results under data/cleaned_data.
scraped_dir = "data/scraped_data"
cleaned_dir = "data/cleaned_data"

clean_and_save_scraped_data(scraped_dir, cleaned_dir)


In [5]:
import os
import json
from langchain_openai import ChatOpenAI
import tiktoken
# Chat model shared by the extraction cells (temperature=0).
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


In [None]:
#################### For Non Chunked Data ############################

import os
import json
from langchain_openai import ChatOpenAI
import tiktoken
# Chat model used by extract_information in this cell (same settings as the
# assignment in the previous cell).
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


def read_files_from_directory(directory_path: str):
    """
    Recursively load every ``.txt`` file found below a directory.

    :param directory_path: The directory to walk.
    :return: A dictionary mapping each file's path (as built from the walk) to
        its UTF-8 decoded contents. Prints how many files were read.
    """
    files_content = {}
    for folder, _subdirs, names in os.walk(directory_path):
        for name in names:
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(folder, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                files_content[full_path] = handle.read()
    print(f"Read {len(files_content)} files from directory {directory_path}")
    return files_content

def split_text(text, max_tokens):
    """
    Cut ``text`` into pieces of at most ``max_tokens`` tokens.

    Tokens are produced with tiktoken's 'gpt2' encoding; every slice of token
    ids is decoded back to a string.

    :param text: The text to be split.
    :param max_tokens: The maximum number of tokens per chunk.
    :return: A list of text chunks (empty when ``text`` yields no tokens).
    """
    encoder = tiktoken.get_encoding('gpt2')
    token_ids = encoder.encode(text)
    text_chunks = []
    for start in range(0, len(token_ids), max_tokens):
        text_chunks.append(encoder.decode(token_ids[start:start + max_tokens]))
    return text_chunks

def extract_information(text: str, title: str) -> list:
    """
    Extract the people mentioned in ``text`` using the module-level ``llm``.

    The text is split into token-limited chunks, each chunk is sent to the
    model, and the per-chunk answers are merged per person. A chunk whose
    answer is not valid JSON is reported and skipped instead of aborting the
    whole extraction.

    NOTE(review): a later cell defines ``extract_information(llm, text, title)``
    with a different signature; the cell run last decides which one is bound.

    :param text: The text to be processed.
    :param title: The title of the TV show.
    :return: A list of ``{"person", "aliases", "fragments"}`` dictionaries, one
        per person, with aliases and fragments de-duplicated in first-seen order.
    """
    max_tokens_per_chunk = 5000 - 385  # Leave room for prompt tokens
    text_chunks = split_text(text, max_tokens_per_chunk)

    combined_results = []

    for chunk in text_chunks:
        # The example in the prompt must itself be valid JSON (commas between
        # members), otherwise the model is shown a malformed format to imitate.
        prompt = [
            {
                "role": "system",
                "content": f"""I want you to imagine you are reading a text page on the TV show {title} for the first time and have no prior knowledge of TV show {title}.
After reading that text -- I want you to extract a list of people that you can identify from that piece of text. I also want you to give me the smallest fragment of text where you identified that person in the given text. Note there may be multiple references to a person you identify and I want you to list all of them. I also want you to give me potential aliases for each person.
Return your answers in a JSON object. The returned JSON object should be a list of references objects. Where the reference objects are discretionary of the person and text fragments where the person was identified.

An example of a valid response fragment is as follows:

[
    {{
        "person": "Walter White",
        "aliases": [
            "Walt",
            "Walter"
        ],
        "fragments": [
            "Walter White settles into his new surroundings and takes a liking to his new lab assistant Gale Boetticher.",
            "Marie Schrader, who suggests that he ask Walt about it, due to Walt's previous association with Jesse.",
            "He tells them again that he will not allow them to kill Walt until his business with him has concluded, but gives them his blessing to instead go after the man who actually pulled the trigger on Tuco Salamanca: Hank."
        ]
    }},
    {{
        "person": "Gale Boetticher",
        "aliases": ["Gale"],
        "fragments": [
            "Walter White settles into his new surroundings and takes a liking to his new lab assistant Gale Boetticher."
        ]
    }}
]
Below is part of that wikipedia page on the TV show {title}.

"""
            },
            {
                "role": "user",
                "content": f"Below is part of that Wikipedia page on the TV show {title}.\n\n{chunk}"
            }
        ]

        response = llm.invoke(prompt)
        try:
            chunk_results = json.loads(response.content)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print(f"Received malformed JSON response: {response.content}")
            continue

        if isinstance(chunk_results, dict):
            chunk_results = [chunk_results]  # a single object instead of a list
        if isinstance(chunk_results, list):
            combined_results.extend(chunk_results)

    return _merge_person_references(combined_results)


def _merge_person_references(references):
    """Merge reference objects of the same person, keeping first-seen order."""
    merged_results = {}
    for item in references:
        if not isinstance(item, dict) or not item.get('person'):
            continue  # entry without a usable name
        entry = merged_results.setdefault(
            item['person'],
            {'person': item['person'], 'aliases': [], 'fragments': []},
        )
        for key in ('aliases', 'fragments'):
            values = item.get(key) or []
            if isinstance(values, str):
                values = [values]
            for value in values:
                if value not in entry[key]:
                    entry[key].append(value)
    return list(merged_results.values())

def save_json_to_file(data: dict, file_path: str):
    """
    Save the JSON data to a file, creating parent directories when needed.

    :param data: The JSON-serialisable data to be saved.
    :param file_path: The path to the file where the data will be saved.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def process_cleaned_data(directory_path: str, title: str = None, is_file: bool = False):
    """
    Extract people from cleaned text and save the results under data/Entity_json.

    :param directory_path: The path to the directory containing cleaned text files or the path to a specific file.
    :param title: The title of the TV show. When omitted it is derived from the
        file name (single-file mode) or from the first directory level below
        ``directory_path`` (directory mode).
    :param is_file: A boolean indicating whether the path is a file.
    """
    output_root = "data/Entity_json"

    if is_file:
        # Process a single file
        with open(directory_path, 'r', encoding='utf-8') as file:
            content = file.read()
        base_name = os.path.basename(directory_path)
        title = title or base_name.split('.')[0]
        extracted_info = extract_information(content, title)
        json_filepath = os.path.join(output_root, os.path.splitext(base_name)[0] + ".json")
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")
        return

    # Process a directory
    files_content = read_files_from_directory(directory_path)
    if not files_content:
        print("No files found in the directory.")
        return

    for filepath, content in files_content.items():
        relative_path = os.path.relpath(filepath, directory_path)
        *dir_parts, file_name = relative_path.split(os.sep)
        stem = os.path.splitext(file_name)[0]

        # An explicitly passed title is honoured (it used to be overwritten);
        # otherwise the first directory level is assumed to be the title, with
        # the file name as fallback for files directly in directory_path.
        file_title = title or (dir_parts[0] if dir_parts else stem)
        extracted_info = extract_information(content, file_title)

        # Mirror at most the title/source directory levels. Only directory
        # names are used, so shallow layouts no longer index past the end of
        # the path or turn the file name into a directory.
        out_dirs = dir_parts[:2] or [file_title]
        json_filepath = os.path.join(output_root, *out_dirs, stem + ".json")
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")

# Example usage
# Process a single file (disabled):
# process_cleaned_data("cleaned_data/Breaking_Bad/wikipedia/sample.txt", title="Breaking Bad", is_file=True)

# Process every .txt file below data/cleaned_data (calls the LLM per file).
process_cleaned_data("data/cleaned_data")

In [None]:
####################### For Chunked Data ############################

import os
import json
from langchain_openai import ChatOpenAI

# Initialize ChatOpenAI with the model name and temperature=0. No API key is
# passed here - presumably it is picked up from the environment; confirm.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def read_files_from_directory(directory_path: str):
    """
    Load all ``.txt`` files below ``directory_path`` (recursively).

    Returns a dict mapping file path to UTF-8 decoded content and prints the
    number of files read.
    """
    txt_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory_path)
        for name in names
        if name.endswith(".txt")
    ]
    files_content = {}
    for path in txt_paths:
        with open(path, 'r', encoding='utf-8') as handle:
            files_content[path] = handle.read()
    print(f"Read {len(files_content)} files from directory {directory_path}")
    return files_content


def extract_information(llm, text: str, title: str) -> dict:
    """Ask ``llm`` for the characters found in ``text`` and parse its JSON answer.

    :param llm: Object with an ``invoke(messages)`` method whose result has a
        ``content`` attribute holding the reply text.
    :param text: Text (one chunk) to analyse.
    :param title: Title of the show; interpolated into both prompts.
    :return: The decoded JSON reply, or an empty list when the reply is not
        valid JSON (the decoding error and the raw reply are printed).
    """
    system_message = {
        "role": "system",
        "content": f"""
        I am analyzing the TV show '{title}' and need to extract detailed information about characters to understand plot dynamics better. Please list all identifiable characters from the text below. For each character, provide:
        - A short fragment of text that captures their essence or a significant action.
        - Potential aliases.
        Fragments should not mention lists of character names from the TV show or movie. Provide the smallest fragment with contextual meaning.
        The response should be concise and structured in JSON format to facilitate relationship analysis in future iterations.

        An example of a valid response:
        [
            {{
                "character": "Jon Snow",
                "aliases": ["Lord Snow", "The White Wolf"],
                "fragment": "Jon Snow pledges his life to the Night's Watch and refuses to leave even when tempted."
            }},
            {{
                "character": "Daenerys Targaryen",
                "aliases": ["Dany", "Khaleesi"],
                "fragment": "Daenerys sets sail for Westeros with her armies and dragons, aiming to reclaim her family's throne."
            }}
        ]
        """
    }
    # The model expects a list of message dictionaries: system first, then user.
    messages = [
        system_message,
        {
            "role": "user",
            "content": f"Below is part of that Wikipedia page on the TV show {title}.\n\n{text}"
        },
    ]

    raw_reply = llm.invoke(messages).content
    try:
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Received malformed JSON response: {raw_reply}")
        return []  # Empty list signals that this chunk could not be parsed
    return parsed


def save_json_to_file(data: dict, file_path: str):
    """
    Save the JSON data to a file, creating parent directories when needed.

    :param data: JSON-serialisable data.
    :param file_path: Destination path of the JSON file.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, which is what a bare file name
    # (empty dirname) used to trigger.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# def process_cleaned_data(directory_path: str, title: str = None, is_file: bool = False):
#     """
#     Process all cleaned data files and extract information.
#     """
#     isProcess =True
#     if is_file:
#         title = title or os.path.basename(directory_path).split('.')[0]
#         entity_json_path = f"data/Entity_json/{title}.json"
#         if os.path.exists(entity_json_path):
#             user_input = input(f"The title '{title}' is already in Entity_json. Would you like to process it again? (yes/no): ")
#             if user_input.lower() != 'yes':
#                 print(f"Skipping processing for {title}.")
#                 return
#         content = read_files_from_directory(directory_path)
#         extracted_info = extract_information(llm, content, title)
#         save_json_to_file(extracted_info, entity_json_path)
#         print(f"Processed and saved: {entity_json_path}")
#     else:
#         # Process a directory
#         files_content = read_files_from_directory(directory_path)
#         if not files_content:
#             print("No files found in the directory.")
#             return
#         for filepath, content in files_content.items():
#             title = os.path.basename(filepath).split('_')[0]  # Derive title from filename
#             print(title)
#             entity_json_path = f"data/Entity_json/{title}"
#             print(entity_json_path)
#             if os.path.exists(entity_json_path):
                
#                 user_input = input(f"The title '{title}' is already in Entity_json. Would you like to process it again? (yes/no): ")
#                 if user_input.lower() != 'yes':
#                     print(f"Skipping processing for {title}.")
#                     continue
                 
#             # extracted_info = extract_information(llm, content, title)
#             # save_json_to_file(extracted_info, entity_json_path)
#             print(f"Processed and saved: {entity_json_path}")



def process_cleaned_data(cleaned_data_dir: str):
    """
    Run character extraction for every title directory in ``cleaned_data_dir``.

    Results are written to ``data/Entity_json/<title>/<file>.json``. When that
    directory already exists the user is asked whether to reprocess; answering
    anything other than "yes" skips the title, "yes" deletes the existing
    ``.json`` files in it before processing.
    """
    for title in os.listdir(cleaned_data_dir):
        source_dir = os.path.join(cleaned_data_dir, title)
        target_dir = os.path.join("data/Entity_json", title)

        if os.path.exists(target_dir):
            answer = input(f"The title '{title}' is already in Entity_json. Would you like to process it again? (yes/no): ")
            if answer.lower() != 'yes':
                print(f"Skipping processing for all files under the title '{title}'.")
                continue
            # Reprocessing: clear out the previous results first
            stale_files = [name for name in os.listdir(target_dir) if name.endswith('.json')]
            for name in stale_files:
                os.remove(os.path.join(target_dir, name))

        for filepath, content in read_files_from_directory(source_dir).items():
            extracted_info = extract_information(llm, content, title)
            json_filepath = os.path.join(
                target_dir, os.path.basename(filepath).replace(".txt", ".json")
            )
            save_json_to_file(extracted_info, json_filepath)
            print(f"Processed and saved: {json_filepath}")


In [None]:

# Example usage
# Process every title directory under data/cleaned_data; prompts on stdin
# when a title already has results in data/Entity_json.
process_cleaned_data("data/cleaned_data")

In [None]:
def _strip_code_fence(raw: str) -> str:
    """Return ``raw`` without a surrounding Markdown code fence (```), if one is present."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line; it may carry a language tag such as ```json
        newline_at = cleaned.find("\n")
        cleaned = cleaned[newline_at + 1:] if newline_at != -1 else cleaned[3:]
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def extract_narrative_elements(llm, text: str, title: str) -> list:
    """
    Ask the LLM for narrative elements (objects, locations, scenes, events) in ``text``.

    A system message describing the task and the expected JSON shape is sent
    together with a user message containing ``text``; the reply is parsed as JSON.

    Args:
        llm: Object exposing ``invoke(messages)`` whose result has a ``content`` string.
        text: Source text to analyse.
        title: Title of the show; interpolated into both messages.

    Returns:
        The parsed JSON value (a list of ``{"element", "description", "excerpts"}``
        objects when the model follows the example in the prompt), or ``[]`` when
        the reply is not valid JSON.
    """
    prompt = {
        "role": "system",
        "content": f"""
        I am analyzing the TV show '{title}' for the first time and need to extract detailed information about key narrative elements such as objects, locations, addresses, scenes, and events. Please identify these elements from the text below and provide:
        - A concise description or identification of the element.
        - A list of excerpts that discuss this element, ensuring each excerpt provides meaningful context.

        The response should be concise and structured in JSON format to facilitate analysis and visualization of these narrative elements in future iterations.

        An example of a valid response:
        [
            {{
                "element": "Shooting scene",
                "description": "Episode 3, Season 4",
                "excerpts": [
                    "The intense shooting scene in the third episode of the fourth season was pivotal to the plot development.",
                    "During the shootout, the main character's dilemma comes to a head, forcing a decision that changes the course of the story."
                ]
            }},
            {{
                "element": "Central Park",
                "description": "Location",
                "excerpts": [
                    "Several key discussions between the protagonists occur in Central Park, serving as a backdrop to their evolving relationships.",
                    "Central Park is depicted in multiple scenes as a place of reflection and confrontation among the characters."
                ]
            }}
        ]
        """
    }

    user_prompt = {
        "role": "user",
        "content": f"Below is part of that Wikipedia page on the TV show {title}.\n\n{text}"
    }

    combined_prompts = [prompt, user_prompt]  # This should be a list of dictionaries

    response = llm.invoke(combined_prompts)
    try:
        # Chat models frequently wrap JSON in a ```json ... ``` fence; unwrap it
        # first so an otherwise valid answer is not thrown away.
        return json.loads(_strip_code_fence(response.content))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Received malformed JSON response: {response.content}")
        return []  # Return an empty list to indicate failure




def process_cleaned_data_Events(directory_path: str, title: str = None, is_file: bool = False):
    """
    Extract narrative elements from cleaned text and store them as JSON.

    Args:
        directory_path: Path of a single text file when ``is_file`` is true,
            otherwise a directory handed to ``read_files_from_directory``.
        title: Title passed to ``extract_narrative_elements``. When omitted it
            is derived from the file name: the part before the first ``.`` in
            file mode, the part before the first ``_`` in directory mode.
        is_file: Selects file mode (``True``) or directory mode (``False``).

    Output goes through ``save_json_to_file`` to
    ``data/Events_json/<name>.json`` in file mode and to
    ``data/Events_json/<title>/<name>.json`` in directory mode.
    """
    if is_file:
        with open(directory_path, 'r', encoding='utf-8') as file:
            content = file.read()
        file_title = title or os.path.basename(directory_path).split('.')[0]
        extracted_info = extract_narrative_elements(llm, content, file_title)
        json_filename = os.path.basename(directory_path).replace(".txt", ".json")
        json_filepath = os.path.join("data/Events_json", json_filename)
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")
        return

    files_content = read_files_from_directory(directory_path)
    if not files_content:
        print("No files found in the directory.")
        return

    for filepath, content in files_content.items():
        # An explicitly supplied title wins; previously it was overwritten here
        # on every iteration and therefore silently ignored in directory mode.
        file_title = title or os.path.basename(filepath).split('_')[0]
        extracted_info = extract_narrative_elements(llm, content, file_title)
        json_filename = os.path.basename(filepath).replace(".txt", ".json")
        json_filepath = os.path.join("data/Events_json", file_title, json_filename)
        save_json_to_file(extracted_info, json_filepath)
        print(f"Processed and saved: {json_filepath}")

# Run narrative-element extraction in directory mode (is_file defaults to False, no explicit title).
process_cleaned_data_Events("data/cleaned_data")


In [None]:
import os
import json
from langchain_openai import ChatOpenAI

# Initialize the language model

def merge_json_files(directory_path):
    """
    Merge all ``.json`` files of a directory into a single list.

    Files are visited in sorted name order so the result is reproducible.
    A file holding a JSON array contributes its items; a file holding any other
    non-empty JSON value (for example a single object) contributes that value as
    one item. Empty documents and files that are not valid JSON are skipped (the
    latter with a printed message).

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        The merged list; empty when the directory does not exist or holds no data.
    """
    merged_data = []
    if not os.path.isdir(directory_path):
        print(f"No directory found at {directory_path}")
        return merged_data

    for file in sorted(os.listdir(directory_path)):
        if not file.endswith('.json'):
            continue
        with open(os.path.join(directory_path, file), 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON from file {file}: {e}")
                continue

        if not data:  # Ensure that data is not empty
            continue
        if isinstance(data, list):
            merged_data.extend(data)
        else:
            # extend() on a dict would add its keys, not the object itself
            merged_data.append(data)
    return merged_data

def process_title_entities(title_directory):
    """
    Merge every entity JSON file of one title into a single file.

    The merged list produced by ``merge_json_files(title_directory)`` is written
    to ``data/Merged_Entity_json/<title>/<title>_Merged.json``, where ``<title>``
    is the last path component of ``title_directory``. Nothing is written when
    there is no data to merge.

    Args:
        title_directory: Folder holding the per-file entity JSON of one title.
    """
    merged_data = merge_json_files(title_directory)
    if not merged_data:
        print("No data found to merge.")
        return

    # normpath drops a trailing separator; without it basename('dir/') is ''
    # and the output would land in '<root>/_Merged.json'.
    title = os.path.basename(os.path.normpath(title_directory))

    # Save the merged data
    merged_dir = os.path.join('data/Merged_Entity_json', title)
    os.makedirs(merged_dir, exist_ok=True)
    merged_file_path = os.path.join(merged_dir, f"{title}_Merged.json")
    with open(merged_file_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=4)
    print(f"Merged data saved to {merged_file_path}")

# Example usage: merge the per-file entity JSON of one title into a single file
title_directory = 'data/Entity_json/Interstellar (film)'  # Specify the subfolder for a specific title
process_title_entities(title_directory)


In [None]:
############################### For deduping with LLM (has ambiguities; see the better approach below) ##############################

import json
import os
from langchain_openai import ChatOpenAI
# Initialize the language model used by deduplicate_with_llm below (temperature=0)
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


def load_data(filepath):
    """
    Read a JSON document from ``filepath`` and return the parsed value.

    The file is decoded as UTF-8, matching how ``save_data`` writes it, instead
    of relying on the platform's default encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_data(data, filepath):
    """
    Write ``data`` to ``filepath`` as indented (4 spaces) UTF-8 JSON.

    Missing parent directories are created first, so a fresh output folder does
    not make the call fail.

    NOTE: this definition rebinds the name ``save_data``. The scraping helper
    ``save_data(data, dir_path, title, source)`` defined near the top of the
    notebook is shadowed once this cell has run.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def normalize_name(name):
    """Lower-case ``name`` and collapse every run of whitespace into a single space."""
    # split() without arguments also discards leading and trailing whitespace
    tokens = name.lower().split()
    return ' '.join(tokens)

def deduplicate_with_llm(entities):
    """
    Use the module-level ``llm`` to semantically deduplicate character entities.

    Every not-yet-merged entity is compared with each later one through one LLM
    call per pair. When the reply contains the standalone word "yes", the later
    entity's aliases and fragment are folded into the earlier one and the later
    entity is dropped from the result.

    Args:
        entities: List of dicts with a ``character`` name, a ``fragment`` string
            and optionally ``aliases``. The dicts are modified in place.

    Returns:
        The surviving entities. Each carries ``aliases`` and ``fragments`` as
        lists (in addition to its original ``fragment`` value).
    """
    import re  # local import: this cell does not import re itself

    deduplicated = []
    used = set()  # Track indices of entities that have been merged

    for i, current in enumerate(entities):
        if i in used:
            continue

        # Normalize and set up initial entity structure
        current['aliases'] = set([normalize_name(alias) for alias in current.get('aliases', [])] + [normalize_name(current['character'])])
        current['fragments'] = set([current['fragment']])

        for j in range(i + 1, len(entities)):
            if j in used:
                continue
            other = entities[j]

            # Normalize other entity data
            other['aliases'] = set([normalize_name(alias) for alias in other.get('aliases', [])] + [normalize_name(other['character'])])

            # Construct the prompt to check if entities are the same
            prompt = [
                {
                    "role": "system",
                    "content": f"Consider these entities:\n1. {current['character']} with aliases {list(current['aliases'])} and fragment: {current['fragment']}\n2. {other['character']} with aliases {list(other['aliases'])} and fragment: {other['fragment']}\nAre these descriptions of the same character?"
                }
            ]

            response = llm.invoke(prompt)  # Invoke the LLM with the prompt
            response_text = response.content  # Access the response content directly

            # Match "yes" only as a whole word: a plain substring test would
            # also fire on words such as "eyes" or "yesterday" in the reply.
            if re.search(r'\byes\b', response_text.lower()):
                current['aliases'].update(other['aliases'])
                current['fragments'].update([other['fragment']])
                used.add(j)

        # Convert sets back to lists for JSON serialization
        current['aliases'] = list(current['aliases'])
        current['fragments'] = list(current['fragments'])
        deduplicated.append(current)

    return deduplicated

# Example use of the function remains the same
# def deduplicate_with_llm(entities):
#     """ Use LLM to semantically deduplicate entities based on descriptions and context. """
#     deduplicated = []
#     used = set()  # Track indices of entities that have been merged

#     for i, current in enumerate(entities):
#         if i in used:
#             continue

#         current['aliases'] = set([normalize_name(alias) for alias in current.get('aliases', [])] + [normalize_name(current['character'])])
#         current['fragment'] = set(current.get('fragment', [current['fragment']]))

#         for j in range(i + 1, len(entities)):
#             if j in used:
#                 continue
#             other = entities[j]

#             other['aliases'] = set([normalize_name(alias) for alias in other.get('aliases', [])] + [normalize_name(other['character'])])
#             other['fragment'] = set(other.get('fragment', [other['fragment']]))

#             prompt = [
#                 {
#                     "role": "system",
#                     "content": f"""
#                     Please analyze the following character profiles:

#                     Profile 1:
#                     Name: {current['character']}
#                     Aliases: {', '.join(current['aliases'])}
#                     Key Fragments: {', '.join(current['fragment'])}

#                     Profile 2:
#                     Name: {other['character']}
#                     Aliases: {', '.join(other['aliases'])}
#                     Key Fragments: {', '.join(other['fragment'])}

#                     Considering their names, aliases, and the contexts provided by the key fragments, are these profiles describing the same individual? Evaluate their identities based on overlapping information and narrative connections.
#                     """
#                 }
#             ]

#             response = llm.invoke(prompt)  # Invoke the LLM with the prompt
#             response_text = response.content  # Access the response content directly
#             print(response_text)
#             # Check if the response indicates the entities are the same
#             if 'yes' in response_text.lower() or 'likely' in response_text.lower():
#                 current['aliases'].update(other['aliases'])
#                 current['fragment'].update(other['fragment'])
#                 used.add(j)

#         # Convert sets back to lists for JSON serialization
#         current['aliases'] = list(current['aliases'])
#         current['fragment'] = list(current['fragment'])
#         deduplicated.append(current)

#     return deduplicated
# Load the merged entity list for one title
entities = load_data('data/Merged_Entity_json/Interstellar (film)/Interstellar (film)_Merged.json')

# Deduplicate entities. deduplicate_with_llm calls the LLM once per compared pair
# and modifies the dicts inside `entities` in place.
deduplicated_entities = deduplicate_with_llm(entities)

# Save deduplicated data
save_data(deduplicated_entities, 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped.json')


In [38]:
############################### For Deduping without LLM ##############################
import json
from collections import defaultdict

def read_json(file_path):
    """Open ``file_path`` for reading and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

def write_json(data, file_path):
    """Serialize ``data`` as JSON (4-space indent) into ``file_path``, replacing any existing file."""
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=4)

def merge_entries(entries):
    """
    Merge entries that refer to the same character name (case-insensitive).

    Args:
        entries: Iterable of dicts with a ``character`` name and optionally
            ``aliases`` (list) and ``fragment`` (a single string or a list of
            strings, e.g. when the input was already merged once).

    Returns:
        One dict per distinct lower-cased character name, in first-seen order:
        ``character`` is the first spelling encountered, ``aliases`` the
        de-duplicated aliases in first-seen order, ``fragment`` a flat list of
        all fragments. The input dicts are not modified.
    """
    # Dictionary to hold merged characters
    merged_characters = {}

    for entry in entries:
        # Identify character key
        character_key = entry['character'].lower()

        aliases = entry.get('aliases') or []
        fragment = entry.get('fragment')
        if fragment is None:
            fragments = []
        elif isinstance(fragment, list):
            # Already-merged input: extend instead of nesting a list in a list
            fragments = list(fragment)
        else:
            fragments = [fragment]

        merged = merged_characters.setdefault(
            character_key,
            {'character': entry['character'], 'aliases': [], 'fragment': []},
        )
        # Order-preserving de-duplication keeps the output reproducible
        # (a set round-trip would shuffle aliases between runs).
        for alias in aliases:
            if alias not in merged['aliases']:
                merged['aliases'].append(alias)
        merged['fragment'].extend(fragments)

    # Convert merged data to list
    return list(merged_characters.values())

def process_json_file(input_file_path, output_file_path):
    """Read entries from ``input_file_path``, merge them by character name and write the result to ``output_file_path``."""
    loaded_entries = read_json(input_file_path)
    deduped_entries = merge_entries(loaded_entries)
    write_json(deduped_entries, output_file_path)

# Usage: rule-based (name-equality) dedup of the merged entity list for one title
input_file_path = 'data/Merged_Entity_json/Interstellar (film)/Interstellar (film)_Merged.json'
output_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped.json'

process_json_file(input_file_path, output_file_path)


In [None]:
! pip install --upgrade openai


In [37]:
############################### For deduping with LLM (has ambiguities; see the better approach below) ##############################
import json
from langchain_openai import ChatOpenAI

# Initialize the OpenAI chat model used by query_llm below (rebinds the global `llm`)
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def query_llm(current, other, title):
    """
    Ask the module-level ``llm`` whether two character entries of ``title`` are the same character.

    Returns the stripped reply text, or ``None`` when invoking the model raised
    (the error is printed).
    """
    indent = "            "
    # Same text as a triple-quoted block indented by twelve spaces, spelled out line by line
    question = (
        "\n"
        f"{indent}Consider these entities from the movie or TV show '{title}':\n"
        f"{indent}1. {current['character']} with aliases {list(current['aliases'])} and fragment: {current['fragment']}\n"
        f"{indent}2. {other['character']} with aliases {list(other['aliases'])} and fragment: {other['fragment']}\n"
        f"{indent}\n"
        f'{indent}Are these descriptions of the same character? Please respond with "yes" or "no".\n'
        f"{indent}"
    )
    messages = [{"role": "system", "content": question}]

    try:
        reply = llm.invoke(messages)
        return reply.content.strip()
    except Exception as e:
        print(f"Error in querying LLM: {e}")
        return None

def merge_entries_with_context(entries, title):
    """
    Merge entries that the LLM judges to describe the same character.

    Each entry is compared (via ``query_llm``) against the entries merged so
    far; on the first reply containing "yes" its aliases, its lower-cased name
    and its fragments are folded into that merged entry, otherwise it starts a
    new merged entry.

    Args:
        entries: List of dicts with ``character`` and optionally ``aliases`` and
            ``fragment`` (a single string or a list of strings).
        title: Movie / show title forwarded to ``query_llm`` for context.

    Returns:
        New list of merged entries whose ``aliases`` and ``fragment`` are lists.
        The input dicts are left untouched.
    """
    def _as_list(value):
        # Accept a missing value, a single item, or an existing collection
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    merged_data = []

    for current in entries:
        current_key = current['character'].lower()

        # Work on a normalised shallow copy: '+=' on the caller's own lists
        # would mutate the input, and 'list += str' would split a string
        # fragment into single characters.
        candidate = dict(current)
        candidate['aliases'] = _as_list(current.get('aliases'))
        candidate['fragment'] = _as_list(current.get('fragment'))

        found = False
        for merged_entry in merged_data:
            response = query_llm(candidate, merged_entry, title)

            if response and "yes" in response.lower():
                # If they are the same, merge aliases and fragments
                for alias in candidate['aliases'] + [current_key]:
                    if alias not in merged_entry['aliases']:
                        merged_entry['aliases'].append(alias)
                merged_entry['fragment'].extend(candidate['fragment'])
                found = True
                break

        if not found:
            merged_data.append(candidate)

    return merged_data

def read_json(file_path):
    """Return the parsed JSON content of the file at ``file_path``."""
    with open(file_path, 'r') as source:
        parsed = json.load(source)
        return parsed

def write_json(data, file_path):
    """Dump ``data`` to ``file_path`` as JSON indented by four spaces (the file is overwritten)."""
    with open(file_path, 'w') as sink:
        json.dump(data, sink, indent=4)

def process_json_file(input_file_path, output_file_path, title):
    """
    LLM-assisted dedup pipeline for one file.

    Loads the entries from ``input_file_path``, merges them with
    ``merge_entries_with_context`` (``title`` provides the context) and writes
    the merged list to ``output_file_path``.
    """
    source_entries = read_json(input_file_path)
    merged_entries = merge_entries_with_context(source_entries, title)
    write_json(merged_entries, output_file_path)

# Usage example: second pass over the rule-based dedup output, this time asking the LLM
input_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped.json'
output_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped_llm.json'
title = 'Interstellar'  # Example title; replace with the relevant movie or TV show title
process_json_file(input_file_path, output_file_path, title)


In [40]:
# NOTE(review): `process_json_file` is defined twice in this notebook: a two-argument
# rule-based version and a three-argument LLM version (input, output, title).
# This call passes two arguments, so it only works while the two-argument definition
# is the one currently bound; with the three-argument definition it raises TypeError.
input_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped_llm.json'
output_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Deduped_llm2.json'

process_json_file(input_file_path, output_file_path)

In [35]:
# Echo a JSON object back through the model, for debugging purposes

from openai import OpenAI
from openai.types.chat import ChatCompletionToolParam
from openai.types.shared_params import FunctionDefinition
client = OpenAI()

# Define a function that acts as a tool for the model
function = FunctionDefinition({
    'name': 'echo_json',
    'description': 'Echoes the provided JSON object.',
    'parameters': {
        'type': 'object',
        'properties': {'json_data': {'description': 'JSON data to echo back', 'type': 'string'}},
        'required': ['json_data']
    }
})

# Wrap the function definition as a chat-completion tool parameter and print it for inspection.
# NOTE(review): `toolc` is only printed in this cell; the request below is created without a `tools=` argument.
toolc = ChatCompletionToolParam({'type': 'function', 'function': function})
print(toolc)

# Sample JSON to echo
json_input = '''
[
    {
        "character": "Dr. Mann",
        "aliases": [
            "Mann"
        ],
        "fragment": [
            "Dr. Mann, a NASA astronaut, is sent to an icy planet during the Lazarus program."
        ]
    },
    {
        "character": "Mann",
        "aliases": [],
        "fragment": [
            "The comic is a prequel to the film with Mann as the protagonist."
        ]
    }
]
'''

# Create a chat completion request that asks the model to repeat `json_input` verbatim
completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant. Echo back any JSON data provided by the user.",
        },
        {
            "role": "user",
            "content": f"Please echo this JSON data: {json_input}",
        }
    ],
    model="gpt-3.5-turbo-16k",  # Make sure the model version supports your requirements
)


# Access and print the text of the first returned choice
print(completion.choices[0].message.content)


{'type': 'function', 'function': {'name': 'echo_json', 'description': 'Echoes the provided JSON object.', 'parameters': {'type': 'object', 'properties': {'json_data': {'description': 'JSON data to echo back', 'type': 'string'}}, 'required': ['json_data']}}}
[
    {
        "character": "Dr. Mann",
        "aliases": [
            "Mann"
        ],
        "fragment": [
            "Dr. Mann, a NASA astronaut, is sent to an icy planet during the Lazarus program."
        ]
    },
    {
        "character": "Mann",
        "aliases": [],
        "fragment": [
            "The comic is a prequel to the film with Mann as the protagonist."
        ]
    }
]


In [30]:
# Trial: deduplicate a small hand-written character list with a single chat completion

from openai import OpenAI

client = OpenAI()

# Title used for context in the prompt; the sample JSON data follows as a string
title = "Interstellar"
json_input = '''
[
    
    {
        "character": "Dr. Mann",
        "aliases": [
            "Mann"
        ],
        "fragment": [
            "Dr. Mann, a NASA astronaut, is sent to an icy planet during the Lazarus program."
        ]
    },
    {
        "character": "Mann",
        "aliases": [],
        "fragment": [
            "Mann was in the hibernation state when cooper arrives."
        ]
    },
    {
        "character": "Murphy Cooper",
        "aliases": [],
        "fragment": [
            "Today is my birthday and it's a special one because you once told me that when you came back we might be the same age."
        ]
    },
    {
        "character": "Cooper",
        "aliases": [
            "tom cooper",
            "cooper"
            
        ],
        "fragment": [
            "Referred to only as Cooper or Coop in the film.Cooper proudly identifies himself as an engineer as well as an astronaut and farmer but he has the soul of a goofball poet.Cooper's farewell to his daughter Murph, who's played by McKenzie Foy as a young girl, is shot very close-in and lit in warm cradling tones.We've always defined ourselves by the ability to overcome the impossible.Do not go gentle into that good night.Today is my birthday and it's a special one because you once told me that when you came back we might be the same age.Love is the one thing that transcends time and space.TARS talks plenty for both of us.Everybody good? Plenty of slaves for my robot colony.Joseph Cooper, a former NASA test pilot, reluctantly becomes a farmer after the agency was closed by the government.Dr. Amelia Brand, professor Brand's daughter and NASA scientist, is responsible for conducting planet colonization.Murphy Murph Cooper, Joseph's daughter, becomes a NASA scientist working under Professor Brand.Donald, Cooper's elderly father-in-law, supports the family as farmers.Tom Cooper, Joseph's son, eventually takes charge of his father's farm.Doyle, a high-ranking NASA member and Endurance crew member, is part of the team that travels through the wormhole.TARS is an intelligent robot assigned to assist the crew of the Endurance.CASE is another intelligent robot assigned to assist the crew of the Endurance."
        ]
    },
    {
        "character": "Joseph Cooper",
        "aliases": [
            "Cooper"
        ],
        "fragment": "Joseph Cooper, a former NASA test pilot, reluctantly becomes a farmer after the agency was closed by the government."
    }
]
'''


# Create a chat completion request: the system message describes the dedup task,
# the user message carries the title and the sample character JSON
completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant skilled in understanding complex character relationships and providing guidance on merging similar characters into single entries. Given a JSON object list of characters, their aliases, and descriptions(fragments), identify duplicates and suggest how their data might be merged. The aim is to reduce duplication and create a cohesive character profile for each unique individual. Output the deduped JSON Object. Make semantic deduping."
        },

        {
            "role": "user",
            "content": f"The title of the show is '{title}'. Here is the character data: {json_input}. For now return a json onject with removed duplication, also concatinate there fragments if found similar character",
        }
    ],
    model="gpt-3.5-turbo-16k"
)

# Print the model's response (text of the first choice)
print(completion.choices[0].message.content)


{
    "characters": [
        {
            "character": "Matthew McConaughey",
            "aliases": [],
            "fragments": [
                "Matthew McConaughey stars in Interstellar as the main protagonist.",
                "The teaser trailer for Interstellar debuted December 13, 2013 and featured clips related to space exploration accompanied by a voiceover by Matthew McConaughey's character Cooper."
            ]
        },
        {
            "character": "Anne Hathaway",
            "aliases": [],
            "fragments": [
                "Anne Hathaway joins the cast of Interstellar alongside Matthew McConaughey.",
                "When Murph grows up into Jessica Chastain, a key member of Caine's NASA crew and a surrogate for the daughter that the elder Brand lost to the Endurance's mission."
            ]
        },
        {
            "character": "Jessica Chastain",
            "aliases": [],
            "fragments": [
                "Jessica Chastain appear

In [33]:
# Latest semantic deduping using the LLM (working code)
from openai import OpenAI
import re
client = OpenAI()


def read_json(file_path):
    """Load the JSON file at ``file_path`` and hand back the decoded object."""
    with open(file_path, 'r') as json_file:
        return json.load(json_file)

# Function to write to a JSON file
def write_json(file_path, data):
    """
    Write ``data`` to ``file_path`` as JSON with a 4-space indent.

    Note the argument order here is ``(file_path, data)``.
    """
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)
        



def process_json_with_openai(title, json_input):
    """
    Ask the chat model to semantically deduplicate a list of character entries.

    Args:
        title: Title of the show, given to the model as context.
        json_input: Character data, either as a JSON string or as an already
            parsed Python object (list / dict), which is serialized to JSON
            before being placed into the prompt.

    Returns:
        The raw text of the model's first choice. It is not parsed here.
    """
    # Interpolating a parsed list/dict directly would embed its Python repr
    # (single-quoted keys, None/True/False), which is not JSON.
    if not isinstance(json_input, str):
        json_input = json.dumps(json_input, ensure_ascii=False)

    completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant skilled in understanding complex character relationships and providing guidance on merging similar characters into single entries. Given a JSON object list of characters, their aliases, and descriptions(fragments), identify duplicates and suggest how their data might be merged. The aim is to reduce duplication and create a cohesive character profile for each unique individual. Output the deduped JSON Object. Make semantic deduping."
            },

            {
                "role": "user",
                "content": f"The title of the show is '{title}'. Here is the character data: {json_input}. For now return a json onject with removed duplication, also concatinate there fragments if found similar character",
            }
        ],
        model="gpt-3.5-turbo-16k"
    )
    res = completion.choices[0].message.content
    return res




input_file_path = 'data/Merged_Entity_json/Interstellar (film)/Interstellar (film)_Merged.json'  # Path to your input JSON file
output_file_path = 'data/Deduped_Entity_json/Interstellar/Interstellar_Desuped_llm3.json'  # Path to save the output JSON file

# Read the input JSON file (input_data is the parsed Python object, not a string)
input_data = read_json(input_file_path)
title = "Interstellar"
# Process the JSON with OpenAI; output_data is the model's raw reply text
output_data = process_json_with_openai(title,(input_data))
print(output_data)

# Pull the first "[ { ... } ]" span out of the reply.
# The pattern is non-greedy: the match ends at the first '}' that is followed
# (after optional whitespace) by ']'.
json_array_match = re.search(r'\[\s*\{[\s\S]*?\}\s*\]', output_data)
if json_array_match:
    json_array_str = json_array_match.group(0)
    print("Extracted JSON Array String:\n", json_array_str)
else:
    print("No JSON array found in the response")
    json_array_str = '[]'  # Fallback to an empty list if no JSON is found

# Parse the extracted JSON array; on failure an empty list is used instead
try:
    parsed_data = json.loads(json_array_str)
    print("Parsed Data:\n", parsed_data)
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")
    parsed_data = []


# Write the output JSON file (this cell's write_json takes the path first, then the data).
# NOTE(review): when extraction or parsing failed above, this still writes `[]` to the output file.
write_json(output_file_path, parsed_data)

print(f"Processed JSON has been saved to {output_file_path}")

{
  "characters": [
    {
      "character": "Matthew McConaughey",
      "aliases": [],
      "fragment": "Matthew McConaughey stars in Interstellar as the main protagonist. The teaser trailer for Interstellar debuted December 13, 2013 and featured clips related to space exploration accompanied by a voiceover by Matthew McConaughey's character Cooper."
    },
    {
      "character": "Anne Hathaway",
      "aliases": [],
      "fragment": "Anne Hathaway joins the cast of Interstellar alongside Matthew McConaughey. When Murph grows up into Jessica Chastain, a key member of Caine's NASA crew and a surrogate for the daughter that the elder Brand lost to the Endurance's mission."
    },
    {
      "character": "Jessica Chastain",
      "aliases": [],
      "fragment": "Jessica Chastain appears in Interstellar as one of the main characters."
    },
    {
      "character": "Christopher Nolan",
      "aliases": [],
      "fragment": "Christopher Nolan directs Interstellar, bringing his uni

In [7]:
%pip install --upgrade openai


Note: you may need to restart the kernel to use updated packages.
