In [None]:

import json
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import tiktoken
from dotenv import load_dotenv
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import tiktoken
from urllib.parse import unquote
from urllib.request import urlopen
# Cell to import necessary libraries
import re
import os
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
nltk.download('stopwords')

from langchain.chains.openai_functions import (
    create_structured_output_runnable,
    create_structured_output_chain,
)


load_dotenv()
# Values stored in os.environ must be strings: assigning the None that
# os.getenv returns for a missing variable raises a TypeError. Only re-export
# the key when it is actually present, and say so clearly when it is not, so
# the scraping cells below stay usable without an OpenAI key.
_openai_api_key = os.getenv('OPENAI_API_KEY')
if _openai_api_key:
    os.environ["OPENAI_API_KEY"] = _openai_api_key
else:
    print("Warning: OPENAI_API_KEY is not set; OpenAI-backed cells will fail until it is provided.")




In [None]:
#################### Updated tool for scraping ############################

def fetch_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded body of the HTTP response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that error pages are never mistaken for article content.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def parse_page(html, base_url):
    """Extract the article text, title-related links and title from a page.

    Args:
        html: Raw HTML of the page.
        base_url: URL the HTML was downloaded from. Relative links other than
            ``/wiki/`` paths are resolved against it, and links pointing back
            to this page are dropped.

    Returns:
        A ``(text, links, title_text)`` tuple:

        * ``text`` - text of the ``div`` with id ``mw-content-text``, or an
          empty string when the page has no such element.
        * ``links`` - de-duplicated absolute http(s) URLs (fragments removed)
          of anchors whose visible text contains the page title.
        * ``title_text`` - text of the first ``<h1>`` with ``/`` replaced by
          ``_`` so it can be used as a directory name; ``"untitled"`` when
          the page has no (or an empty) ``<h1>``.
    """
    # Local import: the notebook's import cell only brings in `unquote`.
    from urllib.parse import urldefrag, urljoin

    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title for the directory name. A page without an <h1> used
    # to crash here; an empty title would also match every link below.
    title_tag = soup.find('h1')
    raw_title = title_tag.text if title_tag else ""
    title_text = raw_title.replace('/', '_') or "untitled"

    # Extract all text from the main content
    content = soup.find('div', {'id': 'mw-content-text'})
    text = content.get_text() if content else ""

    page_url = base_url or ""
    seen = {urldefrag(page_url).url}  # never hand back a link to this very page
    links = []
    for anchor in soup.find_all('a', href=True):
        # Keep only links whose visible text mentions the main title
        if not (anchor.text and title_text in anchor.text):
            continue

        href = anchor['href']
        if href.startswith('/wiki/'):
            absolute = 'https://en.wikipedia.org' + href
        else:
            # Resolve "#section", "//host/..." and other relative forms so the
            # caller can pass every returned link straight to requests.
            absolute = urljoin(page_url, href)
        absolute = urldefrag(absolute).url

        # Skip mailto:/javascript: targets and anything still not fetchable
        if not absolute.startswith(('http://', 'https://')):
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        links.append(absolute)

    return text, links, title_text
    

def scrape_wikipedia(start_url, max_links=10):
    """Scrape a Wikipedia article plus a few title-related pages and save the text.

    The article at ``start_url`` is downloaded, then up to ``max_links`` of
    the links returned by ``parse_page`` are downloaded as well and their text
    is appended. The combined text is written to
    ``data/scraped_data/<title>/<title>_<timestamp>_Wikipedia.txt``.

    Args:
        start_url: URL of the article to start from.
        max_links: Maximum number of related links to follow (limits the
            number of HTTP requests).

    Returns:
        A ``(text, title)`` tuple with the combined text and the page title
        used for the output directory.
    """
    # Local import: the notebook's import cell does not provide `datetime`,
    # which previously made this function fail with a NameError.
    import datetime

    html = fetch_page(start_url)
    text, links, title = parse_page(html, start_url)

    # Links are not followed recursively; only their text is collected.
    for link in links[:max_links]:
        try:
            linked_html = fetch_page(link)
        except requests.RequestException as error:
            # One unreachable or malformed link must not discard everything
            # that has been collected so far.
            print(f"Skipping {link}: {error}")
            continue
        page_text, _, _ = parse_page(linked_html, link)
        text += "\n\n" + page_text

    base_dir = os.path.join('data/scraped_data', title)
    os.makedirs(base_dir, exist_ok=True)  # Create directory if it does not exist

    # Timestamp the filename so repeated runs never overwrite each other
    now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = f"{title}_{now}_Wikipedia.txt"
    filepath = os.path.join(base_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(text)

    return text, title




In [None]:
# Example usage: scrape one article (plus related links) with scrape_wikipedia,
# then print the number of characters collected and a confirmation message.
# `url`, `collected_text` and `title` become notebook-level variables.
url = 'https://en.wikipedia.org/wiki/Interstellar_(film)'
collected_text, title = scrape_wikipedia(url)
print(len(collected_text))
print(f"Data collected for {title} and saved to the respective directory.")

In [None]:
######################### Scrape further links for fetched entities such as characters, actors, director, etc. ##################

def read_all_json_files(directory_path):
    """Collect the unique character names and aliases from a directory of JSON files.

    Every ``*.json`` file in ``directory_path`` is expected to hold a list of
    objects with a ``"character"`` key and an optional ``"aliases"`` list.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A set with every non-empty character name and alias found.

    Raises:
        KeyError: If an entry has no ``"character"`` key.
    """
    names = set()
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        for item in data:
            # "aliases" may be missing or explicitly null; both mean "none".
            candidates = [item['character'], *(item.get('aliases') or [])]
            # Empty/null names are dropped: an empty string is a substring of
            # every link title and would make every link look relevant.
            names.update(name for name in candidates if name)
    return names

def find_relevant_links(soup, names):
    """Return the hrefs of links whose ``title`` attribute mentions any of ``names``.

    Args:
        soup: Parsed page; only ``soup.find_all('a', href=True, title=True)``
            is used.
        names: Iterable of name strings to look for (substring match).

    Returns:
        A list of matching ``href`` values in document order, each listed once.
    """
    # An empty name is a substring of every title and would match everything.
    usable_names = [name for name in names if name]

    relevant_links = []
    seen = set()
    for link in soup.find_all('a', href=True, title=True):
        href = link['href']
        if href in seen:
            continue  # the same article is usually linked many times per page
        if any(name in link['title'] for name in usable_names):
            seen.add(href)
            relevant_links.append(href)
    return relevant_links

def scrape_relevant_content(start_url, root_json_dir, timeout=30):
    """Scrape the Wikipedia articles linked from ``start_url`` that mention known names.

    The page title is derived from the last path segment of ``start_url``;
    the names to look for are read from the JSON files in
    ``<root_json_dir>/<title>``. Each matching article is saved to
    ``data/targeted_scraped_data/<page title>/<page title>.txt``.

    Args:
        start_url: URL of the article whose links are inspected.
        root_json_dir: Directory containing one sub-directory of entity JSON
            files per title.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.RequestException: If the start page itself cannot be fetched.
            Failures on individual linked pages are reported and skipped.
    """
    title = unquote(start_url.split('/')[-1]).replace('_', ' ')
    json_dir_path = os.path.join(root_json_dir, title)

    if not os.path.exists(json_dir_path):
        print(f"No JSON directory found for title: {title}")
        return

    names = read_all_json_files(json_dir_path)
    response = requests.get(start_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    base_url = 'https://en.wikipedia.org'
    fetched_paths = set()

    for href in find_relevant_links(soup, names):
        # Drop "#section" fragments: they address the same article.
        article_path = href.split('#', 1)[0]
        # Only site-relative article paths can be appended to base_url;
        # absolute URLs and "/w/index.php?..." red links are not articles.
        if not article_path.startswith('/wiki/') or article_path in fetched_paths:
            continue
        fetched_paths.add(article_path)

        full_url = base_url + article_path
        try:
            page_response = requests.get(full_url, timeout=timeout)
            page_response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping {full_url}: {error}")
            continue

        page_soup = BeautifulSoup(page_response.text, 'html.parser')
        content = page_soup.find('div', id='mw-content-text')
        if content is None:
            print(f"Skipping {full_url}: no article content found")
            continue

        # '/' can reappear after unquoting (%2F) and must not end up in a path
        page_title = unquote(article_path.split('/')[-1]).replace('_', ' ').replace('/', '_')
        dir_path = os.path.join('data/targeted_scraped_data', page_title)
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, f"{page_title}.txt")

        # Extract and save only the textual content from the article
        text_content = content.get_text(separator='\n', strip=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text_content)
        print(f"Saved content related to {page_title} to {file_path}")



In [None]:
# Demo: follow the entity-related links of the "Interstellar (film)" article,
# using the entity JSON files stored under data/Entity_json.
scrape_relevant_content(
    start_url='https://en.wikipedia.org/wiki/Interstellar_(film)',
    root_json_dir='data/Entity_json',
)


In [None]:
######################### Scrape further links present in the provided wiki page (may get stuck in a recursion loop) ##################

def scrape_wikipedia_content(url, depth=0, max_depth=1, iteration=1, max_iterations=20, visited=None):
    """Crawl Wikipedia from ``url`` and save the text of every page reached.

    Pages are visited depth-first in document order. Each page's
    ``mw-content-text`` is written to
    ``data/rough_scraped_data/<title>/<title>_Wikipedia_chunk<N>.txt`` where
    ``N`` is a counter shared by the whole crawl. The counter used to be
    copied into every recursive branch, so ``max_iterations`` never limited
    the total number of pages and sibling pages received the same chunk
    number; an explicit stack with a single counter fixes both.

    Args:
        url: Page to start from.
        depth: Depth assigned to ``url``.
        max_depth: Pages deeper than this are not visited.
        iteration: Chunk number given to the first saved page.
        max_iterations: Highest chunk number allowed; the crawl stops once the
            counter exceeds it.
        visited: Optional set of URLs to treat as already scraped. It is
            updated in place with every URL visited by this call.

    Returns:
        None. Progress is reported with ``print``.
    """
    if visited is None:
        visited = set()  # Initialize the set of visited URLs

    # Stack of (url, depth) pairs; children are pushed in reverse so they are
    # popped in document order, matching a recursive depth-first walk.
    pending = [(url, depth)]

    while pending:
        if iteration > max_iterations:
            print(f"Stopping after reaching the limit of {max_iterations} pages")
            break

        current_url, current_depth = pending.pop()
        if current_depth > max_depth or current_url in visited:
            continue
        visited.add(current_url)

        try:
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping {current_url}: {error}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # Fall back to the URL's last segment when the page has no heading
        heading = soup.find(id='firstHeading')
        raw_title = heading.text if heading is not None else unquote(current_url.split('/')[-1])
        title = raw_title.replace('/', '_')  # Sanitize title for file path
        dir_path = os.path.join('data/rough_scraped_data', title)
        os.makedirs(dir_path, exist_ok=True)

        content = soup.find(id='mw-content-text')
        if content:
            file_path = os.path.join(dir_path, f"{title}_Wikipedia_chunk{iteration}.txt")
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content.get_text())
            print(f"Saved content to {file_path}")
        iteration += 1

        # Links of pages already at the depth limit would be rejected anyway
        if current_depth >= max_depth:
            continue

        child_urls = []
        for link in soup.find_all('a', href=True):
            href = link['href'].split('#', 1)[0]  # "#section" is the same page
            # ':' marks namespace pages such as File:, Help:, Category:
            if href.startswith('/wiki/') and ':' not in href:
                full_url = f'https://en.wikipedia.org{href}'
                if full_url not in visited:
                    child_urls.append(full_url)
        pending.extend((child, current_depth + 1) for child in reversed(child_urls))




In [None]:
# Demo: crawl outward from the "Interstellar (film)" article with the default limits
scrape_wikipedia_content(
    url='https://en.wikipedia.org/wiki/Interstellar_(film)',
)

In [None]:
#################### For scraping using the Wikipedia API, but currently not in use (problem: does not fetch all data) ############################
# Main function to scrape and save data

# General function to scrape data from any URL
def scrape_general(url, timeout=30):
    """Download any web page and return all of its text.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the whole document as produced by ``get_text()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never saved as scraped data.
        requests.RequestException: For connection problems and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Raw bytes are passed on so the parser can work out the encoding itself
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

# Function to create directories based on title and source
def create_directory_structure(base_dir, title, source):
    """Make sure ``<base_dir>/<title>/<source>`` exists and return its path.

    Args:
        base_dir: Root directory for scraped data.
        title: Name of the scraped subject (first sub-directory).
        source: Name of the data source (second sub-directory).

    Returns:
        The path of the directory, which exists when the function returns.
    """
    dir_path = os.path.join(base_dir, title, source)
    # exist_ok avoids the gap between checking for the directory and creating it
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


# Function to save data to a file with structured naming
def save_data(data, dir_path, title, source):
    """Write ``data`` to a timestamped text file inside ``dir_path``.

    The file is named ``<title>_<source>_<YYYYmmdd_HHMMSS>.txt`` and encoded
    as UTF-8.

    Args:
        data: Text to write.
        dir_path: Existing directory that receives the file.
        title: Subject name used as the first part of the filename.
        source: Source name used as the second part of the filename.

    Returns:
        The path of the file that was written.
    """
    # Local import: the notebook's import cell does not provide `datetime`,
    # which previously made this function fail with a NameError.
    from datetime import datetime

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_name = f"{title}_{source}_{timestamp}.txt"
    file_path = os.path.join(dir_path, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(data)
    return file_path


def scrape_and_save(title, url=None):
    """Scrape data about ``title`` and store it under ``scraped_data``.

    When ``url`` is given, that page is scraped with ``scrape_general`` and
    the URL's host name is used as the source. Otherwise the Wikipedia
    article named ``title`` is scraped with ``scrape_wikipedia`` (which also
    keeps its own copy of the text) and the source is ``"wikipedia"``.

    Args:
        title: Subject name; also the Wikipedia article name when ``url`` is
            omitted (for example ``"Interstellar_(film)"``).
        url: Optional page to scrape instead of Wikipedia.

    Returns:
        The path of the file written by ``save_data``.
    """
    base_dir = "scraped_data"
    if url:
        # Host part of the URL, e.g. "example.org" for "https://example.org/a"
        source = url.split("//")[-1].split("/")[0]
        data = scrape_general(url)
    else:
        source = "wikipedia"
        # scrape_wikipedia expects a full URL and returns (text, title); the
        # bare title used to be passed in and the tuple written out as text.
        wikipedia_url = 'https://en.wikipedia.org/wiki/' + title.replace(' ', '_')
        data, _ = scrape_wikipedia(wikipedia_url)

    dir_path = create_directory_structure(base_dir, title, source)
    file_path = save_data(data, dir_path, title, source)
    return file_path


In [None]:
# Example Usage
# Calls scrape_and_save with only a title (no URL), then prints the path it returns.
# `title` and `wikipedia_url` become notebook-level variables.

title = "Interstellar_(film)"
# NOTE: wikipedia_url is defined here but is not passed to scrape_and_save below.
wikipedia_url = "https://en.wikipedia.org/wiki/Interstellar_(film)"
# fandom_url = "https://interstellarfilm.fandom.com/wiki/Interstellar_Wiki"

# Scrape and save data from Wikipedia
wiki_file_path = scrape_and_save(title)
print(f"Data saved to: {wiki_file_path}")

# # Scrape and save data from Fandom
# fandom_file_path = scrape_and_save(title, fandom_url)
# print(f"Data saved to: {fandom_file_path}")