In [None]:
import json
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import tiktoken
from dotenv import load_dotenv
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import tiktoken
from urllib.parse import unquote
from urllib.request import urlopen
# Cell to import necessary libraries
import re
import os
import openai
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
# word_tokenize (used by chunk_text below) needs the Punkt tokenizer data in
# addition to the stopword list. Recent NLTK releases look the data up under
# 'punkt_tab', older ones under 'punkt', so fetch both.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

from langchain.chains.openai_functions import (
    create_structured_output_runnable,
    create_structured_output_chain,
)


# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message. The previous self-assignment
# `os.environ[...] = os.getenv(...)` raised an opaque TypeError when the key
# was missing and did nothing when it was present.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

# Shared chat model used by context_based_cleaning; temperature=0 is passed to
# keep the cleaning output as repeatable as the API allows.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


In [None]:
##################### For cleaning scraped data and dividing it into chunks on the basis of tokenization ############################



def normalize_text(text):
    """Lowercase ``text``, drop punctuation and collapse whitespace.

    Punctuation is removed *before* whitespace is collapsed: deleting a
    free-standing symbol such as " - " would otherwise leave a run of spaces
    behind. Leading and trailing whitespace is stripped as well.

    :param text: Raw text.
    :return: Lowercased text containing only word characters (letters, digits,
        underscore) separated by single spaces.
    """
    text = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse any whitespace run (including newlines and tabs) to one space.
    return re.sub(r'\s+', ' ', text).strip()

def chunk_text(text, chunk_size=4000, min_last_chunk=1000):
    """Split ``text`` into space-joined chunks of at most ``chunk_size`` tokens.

    Tokens come from ``word_tokenize``. If the final chunk would hold fewer
    than ``min_last_chunk`` tokens it is appended to the previous chunk instead
    of being emitted on its own, so that merged chunk may exceed ``chunk_size``.

    :param text: Text to split.
    :param chunk_size: Maximum number of tokens per chunk; must be positive.
    :param min_last_chunk: Smallest acceptable size for a trailing chunk.
    :return: List of chunk strings; empty when ``text`` has no tokens.
    :raises ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    words = word_tokenize(text)
    # Keep the chunks as token lists until the end so the tail can be measured
    # directly instead of re-tokenizing an already joined string.
    pieces = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]

    # Handle the last chunk; merge with previous if too small
    if len(pieces) > 1 and len(pieces[-1]) < min_last_chunk:
        tail = pieces.pop()
        pieces[-1] = pieces[-1] + tail

    return [' '.join(piece) for piece in pieces]

def save_chunks(chunks, base_dir, filename):
    """Write each chunk to ``<base_dir>/<filename>_chunk<N>.txt`` (N starts at 1).

    ``base_dir`` is created when missing. Files are opened in write mode with
    UTF-8 encoding, so an existing chunk file of the same name is overwritten.

    :param chunks: Iterable of chunk strings.
    :param base_dir: Destination directory.
    :param filename: Name stem shared by all chunk files.
    """
    os.makedirs(base_dir, exist_ok=True)
    for number, body in enumerate(chunks, start=1):
        target = os.path.join(base_dir, f"{filename}_chunk{number}.txt")
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(body)

def clean_data(scraped_dir, cleaned_dir):
    """Normalize and chunk every scraped file under ``scraped_dir/<title>/``.

    For each ``<title>`` sub-directory and each regular file in it, the text is
    passed through ``normalize_text`` and ``chunk_text`` and written by
    ``save_chunks`` to ``cleaned_dir/<title>/<stem>_chunk<N>.txt``. A file whose
    chunks already exist is skipped, so finished work is not redone on a re-run.

    NOTE: a later cell in this notebook defines another ``clean_data`` that
    takes a single ``text`` argument; once that cell has run, this name refers
    to that function instead.

    :param scraped_dir: Directory holding one sub-directory per title.
    :param cleaned_dir: Root directory for cleaned chunks (created as needed).
    """
    # Matches exactly the names save_chunks produces and captures the stem, so
    # "page" is not mistaken for cleaned just because "page2_chunk1.txt" exists.
    chunk_file = re.compile(r'(.*)_chunk\d+\.txt')

    for title in os.listdir(scraped_dir):
        title_path = os.path.join(scraped_dir, title)
        # Stray files next to the title folders (e.g. OS metadata) are not titles.
        if not os.path.isdir(title_path):
            continue

        cleaned_title_path = os.path.join(cleaned_dir, title)
        os.makedirs(cleaned_title_path, exist_ok=True)

        # Stems that already have at least one chunk file on disk.
        cleaned_stems = set()
        for existing in os.listdir(cleaned_title_path):
            match = chunk_file.fullmatch(existing)
            if match:
                cleaned_stems.add(match.group(1))

        for file in os.listdir(title_path):
            file_path = os.path.join(title_path, file)
            # Nested directories cannot be opened as text; leave them alone.
            if not os.path.isfile(file_path):
                continue

            # Strip only a trailing ".txt"; str.replace would also remove the
            # sequence from the middle of a name.
            stem = file[:-len('.txt')] if file.endswith('.txt') else file

            # Check if already cleaned
            if stem in cleaned_stems:
                print(f"Already cleaned data present for {file_path}")
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Clean the text
            chunks = chunk_text(normalize_text(text))

            # Save cleaned chunks
            save_chunks(chunks, cleaned_title_path, stem)
            if chunks:
                cleaned_stems.add(stem)
            print(f"Data cleaned and saved for {file_path}")




In [None]:
# Example usage: normalize and chunk everything under data/scraped_data.
# NOTE: this call needs the two-argument clean_data defined above. A later cell
# rebinds the name clean_data to a one-argument function, so run the cells in
# order (re-running this cell after that one raises a TypeError).
scraped_dir = 'data/scraped_data'
cleaned_dir = 'data/cleaned_data'
clean_data(scraped_dir, cleaned_dir)

In [None]:
# Cell to define the text normalization function
def normalize_text(text):
    """Return ``text`` lowercased, without punctuation, single-spaced and stripped.

    Characters that are neither word characters nor whitespace are deleted,
    then every whitespace run is replaced by one space.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()


In [None]:
# Cell to define the function to remove unnecessary characters
def remove_unnecessary_characters(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text


In [None]:
# Cell to define the context-based cleaning function using LangChain and OpenAI API


def context_based_cleaning(input_text: str, context: str) -> str:
    """
    Clean the input text based on the given context using ChatOpenAI.

    The text is split into token-bounded chunks, each chunk is sent to the
    module-level ``llm`` together with the same system prompt, and the
    responses are joined with newlines. Empty input yields an empty string
    without any model call.

    :param input_text: The text to be cleaned.
    :param context: The context to guide the cleaning process.
    :return: The cleaned text.
    """
    def split_text(text, max_tokens):
        # Count tokens with cl100k_base, the encoding tiktoken maps the
        # gpt-3.5-turbo family to, so the budget is measured in the same
        # units as the model behind `llm`.
        enc = tiktoken.get_encoding('cl100k_base')
        tokens = enc.encode(text)

        # Slice the token list and decode each slice back to text.
        # NOTE(review): a slice boundary can fall inside a multi-byte
        # character, in which case decode() may emit a replacement character.
        return [
            enc.decode(tokens[start:start + max_tokens])
            for start in range(0, len(tokens), max_tokens)
        ]

    max_tokens_per_chunk = 5000 - 385  # Leaving room for prompt tokens

    # The system message depends only on `context`, so build it once.
    system_message = {
        "role": "system",
        "content": f"""You are a text cleaning assistant. Your task is to clean the input text based on the provided context.
    ## Context
    {context}
    ## Instructions
    - Remove irrelevant information.
    - Correct grammatical errors.
    - Ensure the text is clear and concise.
    - Maintain the original meaning as much as possible.
    ## Output
    Provide the cleaned version of the text in the same language as the input."""
    }

    cleaned_chunks = []
    for chunk in split_text(input_text, max_tokens_per_chunk):
        user_message = {
            "role": "user",
            "content": f"Clean the following text based on the context:\n\n{chunk}"
        }
        response = llm.invoke([system_message, user_message])
        cleaned_chunks.append(response.content)

    # Concatenate the cleaned chunks to get the full cleaned text
    return "\n".join(cleaned_chunks)


# Example usage
# input_text = "Ths is a smple textt with somee speling mistkes and extraneous infformation dsfsdda44 54454 sfdfad.;;';."
# context = "This text is a part of a formal document and should be cleaned accordingly."

# cleaned_text = context_based_cleaning(input_text, context)

In [None]:
# Cell to define the main cleaning function
def clean_data(text):
    """Clean one document: drop URLs/HTML/special characters, then normalize.

    ``remove_unnecessary_characters`` runs first because it recognises URLs and
    tags by their punctuation; running ``normalize_text`` first would strip
    that punctuation and leave the tag names and URL letters in the text.
    Normalizing last also collapses the gaps left by the removed spans.

    :param text: Raw scraped text.
    :return: Cleaned text.
    """
    text = remove_unnecessary_characters(text)
    text = normalize_text(text)
    # Optional LLM pass, currently disabled:
    # context = "This text is a part of a formal document and should be cleaned accordingly.You are a helpful assistant. Clean and standardize the following text to make it more readable and consistent."
    # text = context_based_cleaning(text, context)
    return text


In [None]:
# Cell to create directories for cleaned data based on title and source
def create_cleaned_directory_structure(base_dir, title, source):
    """Ensure ``base_dir/title/source`` exists and return its path.

    Uses ``exist_ok=True`` rather than a separate existence check, so the call
    is safe to repeat and cannot fail if the directory appears between the
    check and the creation.

    :param base_dir: Root of the cleaned-data tree.
    :param title: Title sub-directory name.
    :param source: Source sub-directory name.
    :return: The joined directory path.
    """
    dir_path = os.path.join(base_dir, title, source)
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


In [None]:
# Cell to clean the scraped data and save it in the cleaned_data directory
def clean_and_save_scraped_data(scraped_dir, cleaned_dir):
    """Clean every ``.txt`` file under ``scraped_dir`` into a mirrored tree.

    Each file is read as UTF-8, passed to the single-argument
    ``clean_data(text)``, and written under ``cleaned_dir`` at the same
    relative path it has under ``scraped_dir``. For the
    ``<title>/<source>/<file>`` layout this is the same destination as before;
    files at other depths no longer raise on unpacking or get a directory
    named after the file, and deeper trees are not flattened onto each other.

    :param scraped_dir: Root directory of the scraped text files.
    :param cleaned_dir: Root directory for the cleaned output (created as needed).
    """
    for root, _dirs, files in os.walk(scraped_dir):
        txt_files = [name for name in files if name.endswith(".txt")]
        if not txt_files:
            continue

        # Mirror this directory's position relative to scraped_dir.
        relative_dir = os.path.relpath(root, scraped_dir)
        if relative_dir == os.curdir:
            cleaned_dir_path = cleaned_dir
        else:
            cleaned_dir_path = os.path.join(cleaned_dir, relative_dir)
        os.makedirs(cleaned_dir_path, exist_ok=True)

        for file in txt_files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = f.read()

            cleaned_data = clean_data(data)

            cleaned_file_path = os.path.join(cleaned_dir_path, file)
            with open(cleaned_file_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_data)

            print(f"Cleaned data saved to: {cleaned_file_path}")


In [None]:

# Example usage: clean every .txt file found under data/scraped_data.
# NOTE(review): cleaned_dir is the same 'data/cleaned_data' tree that the
# chunking example earlier in this notebook writes to — confirm that sharing
# one output tree between the two pipelines is intended.

scraped_dir = "data/scraped_data"
cleaned_dir = "data/cleaned_data"

clean_and_save_scraped_data(scraped_dir, cleaned_dir)
