In [11]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` produces for ``text``.
    """
    return word_tokenize(text)

from nltk.corpus import stopwords

def remove_stopwords(tokens):
    """Drop every token whose lowercase form is an NLTK English stopword.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The surviving tokens, in their original order and casing.
    """
    # Membership is tested on the lowercased token, but the token itself is
    # kept unchanged.
    english_stopwords = frozenset(stopwords.words("english"))
    kept = []
    for token in tokens:
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return kept

from nltk.stem import PorterStemmer

def apply_stemming(tokens):
    """Stem each token with a freshly created ``PorterStemmer``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

from nltk.stem import WordNetLemmatizer

def apply_lemmatization(tokens):
    """Lemmatize each token with a freshly created ``WordNetLemmatizer``.

    ``lemmatize`` is called with the token only, so the lemmatizer's default
    arguments apply.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas




In [12]:
from sklearn.feature_extraction.text import CountVectorizer

def construct_bag_of_words(tokens):
    """Build a bag-of-words count vector for a single list of tokens.

    The tokens are joined with spaces into one document, and a new
    ``CountVectorizer`` is fitted on that document alone.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The first (and only) row of the dense count matrix. The fitted
        vectorizer is local to this function, so callers do not get the
        column-to-term mapping.
    """
    document = ' '.join(tokens)
    count_matrix = CountVectorizer().fit_transform([document])
    return count_matrix.toarray()[0]


def preprocess_text(text):
    """Run the full preprocessing chain on ``text`` and return its bag-of-words.

    Steps, in order: ``tokenize_text`` -> ``remove_stopwords`` ->
    ``apply_stemming`` -> ``construct_bag_of_words``.

    Args:
        text: Raw input string.

    Returns:
        The vector produced by ``construct_bag_of_words`` for the stemmed,
        stopword-free tokens.
    """
    # Stemming is the active normalisation step; apply_lemmatization can be
    # substituted for apply_stemming here if lemmas are preferred.
    stemmed = apply_stemming(remove_stopwords(tokenize_text(text)))
    return construct_bag_of_words(stemmed)



In [13]:
# Slice budget in MB. The functions in the visible cells either take
# `standard_size` as a parameter or assign their own local copy, so none of
# them reads this module-level value.
standard_size = 128  # Standard size in MB

def divide_into_slices(input_text, standard_size):
    """Tokenize ``input_text`` and split the tokens into near-equal slices.

    Args:
        input_text: Text to tokenize with ``tokenize_text``.
        standard_size: Slice budget in MB; converted to a token count by
            ``convert_mb_to_tokens``.

    Returns:
        list: Consecutive token lists that together cover every token. No
        slice is longer than the token budget, and there are at most
        ``ceil(len(tokens) / budget)`` slices. Empty when the text yields no
        tokens.
    """
    # Clamp to a positive integer so a zero or fractional budget cannot cause
    # a division by zero or a non-integer slice length below.
    tokens_per_slice = max(1, int(convert_mb_to_tokens(standard_size)))

    tokens = tokenize_text(input_text)
    total = len(tokens)
    if total == 0:
        # Nothing to slice; returning early also avoids range() with step 0.
        return []

    # Ceiling divisions: the fewest slices that respect the budget, then the
    # smallest slice length that covers every token in that many slices.
    # Floor division here could produce a zero slice length or an extra,
    # tiny trailing slice.
    num_slices = -(-total // tokens_per_slice)
    slice_size = -(-total // num_slices)

    return [tokens[start:start + slice_size] for start in range(0, total, slice_size)]


def process_and_slice_input(input_text):
    """Preprocess ``input_text`` and return it as one or more count vectors.

    The whole text is first reduced to a single bag-of-words vector. If that
    vector has at most ``standard_size`` entries, it is returned on its own in
    a one-element list. Otherwise the raw text is split into token slices by
    ``divide_into_slices`` and every slice is vectorized against one shared
    vocabulary, so all returned vectors have the same length.

    Args:
        input_text: Raw input string.

    Returns:
        list: 1-D bag-of-words count arrays, one per slice.
    """
    standard_size = 128  # Standard size in MB

    # Tokenize, remove stopwords, apply stemming, and construct bag-of-words
    processed_input = preprocess_text(input_text)

    # NOTE(review): this compares the number of vocabulary entries with a
    # value labelled as MB - confirm the intended unit.
    if len(processed_input) <= standard_size:
        return [processed_input]

    # divide_into_slices tokenizes its argument itself, so it must receive the
    # raw text; a count vector cannot be tokenized.
    token_slices = divide_into_slices(input_text, standard_size)

    # Apply the same stopword/stemming steps as preprocess_text, then fit one
    # vectorizer over all slices so the vectors share a vocabulary and match
    # the type returned by the short-input branch above.
    slice_texts = [
        ' '.join(apply_stemming(remove_stopwords(tokens)))
        for tokens in token_slices
    ]
    count_matrix = CountVectorizer().fit_transform(slice_texts)
    return list(count_matrix.toarray())



In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_cosine_distance(slice1, slice2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    Args:
        slice1: First bag-of-words vector (anything ``np.array`` accepts).
        slice2: Second bag-of-words vector.

    Returns:
        ``1 - similarity`` for the single pair of vectors.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes one row.
    row_a, row_b = (np.array(vector).reshape(1, -1) for vector in (slice1, slice2))
    return 1 - cosine_similarity(row_a, row_b)[0, 0]


# Threshold on cosine distance (1 - cosine similarity) between neighbouring
# slices; check_slicing_criteria compares each adjacent pair against it.
cosine_distance_threshold = 0.2  # 20%

def check_slicing_criteria(slices):
    """Compare each pair of adjacent slices against the distance threshold.

    The cosine distance of every neighbouring pair is computed and compared
    with ``cosine_distance_threshold``, but no adjustment is implemented yet,
    so the input is handed back as-is.

    Args:
        slices: Sequence of bag-of-words vectors.

    Returns:
        The same ``slices`` object that was passed in.
    """
    for left, right in zip(slices, slices[1:]):
        too_far_apart = calculate_cosine_distance(left, right) > cosine_distance_threshold
        if too_far_apart:
            # Placeholder: no adjustment is performed for distant neighbours.
            continue

    return slices



def process_and_slice_input(input_text):
    """Preprocess ``input_text``, slice it if needed, and check the slices.

    The whole text is first reduced to a single bag-of-words vector. If that
    vector has at most ``standard_size`` entries, it is returned on its own in
    a one-element list. Otherwise the raw text is split into token slices by
    ``divide_into_slices``, every slice is vectorized against one shared
    vocabulary, and the resulting vectors are passed through
    ``check_slicing_criteria``.

    Args:
        input_text: Raw input string.

    Returns:
        list: 1-D bag-of-words count arrays. The short-input branch returns
        exactly one; the slicing branch returns whatever
        ``check_slicing_criteria`` hands back.
    """
    standard_size = 128  # Standard size in MB

    # Tokenize, remove stopwords, apply stemming, and construct bag-of-words
    processed_input = preprocess_text(input_text)

    # NOTE(review): this compares the number of vocabulary entries with a
    # value labelled as MB - confirm the intended unit.
    if len(processed_input) <= standard_size:
        return [processed_input]

    # divide_into_slices tokenizes its argument itself, so it must receive the
    # raw text; a count vector cannot be tokenized.
    token_slices = divide_into_slices(input_text, standard_size)

    # The cosine check needs numeric vectors of equal length. Apply the same
    # stopword/stemming steps as preprocess_text, then fit one vectorizer over
    # all slices so every vector shares the same vocabulary.
    slice_texts = [
        ' '.join(apply_stemming(remove_stopwords(tokens)))
        for tokens in token_slices
    ]
    slice_vectors = list(CountVectorizer().fit_transform(slice_texts).toarray())

    # Check slicing criteria based on cosine distance
    return check_slicing_criteria(slice_vectors)



In [None]:
def check_slicing_criteria(slices):
    """Filter a sequence of slices by boundary order and cosine distance.

    The first slice is always kept. Each later slice is compared with the
    most recently kept one:

    * if its first element is >= the last element of the kept slice, it is
      appended;
    * otherwise, if the cosine distance between the two exceeds
      ``cosine_distance_threshold``, it replaces the kept slice;
    * otherwise it is left out of the result.

    Args:
        slices: Sequence of non-empty slices (indexable with ``[0]`` and
            ``[-1]``).

    Returns:
        list: A new list of the retained slices; empty if ``slices`` is empty.
    """
    # Guard: indexing slices[0] on an empty input would raise IndexError.
    if len(slices) == 0:
        return []

    new_slices = [slices[0]]  # Initialize with the first slice

    for current_slice in slices[1:]:
        previous_slice = new_slices[-1]

        # Check if the slices overlap or if one is included in the other
        if current_slice[0] >= previous_slice[-1]:
            new_slices.append(current_slice)
            continue

        # Adjust slices based on cosine distance or any other criterion
        distance = calculate_cosine_distance(previous_slice, current_slice)

        if distance > cosine_distance_threshold:
            # NOTE(review): this discards the previously kept slice in favour
            # of the current one - confirm that dropping it is intended.
            new_slices[-1] = current_slice
        # NOTE(review): when the distance is within the threshold, the current
        # slice is silently dropped; merging is not implemented yet.

    return new_slices
