In [None]:
#importing libraries
import wikipediaapi
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#NLTK resources
nltk.download('stopwords')  # stop-word lists read by remove_stopwords
nltk.download('punkt')      # tokenizer models looked up by older NLTK releases
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' instead of
# 'punkt'; without it nltk.word_tokenize raises LookupError on a fresh install.
nltk.download('punkt_tab')


In [87]:
#tokenization and preprocessing
def tokenize_text(text):
    """Tokenize a string with NLTK.

    Args:
        text: The raw text to tokenize.

    Returns:
        The token list produced by ``nltk.word_tokenize(text)``.
    """
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    """Drop tokens that are English stop words.

    The comparison is done on the lower-cased token, but surviving tokens are
    returned with their original casing.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the tokens whose lower-cased form is not in NLTK's
        English stop-word list, in their original order.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def apply_stemming(tokens):
    """Reduce every token to its Porter stem.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list with ``PorterStemmer().stem(token)`` for each input token, in
        the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def construct_bag_of_words(tokens):
    """Build a term-count vector for a single token sequence.

    The tokens are joined with spaces into one document, which is then fitted
    and transformed by a default ``CountVectorizer``. The vectorizer applies
    its own tokenization to that document, so the vector has one entry per
    term in the vectorizer's vocabulary rather than one entry per input token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A 1-D array: the first (and only) row of the dense count matrix.
    """
    document = ' '.join(tokens)
    count_matrix = CountVectorizer().fit_transform([document])
    dense_counts = count_matrix.toarray()
    return dense_counts[0]


In [88]:
#slicing input
def divide_into_slices(input_text, standard_size):
    """Preprocess text into a bag-of-words vector and cut it into slices.

    The text is tokenized, stripped of stop words, stemmed and vectorized.
    If the resulting vector has at most ``standard_size`` entries it is
    returned as a single slice. Otherwise it is split into the smallest number
    of contiguous, near-equal slices such that none exceeds ``standard_size``.

    Args:
        input_text: Raw text to process.
        standard_size: Maximum number of entries allowed in one slice; must be
            a positive integer.

    Returns:
        A list of 1-D arrays. Slice lengths differ by at most one entry.

    Raises:
        ValueError: If ``standard_size`` is not positive.
    """
    if standard_size <= 0:
        raise ValueError("standard_size must be a positive integer")

    tokens = tokenize_text(input_text)
    tokens = remove_stopwords(tokens)
    tokens = apply_stemming(tokens)
    bag_of_words = construct_bag_of_words(tokens)

    total = len(bag_of_words)
    if total <= standard_size:
        return [bag_of_words]

    # Ceiling division: the fewest slices that keep each one within
    # standard_size. Floor-based arithmetic here used to yield a tiny
    # remainder slice (e.g. 85/85/85/1 for 256 entries) or a zero step.
    num_slices = -(-total // standard_size)

    # array_split tolerates lengths that are not a multiple of num_slices and
    # spreads the remainder over the leading slices.
    return np.array_split(bag_of_words, num_slices)


In [89]:
#Cosine Distance Calculation
def calculate_cosine_distance(slice1, slice2):
    """Return the cosine distance (1 - cosine similarity) between two slices.

    Each slice is converted to an array and reshaped into a single-row matrix
    before being handed to ``cosine_similarity``.

    Args:
        slice1: First vector (array-like).
        slice2: Second vector (array-like).

    Returns:
        ``1 - similarity`` for the single pair of rows.

    NOTE(review): both slices presumably need the same number of entries for
    ``cosine_similarity`` to accept them - confirm against callers.
    """
    row_a = np.array(slice1).reshape(1, -1)
    row_b = np.array(slice2).reshape(1, -1)
    similarity = cosine_similarity(row_a, row_b)[0, 0]
    return 1 - similarity


In [90]:
#Checking Slicing Criteria
def check_slicing_criteria(slices, cosine_distance_threshold=0.2):
    """Filter a sequence of slices with a boundary check and a distance check.

    The first slice is always kept. Every later slice is compared with the
    most recently kept slice:

    * if its first element is >= the kept slice's last element, it is
      appended to the result;
    * otherwise the cosine distance between the two slices is computed. The
      current slice replaces the kept one only when that distance exceeds
      ``cosine_distance_threshold``; if it does not, the current slice is
      dropped.

    NOTE(review): the boundary check compares element *values* (term counts
    when called from the pipeline), not positions, so it does not actually
    detect overlapping ranges - confirm the intended criterion.

    Args:
        slices: Sequence of slices (each an indexable, non-empty vector).
        cosine_distance_threshold: Distance above which a slice replaces the
            previously kept one.

    Returns:
        A new list with the retained slices; empty if ``slices`` is empty.
    """
    # Guard against empty input, which previously raised IndexError.
    if len(slices) == 0:
        return []

    new_slices = [slices[0]]

    for current_slice in slices[1:]:
        previous_slice = new_slices[-1]

        # checking slices overlapping
        if current_slice[0] >= previous_slice[-1]:
            new_slices.append(current_slice)
            continue

        distance = calculate_cosine_distance(previous_slice, current_slice)
        if distance > cosine_distance_threshold:
            new_slices[-1] = current_slice

    return new_slices


In [91]:
#NLP Pipeline
def nlp_pipeline(input_text, standard_size=128, cosine_distance_threshold=0.2):
    """Run the full pipeline: preprocess, slice, then filter the slices.

    ``divide_into_slices`` already tokenizes, removes stop words, stems and
    vectorizes the text, and returns a single slice when the vector fits in
    ``standard_size``. The preprocessing is therefore done once here instead
    of being repeated before the call.

    Args:
        input_text: Raw text to process.
        standard_size: Maximum number of entries per slice.
        cosine_distance_threshold: Threshold forwarded to
            ``check_slicing_criteria``.

    Returns:
        A list of 1-D arrays: the single bag-of-words vector when it fits in
        ``standard_size``, otherwise the slices retained by
        ``check_slicing_criteria``.
    """
    slices = divide_into_slices(input_text, standard_size)

    # A single slice means the whole vector fits; there is nothing to compare.
    if len(slices) == 1:
        return slices

    return check_slicing_criteria(slices, cosine_distance_threshold)


In [None]:
import wikipediaapi

def get_wikipedia_documents(topic, num_paragraphs=3):
    """Fetch a Wikipedia page and return the opening lines of its sections.

    Only the text of the entries in ``page.sections`` is used. For each
    section the text is split on newlines and the first ``num_paragraphs``
    pieces are kept; all kept pieces are joined with single spaces.

    Args:
        topic: Title of the Wikipedia page to look up.
        num_paragraphs: Number of newline-separated pieces taken per section.

    Returns:
        The joined text, or ``None`` when the page does not exist.
    """
    client = wikipediaapi.Wikipedia(
        'en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        headers={'User-Agent': 'emilalizada0@gmail.com'},
    )
    page = client.page(topic)

    if not page.exists():
        return None

    return ' '.join(
        piece
        for section in page.sections
        for piece in section.text.split('\n')[:num_paragraphs]
    )


geographical_topics = ["Geography", "Sea", "Ocean", "Longitude", "Meteorology", "Climate"]
non_geographical_topics = ["Artificial intelligence", "Computer Science", "Medical", "History"]


def _fetch_documents(topics):
    """Fetch one document per topic and return them keyed by topic.

    Topics whose document is missing or empty are reported and left out, so
    every stored document stays paired with the topic it came from.
    """
    documents_by_topic = {}
    for topic in topics:
        document = get_wikipedia_documents(topic)
        if document:
            documents_by_topic[topic] = document
        else:
            print(f"Could not retrieve document for {topic}")
    return documents_by_topic


def _print_documents(heading, documents_by_topic):
    """Print a heading followed by every document labelled with its topic."""
    print(heading)
    for i, (topic, document) in enumerate(documents_by_topic.items(), start=1):
        print(f"Document {i} ({topic}):")
        print(document)
        print("\n---\n")


# Fetch documents for geographical and non-geographical topics
geographical_by_topic = _fetch_documents(geographical_topics)
non_geographical_by_topic = _fetch_documents(non_geographical_topics)

# Plain lists of the retrieved texts, in topic order, for the cells below.
geographical_documents = list(geographical_by_topic.values())
non_geographical_documents = list(non_geographical_by_topic.values())

# Display the obtained documents. Labels come from the topic each document
# was fetched for; indexing the topic list by position mislabels documents
# as soon as one topic fails to download.
_print_documents("Geographical Documents:", geographical_by_topic)
_print_documents("\nNon-Geographical Documents:", non_geographical_by_topic)

# Select one document from each category for NLP pipeline
sample_input1 = geographical_documents[0] if geographical_documents else None
sample_input2 = non_geographical_documents[0] if non_geographical_documents else None


def _print_pipeline_result(heading, text, result_slices):
    """Print the input text followed by its slices, numbered from 1."""
    print(heading)
    print(text)
    print("\nResulting Slices:")
    # Named bow_slice so the builtin `slice` is not shadowed at notebook scope.
    for i, bow_slice in enumerate(result_slices, start=1):
        print(f"Slice {i}: {bow_slice}")


if sample_input1:
    # Using the NLP pipeline for the first geographical document
    result_slices1 = nlp_pipeline(sample_input1)
    _print_pipeline_result("Input Text (Geographical):", sample_input1, result_slices1)

if sample_input2:
    # Using the NLP pipeline for the first non-geographical document
    result_slices2 = nlp_pipeline(sample_input2)
    _print_pipeline_result("Input Text (Non-Geographical):", sample_input2, result_slices2)


In [None]:
# Testing with Different Inputs
cosine_distance_threshold = 0.2

input_below_standard_size = sample_input1
slices_below_standard_size = nlp_pipeline(input_below_standard_size)

input_above_standard_size = sample_input1 * 100
slices_above_standard_size = nlp_pipeline(input_above_standard_size)

#verify bag-of-words representations for slices
print("Bag of Words for Slices Below Standard Size:")
for i, slice in enumerate(slices_below_standard_size):
    print(f"Slice {i}: {slice}")

print("\nBag of Words for Slices Above Standard Size:")
for i, slice in enumerate(slices_above_standard_size):
    print(f"Slice {i}: {slice}")

print("\nTesting completed successfully!")
