In [43]:
#importing libraries
import wikipediaapi
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# NLTK resources
# Fetch the NLTK data packages used by this notebook (a no-op when they are already present locally).
# 'stopwords' backs stopwords.words('english') in remove_stopwords.
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer model nltk.word_tokenize relies on.
# NOTE(review): recent NLTK releases may require 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
#tokenization and preprocessing
def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The token sequence produced by ``nltk.word_tokenize(text)``.
    """
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    """Return ``tokens`` without English stop words.

    The stop-word lookup is case-insensitive (each token is lower-cased for
    the membership test), while surviving tokens keep their original casing
    and relative order.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept

def apply_stemming(tokens):
    """Stem every token with a single ``PorterStemmer`` instance.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with the stemmed form of each token, in input order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def construct_bag_of_words(tokens):
    """Build a bag-of-words count vector for ``tokens``.

    The tokens are joined with single spaces into one document, and a fresh
    ``CountVectorizer`` is fitted on that document alone, so the vocabulary
    (and therefore the vector length) is specific to this call.

    Args:
        tokens: Iterable of token strings.

    Returns:
        The first (and only) row of the dense count matrix.
    """
    document = ' '.join(tokens)
    count_matrix = CountVectorizer().fit_transform([document])
    dense_rows = count_matrix.toarray()
    return dense_rows[0]


In [45]:
#slicing input
def divide_into_slices(input_text, standard_size):
    """Preprocess ``input_text`` and split its bag-of-words vector into slices.

    The text is tokenized, stop words are removed, tokens are stemmed and a
    bag-of-words vector is built. If that vector has at most ``standard_size``
    entries it is returned as a single slice. Otherwise it is split into the
    smallest number of contiguous, near-equal slices such that no slice is
    longer than ``standard_size``.

    Args:
        input_text: Raw text to process.
        standard_size: Maximum number of vector entries per slice; must be
            a positive integer.

    Returns:
        A list of 1-D vectors which, concatenated in order, reproduce the
        full bag-of-words vector.

    Raises:
        ValueError: If ``standard_size`` is not positive.
    """
    if standard_size <= 0:
        raise ValueError("standard_size must be a positive integer")

    tokens = tokenize_text(input_text)
    tokens = remove_stopwords(tokens)
    tokens = apply_stemming(tokens)
    bag_of_words = construct_bag_of_words(tokens)

    total = len(bag_of_words)
    if total <= standard_size:
        return [bag_of_words]

    # Ceiling division: the fewest slices that keep every slice within
    # standard_size. Flooring the slice length instead leaves a tiny
    # remainder slice (e.g. 256 entries at size 128 became 85/85/85/1).
    num_slices = -(-total // standard_size)

    # array_split spreads any remainder one element at a time over the
    # leading slices, so slice lengths differ by at most one.
    return np.array_split(bag_of_words, num_slices)


In [46]:
#Cosine Distance Calculation
def calculate_cosine_distance(slice1, slice2):
    """Return the cosine distance (1 - cosine similarity) between two slices.

    Both inputs are flattened to 1-D. When their lengths differ, both are
    truncated to the shorter length before comparison, so slices of unequal
    size (which the slicing step can produce) do not raise a dimension
    mismatch error.

    Args:
        slice1: Array-like of numeric values.
        slice2: Array-like of numeric values.

    Returns:
        ``1 - cosine_similarity`` of the two (possibly truncated) vectors.
    """
    first = np.asarray(slice1).ravel()
    second = np.asarray(slice2).ravel()

    # Compare only the dimensions both vectors have.
    shared = min(first.size, second.size)
    similarity_matrix = cosine_similarity(
        first[:shared].reshape(1, -1),
        second[:shared].reshape(1, -1),
    )
    return 1 - similarity_matrix[0, 0]


In [47]:
#Checking Slicing Criteria
def check_slicing_criteria(slices, cosine_distance_threshold=0.2):
    """Filter a sequence of slices using a boundary test and cosine distance.

    Starting from the first slice, each subsequent slice is handled as follows:

    * If its first value is greater than or equal to the last value of the
      most recently kept slice, it is appended to the result.
    * Otherwise the cosine distance to the most recently kept slice is
      computed. If the distance exceeds ``cosine_distance_threshold`` the
      current slice replaces the most recently kept one; if not, the current
      slice is discarded.

    NOTE(review): in this notebook the slices hold bag-of-words counts, so the
    boundary test compares count values rather than positions - confirm this
    is the intended "overlap" criterion.

    Args:
        slices: Sequence of non-empty vectors.
        cosine_distance_threshold: Distance above which a slice replaces the
            previously kept one.

    Returns:
        The list of kept slices; an empty list when ``slices`` is empty.
    """
    # Nothing to filter; avoids an IndexError on slices[0].
    if len(slices) == 0:
        return []

    new_slices = [slices[0]]

    for current_slice in slices[1:]:
        previous_slice = new_slices[-1]

        # Checking whether the slices overlap
        if current_slice[0] >= previous_slice[-1]:
            new_slices.append(current_slice)
            continue

        distance = calculate_cosine_distance(previous_slice, current_slice)
        if distance > cosine_distance_threshold:
            # Sufficiently different: the current slice supersedes the previous one.
            new_slices[-1] = current_slice

    return new_slices


In [48]:
#NLP Pipeline
def nlp_pipeline(input_text, standard_size=128, cosine_distance_threshold=0.2):
    """Run the full pipeline: preprocess, slice, then filter the slices.

    ``divide_into_slices`` performs tokenization, stop-word removal, stemming
    and bag-of-words construction itself, and already returns the whole vector
    as a single slice when it fits within ``standard_size``. The text is
    therefore preprocessed exactly once here instead of being run through the
    same steps a second time.

    Args:
        input_text: Raw text to process.
        standard_size: Maximum number of vector entries per slice.
        cosine_distance_threshold: Threshold forwarded to
            ``check_slicing_criteria``.

    Returns:
        The list of slices kept by ``check_slicing_criteria``.
    """
    slices = divide_into_slices(input_text, standard_size)
    # With a single slice the criteria check returns it unchanged.
    return check_slicing_criteria(slices, cosine_distance_threshold)


In [49]:
import wikipediaapi

def get_wikipedia_documents(topic, num_paragraphs=6):
    """Fetch text for a Wikipedia page and flatten it into one string.

    For every entry in the page's ``sections``, the section text is split on
    newlines and at most the first ``num_paragraphs`` pieces are kept. All
    kept pieces are joined with single spaces.

    Args:
        topic: Title of the Wikipedia page to look up.
        num_paragraphs: Maximum number of newline-separated pieces taken from
            each section.

    Returns:
        The joined text, or ``None`` when the page does not exist.
    """
    client = wikipediaapi.Wikipedia(
        'en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        headers={'User-Agent': 'emilalizada0@gmail.com'},
    )
    page = client.page(topic)

    if not page.exists():
        return None

    collected = [
        piece
        for section in page.sections
        for piece in section.text.split('\n')[:num_paragraphs]
    ]
    return ' '.join(collected)


chemistry_topics = ["Chemistry", "Acids", "Kinetics", "Atomic", "Stoichiometry", "Electrochemistry"]
non_chemistry_topics = ["Artificial intelligence", "Computer Science", "Medical", "History", "Astronomy", "Earth"]


def _fetch_documents(topics):
    """Fetch one document per topic.

    Returns two parallel lists ``(fetched_topics, documents)`` holding only the
    topics that produced a non-empty document, so that labels stay aligned
    with their text even when some lookups fail. Failures are reported on
    stdout.
    """
    fetched_topics, documents = [], []
    for topic in topics:
        document = get_wikipedia_documents(topic)
        if document:
            fetched_topics.append(topic)
            documents.append(document)
        else:
            print(f"Could not retrieve document for {topic}")
    return fetched_topics, documents


def _print_documents(topics, documents):
    """Print each document, numbered from 1 and labelled with its own topic."""
    for i, (topic, document) in enumerate(zip(topics, documents), start=1):
        print(f"Document {i} ({topic}):")
        print(document)
        print("\n---\n")


def _run_pipeline_demo(label, text):
    """Run ``nlp_pipeline`` on ``text``, print the input and slices, return the slices."""
    slices = nlp_pipeline(text)
    print(f"Input Text ({label}):")
    print(text)
    print("\nResulting Slices:")
    # Named bow_slice so the builtin `slice` is not shadowed.
    for i, bow_slice in enumerate(slices, start=1):
        print(f"Slice {i}: {bow_slice}")
    return slices


# Fetch documents for chemistry topics
chemistry_fetched_topics, chemistry_documents = _fetch_documents(chemistry_topics)

# Fetch documents for non-chemistry topics
non_chemistry_fetched_topics, non_chemistry_documents = _fetch_documents(non_chemistry_topics)

# Displaying
print("Chemistry Documents:")
_print_documents(chemistry_fetched_topics, chemistry_documents)

print("\nNon-Chemistry Documents:")
_print_documents(non_chemistry_fetched_topics, non_chemistry_documents)

# Select one document from each category for NLP pipeline
sample_input1 = chemistry_documents[0] if chemistry_documents else None
sample_input2 = non_chemistry_documents[0] if non_chemistry_documents else None

if sample_input1:
    result_slices1 = _run_pipeline_demo("Chemistry", sample_input1)

if sample_input2:
    result_slices2 = _run_pipeline_demo("Non-Chemistry", sample_input2)



Chemistry Documents:
Document 1 (Chemistry):
The word chemistry comes from a modification during the Renaissance of the word alchemy, which referred to an earlier set of practices that encompassed elements of chemistry, metallurgy, philosophy, astrology, astronomy, mysticism, and medicine. Alchemy is often associated with the quest to turn lead or other base metals into gold, though alchemists were also interested in many of the questions of modern chemistry.The modern word alchemy in turn is derived from the Arabic word al-kīmīā (الكیمیاء). This may have Egyptian origins since al-kīmīā is derived from the Ancient Greek χημία, which is in turn derived from the word Kemet, which is the ancient name of Egypt in the Egyptian language. Alternately, al-kīmīā may derive from χημεία 'cast together'. The current model of atomic structure is the quantum mechanical model. Traditional chemistry starts with the study of elementary particles, atoms, molecules, substances, metals, crystals and other

In [50]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_distance(slice1, slice2):
    """Return 1 minus the cosine similarity of two slices.

    Both inputs are converted to NumPy arrays and cut down to the length of
    the shorter one before the similarity is computed, so slices of different
    sizes can be compared.

    Args:
        slice1: Array-like of numeric values.
        slice2: Array-like of numeric values.

    Returns:
        The cosine distance between the two truncated vectors.
    """
    first, second = np.array(slice1), np.array(slice2)

    # Keep only the leading entries that both vectors have.
    shared_length = min(len(first), len(second))
    row_a = first[:shared_length].reshape(1, -1)
    row_b = second[:shared_length].reshape(1, -1)

    return 1 - cosine_similarity(row_a, row_b)[0, 0]

# Testing with Different Inputs
cosine_distance_threshold = 0.2


def _print_slices(title, slices):
    """Print ``title`` followed by every slice, numbered from 1."""
    print(title)
    # Named bow_slice so the builtin `slice` is not shadowed.
    for i, bow_slice in enumerate(slices, start=1):
        print(f"Slice {i}: {bow_slice}")


def _print_adjacent_distances(title, slices):
    """Print ``title`` followed by the cosine distance of each adjacent slice pair."""
    print(title)
    for i in range(len(slices) - 1):
        distance = calculate_cosine_distance(slices[i], slices[i + 1])
        print(f"Distance between Slice {i+1} and Slice {i+2}: {distance}")


# Fail early with a clear message instead of an obscure error further down.
if not sample_input1:
    raise RuntimeError("No chemistry document was fetched; cannot run the slicing checks.")

# Chemistry
# NOTE(review): the recorded output shows this input yielding three slices, so
# it is not actually below the default standard size - confirm the intended sample.
input_below_standard_size = sample_input1
slices_below_standard_size = nlp_pipeline(
    input_below_standard_size,
    cosine_distance_threshold=cosine_distance_threshold,
)

# Chemistry, Above Standard Size
input_above_standard_size = sample_input1 * 100
slices_above_standard_size = nlp_pipeline(
    input_above_standard_size,
    cosine_distance_threshold=cosine_distance_threshold,
)

# Verify bag-of-words representations for slices
_print_slices("Bag of Words for Slices Below Standard Size:", slices_below_standard_size)
_print_slices("\nBag of Words for Slices Above Standard Size:", slices_above_standard_size)

# Verify Cosine Distances for Adjacent Slices
_print_adjacent_distances("\nCosine Distances for Slices Below Standard Size:", slices_below_standard_size)
_print_adjacent_distances("\nCosine Distances for Slices Above Standard Size:", slices_above_standard_size)

print("\nTesting completed successfully!")


Bag of Words for Slices Below Standard Size:
Slice 1: [ 1  1  1  1  1  1  1  1  1  1  1  1  2  1  1  1  3  7  2  1  1  1  1  1
  1  1  2  1  1  1  1  1  1  1  1  1  3  8  1  2  1  1  1  1  1  1  1  2
  1  1  1  1  1  1  1  1  1  1 15  3 17  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  4  1  1  2  1  2  1  1  1  1  2  1  1
  2  1  1]
Slice 2: [1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 3 1 1 1 1
 1 1 3 1 1 1 1 4 1 1 1 1 1 1 2 3 3 1 1 2 2 1 1 1 1 1 3 1 1 5 4 1 1 2 3 1 1
 1 2 3 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1]
Slice 3: [3 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 4 1 2 1 2 1 1 1 2 1 1 1 1 1
 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 6 1 1 5 1 1 2 1 1 1 1 1 1 1 2 1
 1 7 1 1 3 1 1 1 1 2 1 3 2 2 2 1 1 1 1 5 4 1 1 1 1]

Bag of Words for Slices Above Standard Size:
Slice 1: [ 100  100  100  100  100  100  100  100  100  100  100  100  200  100
  100  100  300  700  200  100  100  100  100  100  100  100  200  100
    1   99  100  100  10