In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load documents
def load_documents(folder_path):
    documents = {}
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            documents[filename] = file.read()
    return documents

# Preprocessing: Tokenization, stopword removal, and stemming
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Folder containing the stories
folder_path = r"C:\Users\Ripple\Downloads\10 documents"

# Load and preprocess documents
documents = load_documents(folder_path)
preprocessed_docs = {doc: preprocess(text) for doc, text in documents.items()}
preprocessed_docs


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ripple\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


{'Lazy_John.txt': 'boy name john lazi couldn’t even chang cloth one day saw appl tree yard full fruit want eat appl lazi climb tree take fruit lay underneath tree wait fruit fall john wait starv appl never fell lazi get nowher want someth need work hard',
 'The_Ant_ant_The_Grashopper.txt': 'ant grasshopp good friend summer ant work hard fill storag food grasshopp enjoy fine weather play day winter came ant lie cozili home surround food store summer grasshopp home hungri freez ask ant food ant gave wasn’t enough last entir winter tri ask ant latter repli “i’m sorri friend food enough famili last end winter give starv entir summer prepar winter chose play instead” winter stori repres time live food resourc scarc summer time everyth abund lot right save winter',
 'The_Boy_Who_Cried_Wolf.txt': 'shepherd boy like play trick one day watch herd boy decid play trick cri “wolf wolf” peopl heard rush help disappoint saw wolf boy laugh next day peopl rush aid disappoint third day boy saw wolf dev

In [14]:
# Using TF-IDF Vectorizer to create the document-term matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs.values())

# Document-term matrix
feature_names = vectorizer.get_feature_names_out()
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
feature_names

TF-IDF Matrix Shape: (10, 340)


array(['abl', 'abund', 'accept', 'act', 'aid', 'along', 'alreadi',
       'although', 'alway', 'anim', 'anoth', 'ant', 'anthil', 'anymor',
       'apolog', 'appear', 'appl', 'around', 'ask', 'attain', 'away',
       'base', 'beak', 'beauti', 'believ', 'big', 'birth', 'bite', 'bone',
       'boy', 'brim', 'bulli', 'bunch', 'busi', 'call', 'came', 'caught',
       'celebr', 'chagrin', 'challeng', 'chanc', 'chang', 'cheat',
       'chose', 'climb', 'cloth', 'come', 'consist', 'constantli',
       'could', 'couldn', 'coupl', 'courag', 'cozili', 'cri', 'cross',
       'crow', 'cruelli', 'crush', 'day', 'death', 'decid', 'deed',
       'definit', 'den', 'devour', 'didn', 'differ', 'disappoint',
       'distanc', 'doesn', 'dog', 'don', 'drank', 'drink', 'drool',
       'duck', 'duckl', 'easili', 'eat', 'eleph', 'elephant', 'empti',
       'end', 'enjoy', 'enough', 'entir', 'envi', 'escap', 'even',
       'everi', 'everyon', 'everyth', 'everywher', 'excus', 'fairi',
       'fall', 'famili', 'f

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def process_query(query):
    # Preprocess the query similar to the documents
    query = preprocess(query)
    # Transform query using the same vectorizer
    query_vec = vectorizer.transform([query])
    return query_vec

def retrieve_documents(query, top_n=5):
    query_vec = process_query(query)
    # Compute cosine similarity between query and all documents
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Get the top N results
    top_indices = cosine_sim.argsort()[-top_n:][::-1]
    top_docs = [(list(preprocessed_docs.keys())[index], cosine_sim[index]) for index in top_indices]
    return top_docs

# Test the retrieval system
query = "example search terms"
top_results = retrieve_documents(query)

print("Top Results:")
for doc, score in top_results:
    print(f"{doc}: {score}")


Top Results:
The_Thirsty_Crow.txt: 0.11295368084165004
The_Dog_and_the_Bone.txt: 0.10189724132085222
The_Ugly_Duckling.txt: 0.0
The_Lion_and_the_Poor_Slave.txt: 0.0
The_Hare_and_the_Tortoise.txt: 0.0


In [5]:
# Query1
query = "adventure story"
top_results = retrieve_documents(query)

print("Top Results:")
for doc, score in top_results:
    print(f"{doc}: {score}")

Top Results:
The_Ugly_Duckling.txt: 0.1288712355441447
The_Hare_and_the_Tortoise.txt: 0.06973880733521144
The_Ant_ant_The_Grashopper.txt: 0.058404671882633985
The_Thirsty_Crow.txt: 0.0
The_Lion_and_the_Poor_Slave.txt: 0.0


In [6]:
# Query2
query = "hard work"
top_results = retrieve_documents(query)

print("Top Results:")
for doc, score in top_results:
    print(f"{doc}: {score}")

Top Results:
The_Fox_and_the_Grapes.txt: 0.20901447963550443
Lazy_John.txt: 0.12288220333660704
The_Ant_ant_The_Grashopper.txt: 0.07814991229071706
The_Thirsty_Crow.txt: 0.05837708605274318
The_Ugly_Duckling.txt: 0.0


In [12]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, average_precision_score
import numpy as np

# Function to simulate the retrieval process for each query
def simulate_query_retrieval(query):
    # Dummy true labels (1 = relevant, 0 = not relevant)
    # Replace these with actual results from your system
    if query == "hard work":
        return [1, 1, 0], [1, 0, 1]  # true_labels, predicted_labels
    elif query == "perseverance":
        return [1, 0, 1], [1, 1, 0]  # true_labels, predicted_labels
    elif query == "adventure story":
        return [1, 0, 1], [1, 1, 0]  # true_labels, predicted_labels

# Function to calculate precision, recall, and accuracy
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    accuracy = accuracy_score(true_labels, predicted_labels)
    return precision, recall, accuracy

# Function to calculate MAP and nDCG
def calculate_map_and_ndcg(true_labels, predictions):
    map_score = average_precision_score(true_labels, predictions)
    
    # Define relevance scores for nDCG calculation
    relevances = [3, 0, 2]  # Example relevance scores for the top 3 retrieved documents
    ndcg = ndcg_at_k(relevances, 3)
    
    return map_score, ndcg

# nDCG helper functions
def dcg_at_k(relevances, k):
    relevances = np.asfarray(relevances)[:k]
    if relevances.size:
        return np.sum((2 ** relevances - 1) / np.log2(np.arange(2, relevances.size + 2)))
    return 0.

def ndcg_at_k(relevances, k):
    dcg_max = dcg_at_k(sorted(relevances, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(relevances, k) / dcg_max

# List of queries
queries = ["hard work", "perseverance", "adventure story"]

# Evaluating each query
for query in queries:
    true_labels, predicted_labels = simulate_query_retrieval(query)
    precision, recall, accuracy = calculate_metrics(true_labels, predicted_labels)
    
    # Example predictions for MAP calculation (dummy similarity scores)
    predictions = [0.9, 0.2, 0.8] if query == "hard work" else [0.7, 0.1, 0.6]
    
    map_score, ndcg = calculate_map_and_ndcg(true_labels, predictions)
    
    # Output results
    print(f"Results for Query: '{query}'")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"MAP: {map_score:.2f}")
    print(f"nDCG: {ndcg:.2f}")
    print("\n" + "-"*30 + "\n")


Results for Query: 'hard work'
Precision: 0.50
Recall: 0.50
Accuracy: 0.33
MAP: 0.83
nDCG: 0.96

------------------------------

Results for Query: 'perseverance'
Precision: 0.50
Recall: 0.50
Accuracy: 0.33
MAP: 1.00
nDCG: 0.96

------------------------------

Results for Query: 'adventure story'
Precision: 0.50
Recall: 0.50
Accuracy: 0.33
MAP: 1.00
nDCG: 0.96

------------------------------



In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load your documents (assumed to be preprocessed and stored)
documents = [
    'boy name john lazi couldn’t even chang cloth one day saw appl tree yard full fruit want eat appl lazi climb tree take fruit lay underneath tree wait fruit fall john wait starv appl never fell lazi get nowher want someth need work hard',
    'ant grasshopp good friend summer ant work hard fill storag food grasshopp enjoy fine weather play day winter came ant lie cozili home surround food store summer grasshopp home hungri freez ask ant food ant gave wasn’t enough last entir winter tri ask ant latter repli “i’m sorri friend food enough famili last end winter give starv entir summer prepar winter chose play instead” winter stori repres time live food resourc scarc summer time everyth abund lot right save winter',
    'shepherd boy like play trick one day watch herd boy decid play trick cri “wolf wolf” peopl heard rush help disappoint saw wolf boy laugh next day peopl rush aid disappoint third day boy saw wolf devour one sheep cri help peopl heard thought anoth boy’ prank one came help day boy lost sheep wolf alway lie cheat peopl come time one believ anymor',
    'dog wander street night day search food one day found big juici bone immedi grab mouth took home way home cross river saw anoth dog bone mouth want bone open mouth bone bite fell river sank night went home hungri alway envi other we’ll lose alreadi like greedi dog',
    'proud eleph constantli bulli smaller anim would go anthil near home spray water ant ant size could noth cri eleph laugh threaten ant would crush death one day ant enough decid teach eleph lesson went straight elephant’ trunk start bite eleph could howl pain realiz mistak apolog ant anim bulli humbl treat everyon kind think you’r stronger other use strength protect instead harm',
    'hungri fox stumbl upon vineyard see round juici grape hang bunch fox drool matter high jump couldn’t reach told probabl sour left night sleep empti stomach us tend act like fox want someth think it’ hard attain make excus tell it’ probabl great instead work hard',
    'hare friend tortois one day challeng tortois race see slow tortois go hare thought he’d win easili took nap tortois kept go hare woke saw tortois alreadi finish line much chagrin tortois race busi sleep coupl moral lesson learn stori hare teach overconfid sometim ruin tortois teach us power persever even odd stack never give sometim life who’ fastest strongest it’ consist',
    'slave treat cruelli master one day couldn’t take anymor ran forest escap chanc upon lion couldn’t walk thorn paw although scare slave muster courag took thorn lion’ paw lion final free thorn ran forest didn’t harm slave sometim later slave caught master along anim forest master order slave thrown lion’ den slave saw lion recogn lion help forest slave abl escap den unharm freed anim good alway way return good deed kind other world kind',
    'fli long distanc thirsti crow wander forest search water final saw pot halffil water tri drink beak wasn’t long enough reach water insid saw pebbl ground one one put pot water rose brim crow hastili drank quench thirst there’ there’ way everi problem solut look hard don’t give',
    'us probabl heard stori one famou fairi tale world stori revolv around duckl moment birth alway felt differ sibl alway pick didn’t look like rest one day enough ran away pond grew wander nearbi look famili would accept month pass season chang everywher went nobodi want ugli duck one day came upon famili swan upon look realiz month spent look famili call grown beauti swan final understood never look like rest sibl wasn’t duck swan shouldn’t quick judg other base physic appear someon doesn’t fit societ definit beauti doesn’t mean they’r ugli us beauti uniqu way it’ time accept celebr individu'
    # Add the other short stories here...
]

# Function to retrieve documents based on a query
def retrieve_documents(query, documents):
    vectorizer = TfidfVectorizer()
    # Combine the query and documents for TF-IDF vectorization
    tfidf_matrix = vectorizer.fit_transform([query] + documents)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    
    # Create a list of tuples with document index and similarity score
    ranked_documents = list(enumerate(cosine_similarities))
    
    # Sort documents based on similarity scores in descending order
    ranked_documents.sort(key=lambda x: x[1], reverse=True)
    
    return ranked_documents

# Example query
query = "hard work"
ranked_docs = retrieve_documents(query, documents)

# Output the ranked documents and their similarity scores
for index, score in ranked_docs:
    print(f"Document {index + 1}: Score = {score:.4f} - {documents[index]}")


Document 6: Score = 0.1914 - hungri fox stumbl upon vineyard see round juici grape hang bunch fox drool matter high jump couldn’t reach told probabl sour left night sleep empti stomach us tend act like fox want someth think it’ hard attain make excus tell it’ probabl great instead work hard
Document 1: Score = 0.1117 - boy name john lazi couldn’t even chang cloth one day saw appl tree yard full fruit want eat appl lazi climb tree take fruit lay underneath tree wait fruit fall john wait starv appl never fell lazi get nowher want someth need work hard
Document 2: Score = 0.0709 - ant grasshopp good friend summer ant work hard fill storag food grasshopp enjoy fine weather play day winter came ant lie cozili home surround food store summer grasshopp home hungri freez ask ant food ant gave wasn’t enough last entir winter tri ask ant latter repli “i’m sorri friend food enough famili last end winter give starv entir summer prepar winter chose play instead” winter stori repres time live food r