In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

# Load stop words from nltk
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words and apply stemming
    tokens = [ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words]

    return " ".join(tokens)

def load_and_preprocess_data(directory):
    texts = []
    labels = []

    # Load and preprocess each text file
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            label = filename.split('_')[0]  # Assuming labels are in the filename, like 'label_filename.txt'
            with open(os.path.join(directory, filename), 'r') as file:
                raw_text = file.read()
                processed_text = preprocess_text(raw_text)
                texts.append(processed_text)
                labels.append(label)

    return texts, np.array(labels)

# Example usage
data_dir = "/content/10-Stories"
texts, labels = load_and_preprocess_data(data_dir)

# Applying TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(texts)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from collections import defaultdict

def create_inverted_index(texts):
    inverted_index = defaultdict(list)

    for doc_id, text in enumerate(texts):
        words = text.split()
        for word in set(words):
            inverted_index[word].append(doc_id)

    return inverted_index

# Create inverted index
inverted_index = create_inverted_index(texts)

# Check a sample word in the inverted index
sample_word = "ant"  # Replace with any word that might be in your corpus
print(f"Inverted Index for '{sample_word}':", inverted_index.get(sample_word, "Not found"))


Inverted Index for 'ant': [0, 8]


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

def vector_space_model(query, tfidf_vectorizer, X_tfidf):
    query_preprocessed = preprocess_text(query)
    query_tfidf = tfidf_vectorizer.transform([query_preprocessed])
    cosine_similarities = cosine_similarity(query_tfidf, X_tfidf).flatten()

    return cosine_similarities

# Example query
query = "ant"
query_similarities = vector_space_model(query, tfidf_vectorizer, X_tfidf)

# Retrieve top documents
top_n = 5
top_doc_indices = np.argsort(query_similarities)[-top_n:][::-1]
print("Top Documents:", top_doc_indices)


Top Documents: [0 8 9 7 6]


In [41]:
def rank_documents(query, tfidf_vectorizer, X_tfidf, top_n=5):
    similarities = vector_space_model(query, tfidf_vectorizer, X_tfidf)
    ranked_indices = np.argsort(similarities)[-top_n:][::-1]
    return ranked_indices, similarities[ranked_indices]

# Example of ranking
ranked_docs, scores = rank_documents("sample query", tfidf_vectorizer, X_tfidf)
print(f"Ranked Documents: {ranked_docs}, Scores: {scores}")


Ranked Documents: [9 8 7 6 5], Scores: [0. 0. 0. 0. 0.]


In [44]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have binary relevance labels (1 for relevant, 0 for non-relevant)
def evaluate_precision_recall(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')

    return precision, recall, f1

# Dummy example of actual vs predicted relevance
y_true = [1, 0, 1, 0, 1]  # Actual relevance (binary)
y_pred = [1, 0, 1, 0, 0]  # Predicted relevance (binary)

precision, recall, f1 = evaluate_precision_recall(y_true, y_pred)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


Precision: 1.0, Recall: 0.6666666666666666, F1 Score: 0.8


In [26]:
def mean_average_precision(relevant_docs, retrieved_docs):
    score = 0.0
    num_hits = 0.0
    for i, doc in enumerate(retrieved_docs):
        if doc in relevant_docs:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not relevant_docs:
        return 0.0
    return score / len(relevant_docs)

# Example of MAP calculation
relevant_docs = [2, 5, 7]  # Ground truth relevant documents
retrieved_docs = [2, 3, 5, 6, 7]  # Retrieved document rankings

map_score = mean_average_precision(relevant_docs, retrieved_docs)
print(f"MAP Score: {map_score}")


MAP Score: 0.7555555555555555


In [27]:
def ndcg(relevances, k):
    def dcg(scores):
        return sum([rel / np.log2(idx + 2) for idx, rel in enumerate(scores)])

    ideal_relevances = sorted(relevances, reverse=True)
    return dcg(relevances[:k]) / dcg(ideal_relevances[:k])

# Example relevance scores
relevances = [3, 2, 3, 0, 1, 2]  # Example of relevance scores of retrieved docs
k = 5
print(f"nDCG: {ndcg(relevances, k)}")


nDCG: 0.8610441760375027
