In [8]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Function to clean and preprocess text
def clean_and_preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    The text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenised with NLTK's ``word_tokenize`` and
    filtered against NLTK's English stop-word list.

    Args:
        text: Raw cell value. A missing value (``None``/NaN, e.g. an empty
            CSV cell loaded by pandas) yields an empty string; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Guard before using str methods: a float NaN has no .lower().
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()  # Lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words('english'))  # Get English stopwords
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)  # Join tokens back into text

# Read the CSV file
# (path is relative to the working directory; the code below requires the
# columns 'TITLE' and 'SECTION_TEXT' to be present)
df = pd.read_csv("sampled.csv")

# Clean and preprocess the 'TITLE' and 'SECTION_TEXT' columns
# NOTE: both columns are overwritten, so the raw text is no longer available
# in `df` after these two lines.
df['TITLE'] = df['TITLE'].apply(clean_and_preprocess_text)
df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(clean_and_preprocess_text)

# Concatenate 'TITLE' and 'SECTION_TEXT' columns to create 'ARTICLE_TEXT'
# (a single space separates the two parts, so the value is never shorter than " ")
df['ARTICLE_TEXT'] = df['TITLE'] + " " + df['SECTION_TEXT']

# Save the cleaned data to a new CSV file
# (index=False keeps the row index out of the file)
df.to_csv("cleaned_data.csv", index=False)


In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter

# Read the cleaned data
# ("cleaned_data.csv" is the file the cleaning cell writes with df.to_csv)
df_cleaned = pd.read_csv("cleaned_data.csv")

# Function to calculate term frequency
def calculate_term_frequency(text):
    """Count how often each whitespace-separated token occurs in ``text``.

    Args:
        text: A string of tokens separated by whitespace.

    Returns:
        collections.Counter: token -> number of occurrences.
    """
    return Counter(text.split())

# Calculate term frequency for each document
# (each cell of 'TERM_FREQUENCY' holds the Counter returned by
# calculate_term_frequency for that row's 'ARTICLE_TEXT')
df_cleaned['TERM_FREQUENCY'] = df_cleaned['ARTICLE_TEXT'].apply(calculate_term_frequency)

# Function to calculate document frequency
def calculate_document_frequency(df):
    """Count, for every term, the number of rows whose document contains it.

    Args:
        df: DataFrame with a 'TERM_FREQUENCY' column of per-document
            term -> count mappings.

    Returns:
        collections.Counter: term -> number of documents containing the term.
        Each document contributes at most 1 per term because only the keys of
        its term-frequency mapping are counted.
    """
    return Counter(
        term
        for _, row in df.iterrows()
        for term in row['TERM_FREQUENCY'].keys()
    )

# Calculate document frequency
# (term -> number of rows of df_cleaned whose 'TERM_FREQUENCY' contains the term)
document_frequency = calculate_document_frequency(df_cleaned)

# Function to calculate inverse document frequency
def calculate_inverse_document_frequency(total_documents, df):
    """Compute an inverse document frequency weight for every term.

    Each term is weighted as ``1 + ln(total_documents / (freq + 1))``.

    Args:
        total_documents: Number of documents in the corpus.
        df: Mapping of term -> number of documents containing the term
            (a document-frequency table, not a DataFrame).

    Returns:
        dict: term -> IDF weight.
    """
    # Imported locally because this cell's import block does not provide
    # `log`; without it the function raises NameError on a fresh kernel.
    from math import log

    return {
        term: 1 + log(total_documents / (freq + 1))
        for term, freq in df.items()
    }

# Calculate total number of documents
# (one document per row of df_cleaned)
total_documents = len(df_cleaned)
# Calculate inverse document frequency
# (term -> IDF weight for every term counted in document_frequency)
inverse_document_frequency = calculate_inverse_document_frequency(total_documents, document_frequency)


In [14]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from math import log, sqrt

# Function to clean and preprocess text
def clean_and_preprocess_text(text):
    """Clean one piece of text and return its lemmatized content words.

    Steps, in order: lowercase, delete the characters in
    ``string.punctuation``, tokenise with ``word_tokenize``, drop NLTK's
    English stop words and lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        text: Raw cell value. ``None``/NaN (e.g. an empty CSV cell loaded by
            pandas) produces ``""``; other non-string values are passed
            through ``str()`` first.

    Returns:
        str: Lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # A float NaN has no .lower(); treat missing values as empty text.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()  # Lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words('english'))  # Get English stopwords
    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    # Stop words are filtered on the surface form, before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Join tokens back into text

# Read the CSV file and clean data
# (the code below requires the columns 'TITLE' and 'SECTION_TEXT'; both are
# overwritten with their cleaned form, then joined with a single space)
df = pd.read_csv("sampled.csv")
df['TITLE'] = df['TITLE'].apply(clean_and_preprocess_text)
df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(clean_and_preprocess_text)
df['ARTICLE_TEXT'] = df['TITLE'] + " " + df['SECTION_TEXT']

# Preprocessing
# (work on a copy so the columns added below do not end up in `df`)
df_cleaned = df.copy()

# Function to calculate term frequency
def calculate_term_frequency(text):
    """Build a per-document term-frequency table.

    Splits ``text`` on whitespace and returns a ``collections.Counter``
    mapping each token to the number of times it appears.
    """
    term_counts = Counter()
    term_counts.update(text.split())
    return term_counts

# Calculate term frequency for each document
# (stores one Counter of token -> count per row in 'TERM_FREQUENCY')
df_cleaned['TERM_FREQUENCY'] = df_cleaned['ARTICLE_TEXT'].apply(calculate_term_frequency)

# Function to calculate document frequency
def calculate_document_frequency(df):
    """Count, for every term, how many documents contain it.

    Args:
        df: DataFrame with a 'TERM_FREQUENCY' column whose cells are
            per-document term -> count mappings.

    Returns:
        collections.Counter: term -> number of documents containing the term.
    """
    document_frequency = Counter()
    # Iterate the column itself: iterrows() would build a full Series for
    # every row only to read this one cell.
    for term_counts in df['TERM_FREQUENCY']:
        # Only the keys matter, so a document adds at most 1 per term.
        document_frequency.update(term_counts.keys())
    return document_frequency

# Calculate document frequency
# (term -> number of rows of df_cleaned whose 'TERM_FREQUENCY' contains the term)
document_frequency = calculate_document_frequency(df_cleaned)

# Function to calculate inverse document frequency
def calculate_inverse_document_frequency(total_documents, df):
    """Compute ``ln(total_documents / (freq + 1))`` for every term.

    Args:
        total_documents: Number of documents in the corpus.
        df: Mapping of term -> number of documents containing the term
            (a document-frequency table, not a DataFrame).

    Returns:
        dict: term -> IDF weight. The weight is zero or negative for any
        term whose document count is at least ``total_documents - 1``.
    """
    return {
        term: log(total_documents / (doc_count + 1))  # 1 is added to the denominator
        for term, doc_count in df.items()
    }

# Calculate total number of documents
# (one document per row of df_cleaned)
total_documents = len(df_cleaned)
# Calculate inverse document frequency
# (term -> IDF weight, keyed by the term string itself)
inverse_document_frequency = calculate_inverse_document_frequency(total_documents, document_frequency)

# Indexing Engine
class IndexingEngine:
    """Builds simple lookup structures over a collection of cleaned documents.

    Args:
        documents: Iterable of strings that can be iterated more than once
            (every method walks it again from the start).
    """

    def __init__(self, documents):
        self.documents = documents

    def break_down_documents(self):
        """Return every whitespace-separated token of every document, in order."""
        return [word for doc in self.documents for word in doc.split()]

    def create_vocabulary(self):
        """Return the distinct tokens, ordered by first appearance.

        The order is the same on every run and matches the ids handed out by
        ``create_word_ids`` (``vocabulary[i]`` is the word with id ``i``).
        """
        # dict.fromkeys de-duplicates while preserving insertion order. A set
        # would also de-duplicate, but its iteration order for strings can
        # differ between interpreter runs, which made ids irreproducible.
        return list(dict.fromkeys(self.break_down_documents()))

    def create_word_ids(self):
        """Return a dict mapping each distinct token to a dense integer id."""
        return {word: idx for idx, word in enumerate(self.create_vocabulary())}

    def count_word_frequency(self):
        """Return ``{doc_id: Counter(token -> count)}``.

        ``doc_id`` is the document's position in ``self.documents``.
        """
        return {
            doc_id: Counter(doc.split())
            for doc_id, doc in enumerate(self.documents)
        }

# Index the cleaned article text (one document per row of df_cleaned)
indexing_engine = IndexingEngine(df_cleaned['ARTICLE_TEXT'])

# Break Down Documents
# (flat list of every token in every document)
words = indexing_engine.break_down_documents()

# Create Word IDs
# (dict: token -> integer id)
word_ids = indexing_engine.create_word_ids()

# Create Vocabulary
# (list of distinct tokens)
vocabulary = indexing_engine.create_vocabulary()

# Count Word Frequency
# (dict: document position -> Counter keyed by the token string)
word_frequency = indexing_engine.count_word_frequency()

# User Submits Query
# (blocks until the user types a line; the text is stored exactly as typed)
query = input("Enter your query: ")

# Query Vectorizer
class QueryVectorizer:
    """Turns a query string into a bag-of-words count vector.

    Args:
        query: Query text; it is split on whitespace and used as given.
        word_ids: Mapping of word -> position in the output vector.
    """

    def __init__(self, query, word_ids):
        self.query = query
        self.word_ids = word_ids

    def vectorize_query(self):
        """Return a list with one count per entry of ``word_ids``.

        Query words that are not keys of ``word_ids`` are ignored.
        """
        counts = [0] * len(self.word_ids)
        for token in self.query.split():
            if token not in self.word_ids:
                continue
            counts[self.word_ids[token]] += 1
        return counts

# The indexed documents were lowercased, stripped of punctuation, stop-word
# filtered and lemmatized by clean_and_preprocess_text. The raw query has to
# go through the same normalisation, otherwise capitalised, punctuated or
# inflected query words can never match a vocabulary entry.
query_vectorizer = QueryVectorizer(clean_and_preprocess_text(query), word_ids)
vectorized_query = query_vectorizer.vectorize_query()

# Ranker Engine
class RankerEngine:
    """Scores documents against a query with a length-normalised TF-IDF sum.

    Scores are computed once, in the constructor, and stored in ``scores``.

    Args:
        vectorized_query: Sequence of query weights; position ``i`` holds the
            weight of the word whose id is ``i``.
        term_frequency: Mapping doc_id -> mapping of word id -> count.
        inverse_document_frequency: Mapping word id -> IDF weight.

    Lookups into ``term_frequency`` and ``inverse_document_frequency`` use the
    *position* of each query component as the key. Mappings keyed by anything
    else (for example the word strings themselves) never match, and every
    document then scores 0.
    """

    def __init__(self, vectorized_query, term_frequency, inverse_document_frequency):
        self.vectorized_query = vectorized_query
        self.term_frequency = term_frequency
        self.inverse_document_frequency = inverse_document_frequency
        self.scores = self.calculate_scores()

    def calculate_scores(self):
        """Return ``{doc_id: score}`` for every document in ``term_frequency``.

        A document's score is the sum over query words of
        ``query_weight * (count / doc_length) * idf``, divided by the
        Euclidean norm of the query vector. A query with no non-zero
        component scores every document 0.0.
        """
        # Components with zero weight contribute nothing, so keep only the
        # active ones instead of scanning the whole vocabulary per document.
        active_terms = [
            (word_id, weight)
            for word_id, weight in enumerate(self.vectorized_query)
            if weight
        ]
        # The query norm does not depend on the document: compute it once.
        query_norm = sqrt(sum(weight * weight for _, weight in active_terms))

        scores = {}
        for doc_id, doc in self.term_frequency.items():
            if query_norm == 0:
                # No known query word: avoid dividing by a zero norm.
                scores[doc_id] = 0.0
                continue
            doc_length = sum(doc.values())
            score = 0
            for word_id, query_weight in active_terms:
                if word_id in doc:
                    term_share = doc[word_id] / doc_length
                    score += (query_weight * term_share) * self.inverse_document_frequency[word_id]
            scores[doc_id] = score / query_norm  # Normalize by query length
        return scores

# RankerEngine looks documents and IDF weights up by the *position* of each
# component of vectorized_query (i.e. by word id), while word_frequency and
# inverse_document_frequency are keyed by the word string. Passed as-is, every
# lookup misses and every score is 0.0, so re-key both by word id first.
term_frequency_by_id = {
    doc_id: {word_ids[word]: count for word, count in counts.items()}
    for doc_id, counts in word_frequency.items()
}
idf_by_id = {
    word_ids[word]: idf
    for word, idf in inverse_document_frequency.items()
    if word in word_ids
}
ranker_engine = RankerEngine(vectorized_query, term_frequency_by_id, idf_by_id)
document_scores = ranker_engine.scores

# Display document scores
# Scores are printed in the iteration order of document_scores, not sorted by
# relevance. doc_id is used as a row position (iloc) into df_cleaned.
for doc_id, score in document_scores.items():
    print(f"Document ID: {doc_id}, Relevance Score: {score}, Content: {df_cleaned.iloc[doc_id]['ARTICLE_TEXT']}")


Document ID: 0, Relevance Score: 0.0, Content: delavan illinois delavan founded group settler new england city derives name edward c delavan temperance advocate albany new york post office operation delavan since 1840
Document ID: 1, Relevance Score: 0.0, Content: silver creek township lake county minnesota silver creek township township lake county minnesota united state population 1178 2000 census minnesota state highway 61 serf main route township silver creek township organized 1905
Document ID: 2, Relevance Score: 0.0, Content: holbrook new york 
Document ID: 3, Relevance Score: 0.0, Content: violent femmes album violent femmes adobe flash radio3net streamed copy licensed
Document ID: 4, Relevance Score: 0.0, Content: kragerø town kragerø established municipality 1 january 1838 see formannskapsdistrikt day sailing ship kragerø one norway largest port city rural municipality sannidal skåtøy merged municipality kragerø 1 january 1960 municipality includes 495 island islet skerries a

In [5]:
import csv
import string
from collections import defaultdict
import math

# Step 1: Data Cleaning
def clean_text(text):
    """Delete the characters in ``string.punctuation`` from ``text`` and lowercase it.

    Whitespace is left untouched, so word boundaries are preserved.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).lower()

# Step 2: Word Counting
def count_words(documents):
    """Tally how many times each word occurs across all documents.

    Args:
        documents: Iterable of strings; each is split on whitespace.

    Returns:
        defaultdict(int): word -> total number of occurrences.
    """
    tally = defaultdict(int)
    every_word = (word for document in documents for word in document.split())
    for word in every_word:
        tally[word] += 1
    return tally

# Step 3: Word Indexing
def create_word_index(word_counts):
    """Attach a sequential id to every word of ``word_counts``.

    Returns:
        dict: word -> ``{'id': n, 'count': c}``, where ``n`` is the word's
        position in the iteration order of ``word_counts`` (starting at 0)
        and ``c`` is its count.
    """
    return {
        word: {'id': position, 'count': count}
        for position, (word, count) in enumerate(word_counts.items())
    }

# Step 4: Vocabulary Creation
def create_vocabulary(word_index):
    """Return the words of ``word_index`` as a list, in the index's key order."""
    return [word for word in word_index.keys()]

# Step 5: Normalization
def normalize_word_counts(word_counts):
    """Convert absolute word counts into shares of the overall total.

    Returns:
        dict: word -> ``count / sum(all counts)``. An empty mapping yields an
        empty dict.
    """
    grand_total = sum(word_counts.values())
    shares = {}
    for word, count in word_counts.items():
        shares[word] = count / grand_total
    return shares

# Step 6: Query Handling
def process_query(query, word_index):
    """Count the query words that are present in ``word_index``.

    Args:
        query: Query text, split on whitespace.
        word_index: Container of known words (membership is tested with ``in``).

    Returns:
        defaultdict(int): known word -> number of occurrences in the query.
    """
    hits = defaultdict(int)
    for word in query.split():
        if word not in word_index:
            continue  # unknown words are ignored
        hits[word] += 1
    return hits

# Step 7: Query Vectorization (Simple Bag-of-Words Model)
def vectorize_query(query, vocabulary):
    """Build a bag-of-words count vector for ``query`` over ``vocabulary``.

    Args:
        query: Text to vectorize, split on whitespace.
        vocabulary: List of words; position ``i`` of the result counts
            occurrences of ``vocabulary[i]``.

    Returns:
        list[int]: One count per vocabulary entry. Words not in the
        vocabulary are ignored. If a word is listed more than once, only its
        first position is incremented.
    """
    # Build the word -> position lookup once instead of scanning the list
    # twice per query word ("in" plus ".index"). setdefault keeps the first
    # position of a repeated word, which is what list.index() returns.
    positions = {}
    for position, word in enumerate(vocabulary):
        positions.setdefault(word, position)

    query_vector = [0] * len(vocabulary)
    for word in query.split():
        position = positions.get(word)
        if position is not None:
            query_vector[position] += 1
    return query_vector

# Load and process data from sampled.csv
def process_data(filename, skip_header=False):
    """Load the first column of a CSV file as cleaned documents.

    Args:
        filename: Path of the CSV file to read (decoded as UTF-8).
        skip_header: When True, the first CSV row is discarded before any
            document is read. Defaults to False, in which case the first row
            is treated as a document like any other.

    Returns:
        tuple[list[str], list[int]]: The cleaned text of each non-empty row's
        first field, and a parallel list holding the row's position among the
        rows read (counting blank rows, and starting after the header when
        ``skip_header`` is True).
    """
    cleaned_documents = []
    document_ids = []
    # newline='' lets the csv module handle line endings (including newlines
    # embedded in quoted fields) itself. The encoding is explicit so the
    # result does not depend on the platform's default locale.
    with open(filename, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        if skip_header:
            next(reader, None)
        for i, row in enumerate(reader):
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            cleaned_documents.append(clean_text(row[0]))
            document_ids.append(i)  # Assigning a unique ID to each document
    return cleaned_documents, document_ids

def rank_documents(query_vector_bow, document_vectors):
    """Rank documents by cosine similarity to the query vector.

    Args:
        query_vector_bow: Numeric query vector.
        document_vectors: Sequence of numeric document vectors.

    Returns:
        list[int]: Positions into ``document_vectors``, most similar first.
        Documents with equal similarity keep their original relative order.
        A zero-magnitude query or document gets similarity 0.
    """
    # The query norm is the same for every document, so compute it once
    # rather than inside the loop.
    query_norm = math.sqrt(sum(a ** 2 for a in query_vector_bow))

    similarities = []
    for doc_vector in document_vectors:
        doc_norm = math.sqrt(sum(b ** 2 for b in doc_vector))

        # Check for zero magnitude vectors to avoid division by zero
        if query_norm == 0 or doc_norm == 0:
            similarities.append(0)
            continue

        dot_product = sum(a * b for a, b in zip(query_vector_bow, doc_vector))
        similarities.append(dot_product / (query_norm * doc_norm))

    # Sort document positions by similarity score, highest first
    return sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)

# Document Identification
def identify_documents(document_ids, ranked_document_ids, top_k=5):
    """Translate the best-ranked positions into document ids.

    Args:
        document_ids: Document ids, indexable by position.
        ranked_document_ids: Positions into ``document_ids``, best first.
        top_k: Maximum number of ids to return.

    Returns:
        list: The ids of the first ``top_k`` ranked positions, in rank order.
    """
    selected = []
    for position in ranked_document_ids[:top_k]:
        selected.append(document_ids[position])
    return selected

# User Response
def provide_user_response(top_documents):
    """Print a heading followed by one "Document ID: ..." line per document."""
    print("\nTop Documents:")
    report_lines = (f"Document ID: {doc_id}" for doc_id in top_documents)
    for line in report_lines:
        print(line)

# Main function to execute steps sequentially
def main():
    """Run the retrieval pipeline end to end for one interactive query.

    Loads and cleans the documents from 'sampled.csv', builds the vocabulary,
    reads a query from standard input, ranks the documents by cosine
    similarity of bag-of-words vectors and prints the top document ids.
    """
    # Process data
    documents, document_ids = process_data('sampled.csv')

    # Step 2: Word Counting
    word_counts = count_words(documents)

    # Step 3: Word Indexing
    word_index = create_word_index(word_counts)

    # Step 4: Vocabulary Creation
    vocabulary = create_vocabulary(word_index)

    # Step 5: Normalization
    # (computed for inspection only; the ranking below does not use it)
    normalized_counts = normalize_word_counts(word_counts)

    # Get user query. The documents were lowercased and stripped of
    # punctuation by clean_text, so the query must be normalised the same
    # way; otherwise a query such as "Delavan, Illinois" matches nothing.
    query = clean_text(input("Enter your query: "))

    # Step 6: Query Handling
    # (word -> count view of the query; not consumed by the ranking below)
    query_vector = process_query(query, word_index)

    # Step 7: Query Vectorization
    query_vector_bow = vectorize_query(query, vocabulary)

    # Document Vectorization (documents use the same bag-of-words encoding)
    document_vectors = [vectorize_query(doc, vocabulary) for doc in documents]

    # Document Ranking
    ranked_document_ids = rank_documents(query_vector_bow, document_vectors)

    # Document Identification
    top_documents = identify_documents(document_ids, ranked_document_ids)

    # User Response
    provide_user_response(top_documents)

if __name__ == "__main__":
    main()



Top Documents:
Document ID: 0
Document ID: 1
Document ID: 2
Document ID: 3
Document ID: 4
