<a href="https://colab.research.google.com/github/abdulrahman-nuzha/a-simple-ir-system/blob/main/IRSProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**IR system**

#Import Lib

In [1]:
#Lib
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Crawler

In [3]:
def crawl(start_url):
    response = requests.get(start_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    urls = [link['href'] for link in links]
    urls = [url for url in urls if not url.startswith('http')]
    urls = [start_url + url if not url.startswith('/') else start_url[:-1] + url for url in urls]

    return urls

# Get Page Contents

In [4]:
def content_web_page(url):

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    text_content = ' '.join([p.get_text() for p in paragraphs])
    return text_content

# Get Page links

In [5]:
def get_outgoing_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')

    outgoing_links = []
    for link in links:
        href = link.get('href')
        if href and href.startswith('http'):
            outgoing_links.append(href)

    return outgoing_links

# Page Rank

In [6]:
def calculate_page_rank(url_list, num_iterations=10, damping_factor=0.85):
    # Initialize PageRank scores
    page_rank_scores = {url: 1.0 for url in url_list}

    # Perform power iteration
    for _ in range(num_iterations):
        new_scores = {}
        total_score = 0.0

        for url in url_list:
            new_score = (1 - damping_factor)
            for linking_url in url_list:
                if url in get_outgoing_links(linking_url):
                    num_links = len(get_outgoing_links(linking_url))
                    new_score += damping_factor * (page_rank_scores[linking_url] / num_links)

            new_scores[url] = new_score
            total_score += new_score

        # Normalize scores
        for url in url_list:
            new_scores[url] /= total_score

        page_rank_scores = new_scores

    return page_rank_scores

# Preprocessing

In [7]:
def clean_text(text):
      text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
      text = text.lower()
      text = re.sub(r',', '', text)
      text = re.sub(r'\'', '',  text)
      text = re.sub(r'\"', '', text)
      text = re.sub(r'\(', '', text)
      text = re.sub(r'\)', '', text)
      text = re.sub(r'\n', '', text)
      text = re.sub(r'“', '', text)
      text = re.sub(r'”', '', text)
      text = re.sub(r'’', '', text)
      text = re.sub(r'\.', '', text)
      text = re.sub(r';', '', text)
      text = re.sub(r':', '', text)
      text = re.sub(r'\-', '', text)
      text = re.sub(r'[^\w\s]','',text,flags=re.MULTILINE)
      emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
      text=emoji_pattern.sub(r'', text)

      return text

In [8]:
def preprocess_text(text):
    text = clean_text(text)

    lemmatizer = WordNetLemmatizer()

    stop_words_en = set(stopwords.words("english"))
    cleaned_words = list()
    cleaned_words.append([word for word in word_tokenize(text) if word not in stop_words_en])

    tokens = []
    for i in cleaned_words[0]:
        lemma = lemmatizer.lemmatize(i)
        tokens.append(lemma)

    document = ' '.join(tokens)

    return tokens,document

#Building the Inverted Index

In [55]:
def build_inverted_index(start_url, limit):
    inverted_index = defaultdict(list)
    documents = []

    url_list = crawl(start_url)
    url_list = url_list[:limit]

    page_rank_scores = calculate_page_rank(url_list)

    for index, url in enumerate(url_list):
        content = content_web_page(url)
        tokens, doc = preprocess_text(content)
        documents.append(doc)

        # Store document postings in the inverted index
        for token in tokens:
            # Check if the document ID is already in the postings list for the token
            doc_exists = any(entry['url'] == url for entry in inverted_index[token])

            if not doc_exists:
                # Add a new document posting for the token
                inverted_index[token].append({
                    'document_id': index,
                    'url': url,
                    'page_rank_score': page_rank_scores[url],
                })

    return inverted_index, documents


In [66]:
# inverted_index,documents = build_inverted_index('https://nlp.stanford.edu/IR-book/',20)
inverted_index,documents = build_inverted_index('https://www.geeksforgeeks.org/c-plus-plus/?ref=shm',5)

In [67]:
len(documents)

5

In [68]:
inverted_index

defaultdict(list,
            {'c': [{'document_id': 0,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shm#main',
               'page_rank_score': 0.2},
              {'document_id': 1,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shmjavascript:void(0)',
               'page_rank_score': 0.4740740740740739}],
             'used': [{'document_id': 0,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shm#main',
               'page_rank_score': 0.2},
              {'document_id': 1,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shmjavascript:void(0)',
               'page_rank_score': 0.4740740740740739}],
             'popular': [{'document_id': 0,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shm#main',
               'page_rank_score': 0.2},
              {'document_id': 1,
               'url': 'https://www.geeksforgeeks.org/c-plus-plus/?ref=shmjavascript:void(0)',
   

#Algorithms

## TF-IDF

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

inverted_index_tfidf = defaultdict(list)
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

for doc_index, doc in enumerate(documents):
    # Get the TF-IDF scores for the terms in the document
    feature_index = tfidf_matrix[doc_index, :].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[doc_index, x] for x in feature_index])


#     for term, score in [(feature_names[i], score) for (i, score) in tfidf_scores]:
#         inverted_index_tfidf[term].append(score)
#         print(f"Term: {term}, TF-IDF score: {score:.4f}")
#     print('-' * 40)


In [60]:
def get_urls(user_terms,doc_index):
    urls = []
    for terms in user_terms:
        for term in word_tokenize(terms):
            if term in documents[doc_index]:
                for i in inverted_index[term]:
                    if i['document_id'] == doc_index and i['url'] not in urls:
                        urls.append(i['url'])
    return urls

In [61]:
def search_tfidf(user_query):
    user_query = [user_query]
    # Transform the user query into TF-IDF representation
    query_vector = vectorizer.transform(user_query)

    # Calculate the cosine similarity between the query vector and document vectors
    cosine_similarities = tfidf_matrix.dot(query_vector.T).toarray().flatten()

    # Sort the documents based on the cosine similarity scores
    sorted_indices = cosine_similarities.argsort()[::-1]

    not_found = True
    # Print the ranked results
    for rank, doc_index in enumerate(sorted_indices):
        if cosine_similarities[doc_index] != 0:
            urls = get_urls(user_query,doc_index)
            if urls:
                print(f"Rank: {rank+1}, Document: {doc_index}")
                print()
                print(f"Cosine Similarity: {cosine_similarities[doc_index]:.4f}")
                print()
                print(urls)
                not_found = False
    if not_found:
        print("not found")

##Page Rank

In [62]:
def search_pageRank(query):
    query_results = []

    #Preprocessing the query
    query,_ = preprocess_text(query)

    for token in query:
      for item in inverted_index[token]:
        query_results.append(item)

    # Sort the query results based on PageRank scores
    ranked_results = sorted(query_results, key=lambda x: x['page_rank_score'], reverse=True)

    ranked_results = list({tuple(sorted(item.items())): item for item in ranked_results}.values())

    if ranked_results:
      # Print the search results
      for rank,result in enumerate(ranked_results):
        print(f"Rank: {rank+1}, Document: {result['document_id']}")
        print()
        print(f"PageRank Score: {result['page_rank_score']:.4f}")
        print()
        print(result['url'])
    else:
      print("not found")

# Search

In [63]:
def search(query,method):
  if method == "tfidf":
    search_tfidf(query)
  elif method == "page rank":
    search_pageRank(query)
  else:
    print("Sorry this method is not avaliable for now")

In [64]:
search("programming link","tfidf")

Rank: 3, Document: 1

Cosine Similarity: 0.3846

['https://www.geeksforgeeks.org/c-plus-plus/?ref=shmjavascript:void(0)']
Rank: 4, Document: 0

Cosine Similarity: 0.3846

['https://www.geeksforgeeks.org/c-plus-plus/?ref=shm#main']


In [65]:
search("programming link","page rank")

Rank: 1, Document: 1

PageRank Score: 0.6944

https://www.geeksforgeeks.org/c-plus-plus/?ref=shmjavascript:void(0)
Rank: 2, Document: 0

PageRank Score: 0.2500

https://www.geeksforgeeks.org/c-plus-plus/?ref=shm#main
