In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


**Part:1**

In [2]:
# importing required libraries
import os
import sys
import re
import nltk
from nltk.stem.isri import ISRIStemmer
import zipfile
import collections
import pandas as pd

# function to read the stopwords file
def read_stopwords():
    stopwords = {}
    with open('Urdu stopwords.txt', 'r') as f:
        for line in f:
            stopwords[line.strip()] = 1
    stopwords[''] = 1 # to remove empty strings
    return stopwords

# getting max file number in the directory
def get_max_file_number():
    max_file_number = 0
    for file in os.listdir('Documents'):
        if file.endswith('.txt'):
            try:
                file_number = int(file.split('.')[0])
            except ValueError:
                continue
            if file_number > max_file_number:
                max_file_number = file_number
    return max_file_number


# reading a document and doing following:
# 1. removing punctuations
# 2. removing stopwords
# 3. stemming
# maintaining a dictionary of {term: term_id}
def read_document(file_number, stopwords, term_id):
    hash = {}
    file_name = 'Documents/' + str(file_number) + '.txt'
    with open(file_name, 'r') as f:
        text = f.read()
        text = re.sub(r'[^ء-ی]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.split(' ')
        #text = [word for word in text if word not in stopwords]
        #st = ISRIStemmer()
        #text = [st.stem(word) for word in text]
        count = 0
        for word in text:
            if word not in term_id:
                term_id[word] = len(term_id)
            # creating a dictionary of {term_id: [positions]}
            if term_id[word] not in hash:
                hash[term_id[word]] = [count]
            else:
                hash[term_id[word]].append(count)
            count += 1
    return hash

# creating an overall hash
def create_overall_hash():
    hash = {}
    max_file_number = get_max_file_number()
    stopwords = read_stopwords()
    term_id = {}
    for file_number in range(1, max_file_number + 1):
        try:
            hash[file_number] = read_document(file_number, stopwords, term_id)
        except:
            pass
    return hash, term_id

# function to write to termids.txt
def write_termids(term_id):
    with open('termids.txt', 'w') as f:
        for key, value in term_id.items():
            f.write(f"{value}\t{key}\n")


# function to write to docids.txt
def write_docids(hash):
    with open('docids.txt', 'w') as f:
        count = 1
        for key in hash.keys():
            f.write(f"{key}\t{count}\n")
            count += 1

# function to write to doc_index.txt
def write_doc_index(hash):
    with open('doc_index.txt','w') as f:
        for docid,val in hash.items():
            for key, value in val.items():
                f.write(f"{docid}\t{key}\t")
                for i in value:
                    f.write(f"{i}\t")
                f.write("\n")

if __name__ == '__main__':
    hash, terms = create_overall_hash()
    write_doc_index(hash)
    write_docids(hash)
    write_termids(terms)
    

**Part:2**

In [3]:
# function to write to term_index.txt
def write_term_index(hash):
    with open('term_index.txt', 'w') as f:
        for termid, val in hash.items():
            f.write(f"{termid}\t")
            for key, value in val.items():
                f.write(f"{key}:")
                f.write(f"{value[0]}\t")
                for i in range(1, len(value)):
                    f.write(f"0:{value[i] - value[i - 1]}\t")
            f.write("\n")

if __name__ == '__main__':
    write_term_index(hash)

In [4]:
def write_term_info(hash):
    with open('term_info.txt', 'w') as f:
        for termid, val in hash.items():
            f.write(f"{termid}\t")
            total_occurrences = 0
            total_documents = 0
            for key, value in val.items():
                total_documents += 1
                total_occurrences += len(value)
            f.write(f"{total_occurrences}\t{total_documents}\n")


if __name__ == '__main__':
    write_term_info(hash)


**Part:3**

In [None]:
import argparse

# function to read term_info.txt
def read_term_info():
    hash = {}
    with open('term_info.txt', 'r') as f:
        for line in f:
            line = line.split('\t')
            hash[int(line[0])] = [int(line[1]), int(line[2])]
    return hash


# function to read docids.txt
def read_docids():
    hash = {}
    with open('docids.txt', 'r') as f:
        for line in f:
            line = line.split('\t')
            hash[int(line[1])] = int(line[0])
    return hash


# function to read termids.txt

def read_termids():
    hash = {}
    with open('termids.txt', 'r') as f:
        for line in f:
            line = line.split('\t')
            hash[line[1].strip()] = int(line[0])
    return hash


# function to read term_index.txt
def read_term_index():
    hash = {}
    with open('term_index.txt', 'r') as f:
        for line in f:
            line = line.split('\t')
            termid = int(line[0])
            hash[termid] = {}
            for i in range(1, len(line)):
                line[i] = line[i].split(':')
                docid = int(line[i][0])
                hash[termid][docid] = []
                line[i] = line[i][1].split(',')
                for j in range(len(line[i])):
                    hash[termid][docid].append(int(line[i][j]))
    return hash


# function to read doc_index.txt
def read_doc_index():
    hash = {}
    with open('doc_index.txt', 'r') as f:
        for line in f:
            line = line.split('\t')
            docid = int(line[0])
            termid = int(line[1])
            hash[docid] = {}
            hash[docid][termid] = []
            for i in range(2, len(line)):
                hash[docid][termid].append(int(line[i]))
    return hash



# function to get the termid
def get_termid(term):
    hash = read_termids()
    if term in hash:
        return hash[term]
    else:
        return -1
    

# function to get the docid
def get_docid(doc):
    hash = read_docids()
    if doc in hash:
        return hash[doc]
    else:
        return -1
    

# function to get the term frequency in corpus
def get_term_frequency(termid):
    hash = read_term_info()
    return hash[termid][0]


# function to get the number of documents containing term
def get_num_docs(termid):
    hash = read_term_info()
    return hash[termid][1]


# function to get the inverted list offset
def get_inverted_list_offset(termid):
    hash = read_term_index()
    return hash[termid]


# function to get the term frequency in document
def get_term_frequency_in_doc(termid, docid):
    hash = read_doc_index()
    return len(hash[docid][termid])


# function to get the positions
def get_positions(termid, docid):
    hash = read_doc_index()
    return hash[docid][termid]


# function to get the distinct terms
def get_distinct_terms(docid):
    hash = read_doc_index()
    return len(hash[docid])


# function to get the total terms
def get_total_terms(docid):
    hash = read_doc_index()
    total = 0
    for termid in hash[docid]:
        total += len(hash[docid][termid])
    return total

#write main function
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--doc', type=str, help='doc name')
    parser.add_argument('--term', type=str, help='term')
    args = parser.parse_args()
    if args.doc and args.term:
        termid = get_termid(args.term)
        docid = get_docid(args.doc)
        if termid == -1 or docid == -1:
            print('Invalid term or doc')
            return
        print('Inverted list for term:', args.term)
        print('In document:', args.doc)
        print('TERMID:', termid)
        print('DOCID:', docid)
        print('Term frequency in document:', get_term_frequency_in_doc(termid, docid))
        print('Positions:', get_positions(termid, docid))
    elif args.doc:
        docid = get_docid(args.doc)
        if docid == -1:
            print('Invalid doc')
            return
        print('Listing for document:', args.doc)
        print('DOCID:', docid)
        print('Distinct terms:', get_distinct_terms(docid))
        print('Total terms:', get_total_terms(docid))
    elif args.term:
        termid = get_termid(args.term)
        if termid == -1:
            print('Invalid term')
            return
        print('Listing for term:', args.term)
        print('TERMID:', termid)
        print('Number of documents containing term:', get_num_docs(termid))
        print('Term frequency in corpus:', get_term_frequency(termid))
        print('Inverted list offset:', get_inverted_list_offset(termid))
    else:
        print('Invalid input')

if __name__ == '__main__':
    main()
    


usage: ipykernel_launcher.py [-h] [--doc DOC] [--term TERM]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-c79e8df0-28bd-4673-a497-041add5d21f7.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
