In [None]:
from google.colab import drive
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance shared by the indexing and query-processing cells.
porter = PorterStemmer()

In [None]:
# Fetch the NLTK English stop-word corpus.
nltk.download('stopwords')
# Stored as a set: the notebook only ever asks `token in stop_words`, and the
# indexer does so once per token of the corpus, so O(1) lookups beat a list scan.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
drive.mount('/content/drive')
% cd /content/drive/MyDrive/cran/

Mounted at /content/drive
/content/drive/MyDrive/cran


In [None]:
import re
def remove_symbols(line):
  """Strip punctuation from `line` and lower-case it.

  Every run of characters other than ASCII letters, digits and whitespace is
  deleted (not replaced by a space, so 'high-speed' becomes 'highspeed'), and
  the remaining text is lower-cased.

  Args:
    line: input string.

  Returns:
    The cleaned, lower-cased string.
  """
  # Raw string literal: in a normal literal '\s' is an invalid escape sequence,
  # which recent Python versions warn about.
  return re.sub(r'[^A-Za-z0-9\s]+', '', line).lower()

In [None]:
# Create posting lists for the title and body zones
# posting   -> body zone,  read from IR3_OUTPUTS/OP0.txt
# posting_t -> title zone, read from IR3_OUTPUTS/titles.txt

def _load_postings(path):
  """Read a posting file into a dict mapping term -> set of doc-id strings.

  Each line is parsed as ``term id id id ...`` (whitespace separated).

  Args:
    path: path of the posting file to read.

  Returns:
    dict of term -> set of doc ids (kept as strings, exactly as read).
  """
  index = {}
  with open(path) as f:
    # The first line is skipped. The default argument keeps an empty file from
    # raising StopIteration.
    # NOTE(review): cran_bsbi writes no header line, so this drops the first
    # (alphabetically smallest) term of the file - confirm that is intended.
    next(f, None)
    for line in f:
      # split() without an argument collapses repeated whitespace, so no ''
      # doc ids can end up in the sets; blank lines are skipped entirely.
      parts = line.split()
      if not parts:
        continue
      word, *docs = parts
      index[word] = set(docs)
  return index


posting = _load_postings('IR3_OUTPUTS/OP0.txt')
posting_t = _load_postings('IR3_OUTPUTS/titles.txt')

In [None]:
def find_docs(word1, *words, index=None):
  """Evaluate a boolean query left to right against a posting list.

  Args:
    word1: first term; its posting set seeds the result.
    *words: further operands, each written as '<op> <term>' where <op> is
      '~' (difference / AND NOT), '*' (intersection / AND) or '|' (union / OR).
    index: optional mapping of term -> set of doc ids. Defaults to the
      module-level body index `posting`.

  Returns:
    List of matching doc ids, in arbitrary (set iteration) order.

  Terms that are absent from the index are treated as having an empty posting
  set rather than raising KeyError. An operand whose operator prefix is not
  recognised prints "Invalid operator" and stops evaluation; the result
  accumulated up to that point is returned.
  """
  if index is None:
    index = posting

  # Operator prefix (including the separating space) -> set operation.
  operations = {
      '~ ': set.difference,
      '* ': set.intersection,
      '| ': set.union,
  }

  # Copy so the index's own set can never be exposed or mutated by callers.
  res = set(index.get(word1, ()))

  for word in words:
    combine = operations.get(word[0:2])
    if combine is None:
      print("Invalid operator")
      break
    res = combine(res, index.get(word[2:], ()))

  return list(res)


# Example boolean query: doppler OR divers OR perturb ('| ' is the union operator).
print(find_docs('doppler', '| divers', '| perturb'))

['26', '1110', '1370', '496', '73', '1248', '447', '1154', '1224', '499', '305', '299', '70', '131', '149', '613', '916', '903', '21', '1203', '660', '324', '371', '129']


In [None]:
import numpy as np

def find_doc_from_query(terms, n_docs=1400):
  """Build the per-document zone-match matrix for a query.

  Args:
    terms: list of query terms, passed unchanged to find_docs_from_zone.
    n_docs: number of rows of the result, i.e. the size of the collection.
      Doc ids are treated as 1-based, so id d maps to row d - 1.

  Returns:
    A (n_docs, 2) float array. Column 0 is 1.0 for every doc id returned by
    the title-zone lookup, column 1 is 1.0 for every doc id returned by the
    body-zone lookup; all other entries are 0.0.
  """
  weighted = np.zeros((n_docs, 2))

  # Column order: 0 -> title zone, 1 -> body zone.
  zone_hits = (
      find_docs_from_zone(terms, zone='title'),
      find_docs_from_zone(terms),
  )

  for column, doc_ids in enumerate(zone_hits):
    for doc_id in doc_ids:
      # Empty ids can come from malformed posting lines; skip them in BOTH
      # zones (previously only the title loop did, so the body loop could
      # fail on int('')).
      if doc_id == '':
        continue
      weighted[int(doc_id) - 1][column] = 1.

  return weighted

In [None]:
def find_docs_from_zone(terms, zone='body', zone_indexes=None):
  """Return the union of the posting sets of `terms` within one zone.

  Args:
    terms: iterable of query terms.
    zone: name of the zone to search; 'body' or 'title' with the default
      indexes.
    zone_indexes: optional mapping of zone name -> posting dict
      (term -> set of doc ids). Defaults to the module-level indexes
      {'body': posting, 'title': posting_t}.

  Returns:
    A new set holding every doc id that contains at least one of the terms in
    the requested zone. Terms missing from the index contribute nothing, and
    an empty `terms` yields an empty set.

  Raises:
    ValueError: if `zone` is not one of the known zone names.
  """
  if zone_indexes is None:
    zone_indexes = {'body': posting, 'title': posting_t}

  if zone not in zone_indexes:
    raise ValueError(f'Unknown zone: {zone!r}')
  index = zone_indexes[zone]

  # Start from a fresh set so the index's own posting sets are never returned
  # by reference, and treat every term (including the first) the same way.
  res = set()
  for word in terms:
    res.update(index.get(word, ()))

  return res

In [None]:
# Normalise the query the same way the indexer normalises documents
# (strip symbols, lower-case, drop stop words, stem), then build the
# zone-match matrix for it.
query = 'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .'

stemmed_terms = []
for x in remove_symbols(query).split():
  if x in stop_words:
    continue
  stemmed_terms.append(porter.stem(x))

# `res` keeps its original shape: every stemmed term preceded by one space.
res = ''.join(' ' + term for term in stemmed_terms)

weighted = find_doc_from_query(res.split())
# print(find_doc_from_query(res.strip().split()))
# res.strip()

In [None]:
# Relevance vector for the first query only: rel[d - 1] is set to 1.0 for
# every document id d that cranqrel lists on a row whose first column is '1'.
# Any further columns on the row are ignored.

rel = np.zeros((1400, 1))  # 1400 -> no of docs
with open('cranqrel') as f:
  for line in f:
    row = line.split()
    # Rows with fewer than two columns (e.g. blank lines) are skipped instead
    # of failing with IndexError; rows for other queries are ignored.
    if len(row) < 2 or row[0] != '1':
      continue
    rel[int(row[1]) - 1][0] = 1.


In [None]:
# `weighted` is a tall (n_docs, 2) matrix, so it has no ordinary inverse.
# Its Moore-Penrose pseudo-inverse applied to `rel` gives the least-squares
# solution g of  weighted . g ~= rel,  i.e. one coefficient per zone column
# (row 0 -> title column, row 1 -> body column).
a = np.linalg.pinv(weighted)
result = a.dot(rel)
result

array([[0.05374606],
       [0.0125    ]])

In [None]:
# Flush threshold used by cran_bsbi: once its in-memory counter (incremented
# for each new term and each new term/doc posting) reaches this value, the
# current block is written to disk.
BLOCK_SIZE = 100000

In [None]:
# Block-Sort Based Indexing Algorithm Implementation

from collections import defaultdict

def _write_postings(path, index):
  """Write a term -> doc-id-set mapping to `path` as 'term id id ...' lines, sorted by term."""
  with open(path, 'w') as out:
    for term in sorted(index):
      # Doc ids are written in ascending order so the file is deterministic.
      ids = ''.join(f' {doc}' for doc in sorted(index[term]))
      out.write(f'{term}{ids}\n')


def cran_bsbi():
  """Build block-wise inverted indexes from 'cran.all.1400'.

  The corpus is read line by line. Lines starting with '.I', '.T', '.A', '.B'
  and '.W' are treated as markers: '.I <n>' starts document n, the others
  switch the current section. Text in the '.A' and '.B' sections is skipped;
  text in every other section is cleaned with remove_symbols, filtered against
  stop_words, stemmed with porter and added to the main index. Terms seen in
  the '.T' section are additionally added to the title index.

  Output (all under ./IR3_OUTPUTS/):
    OP<k>.txt  - one file per block of the main index. A block is flushed when
                 the in-memory counter (new terms + new postings) reaches
                 BLOCK_SIZE, and whatever remains is written at the end.
    titles.txt - the title index, written once at the end.

  NOTE: no merge pass is performed here; when more than one OP file is
  produced, each one holds only a partial index.

  Returns:
    None.
  """
  freq_dict = defaultdict(set)      # term -> doc ids, current block only
  id_to_title = defaultdict(set)    # term -> doc ids whose title contains it
  doc_id = 0
  total_files = 0                   # number of block files written so far
  current_block = 0                 # size counter of the in-memory block
  section = ''                      # marker of the section being read
  rows = 0                          # corpus lines read, for progress output

  with open('cran.all.1400') as corpus:
    for line in corpus:
      rows += 1
      marker = line[0:2]

      if marker == '.I':
        doc_id = int(line[3:].strip())
        # Reset the section so no state leaks in from the previous document.
        section = ''
        continue

      if marker in ('.T', '.A', '.B', '.W'):
        # Marker lines carry no text. In particular '.W' must be skipped too,
        # otherwise it is tokenised and indexed as the term 'w' for every doc.
        section = marker
        continue

      if section in ('.A', '.B'):
        continue

      for word in line.split():
        word = remove_symbols(word)
        if not word or word in stop_words:
          continue
        word = porter.stem(word)

        if word not in freq_dict:
          current_block += 1

        if doc_id not in freq_dict[word]:
          freq_dict[word].add(doc_id)
          current_block += 1

        if section == '.T':
          id_to_title[word].add(doc_id)

        if current_block >= BLOCK_SIZE:
          _write_postings(f'./IR3_OUTPUTS/OP{total_files}.txt', freq_dict)
          current_block = 0
          freq_dict.clear()
          total_files += 1
          # f-string: concatenating the int counter with a str raised TypeError.
          print(f'{rows} rows done!')

  # Final, possibly partial, block of the main index.
  if freq_dict:
    _write_postings(f'./IR3_OUTPUTS/OP{total_files}.txt', freq_dict)

  if id_to_title:
    _write_postings('./IR3_OUTPUTS/titles.txt', id_to_title)

cran_bsbi()

In [None]:
# Weighted Zone Scoring
# Clean the query, drop stop words, stem what is left, then look the terms up
# in both zones.
query = 'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .'

res = ''
for x in remove_symbols(query).split():
  if x in stop_words:
    continue
  res = f'{res} {porter.stem(x)}'

scores = find_doc_from_query(res.split())
# print(find_doc_from_query(res.strip().split()))
# res.strip()

{'119', '260', '631', '1191', '550', '1168', '1303', '797', '792', '81', '70', '508', '944', '55', '283', '586', '1395', '725', '615', '671', '518', '54', '655', '1050', '399', '1159', '1264', '179', '1274', '638', '303', '1164', '790', '215', '269', '209', '1341', '1062', '82', '1161', '555', '519', '1163', '754', '711', '799', '877', '662', '962', '500', '1165', '916', '982', '181', '485', '1226', '139', '349', '37', '413', '359', '387', '315', '7', '959', '292', '141', '1166', '685', '1114', '924', '51', '253', '248', '302', '251', '442', '174', '252', '1281', '23', '147', '651', '593', '391', '1104', '270', '53', '687', '1201', '435', '430', '1212', '566', '983', '1107', '879', '563', '486', '666', '679', '755', '242', '1213', '395', '872', '559', '1197', '546', '572', '1386', '431', '1207', '358', '493', '513', '1089', '665', '707', '1192', '1393', '1084', '914', '1064', '1219', '211', '781', '817', '1268', '22', '1098', '945', '814', '80', '244', '670', '536', '700', '1320', '102

In [None]:
# Column-wise total of (relevance - zone match indicator).
# rel has one column and scores has one column per zone, so NumPy broadcasting
# subtracts each zone column from the relevance column.
diff = np.subtract(rel, scores)
# Deliberately not named `sum`: that would shadow the builtin sum() for every
# cell executed afterwards.
diff_total = np.sum(diff, axis=0)
diff_total
# error = diff_total**2
# error[0]

array([-624.])

In [None]:
# Number of distinct terms loaded into the body-zone posting list.
len(posting)

6391