### Imports

In [1]:
import os
import nltk
import pickle
import re
from nltk.tokenize import word_tokenize
import string
import collections
import math
import operator

#### Helper functions for cleaning corpus

In [2]:
# Character class matching any run of ASCII punctuation.
# string.punctuation contains "\", "]", "^" and "-", which are special inside
# [...]; unescaped, the embedded "\]" is parsed as an escaped "]" and the
# backslash itself is never matched.  re.escape makes every character literal.
# "+" (rather than "*") avoids a pointless empty match at every position.
regex = "[" + re.escape(string.punctuation) + "]+"
# "<word> <digits>, <digits> <digits>:<digits> <AM/PM letters>"
# Raw strings keep "\s" from being treated as an (invalid) string escape.
timestamp_regex1 = r"[A-Za-z]+\s[0-9]+,\s[0-9]+\s+[0-9]+:[0-9]+\s[AM]*[PM]*"
# "<word>, <digits>"
timestamp_regex2 = r"[A-Za-z]+,\s[0-9]+"


# Remove Punctuation
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` without any character from ``string.punctuation``.
    """
    return re.sub(regex, '', text)


# Clean the document content
def clean_content(text):
    """Clean one raw document and return its lower-cased tokens.

    The HTML wrapper tags, both timestamp patterns, all punctuation and any
    run of three whitespace-separated numbers are removed before tokenising.

    Args:
        text: Raw text of one corpus document.

    Returns:
        list: Tokens produced by ``word_tokenize`` on the cleaned,
        lower-cased text.
    """
    # Remove html code
    # Every document in this corpus contains only two kinds
    # of HTML tags
    for tag in ('<html>', '<pre>', '</html>', '</pre>'):
        text = text.replace(tag, '')

    # Remove timestamps (both module-level patterns)
    text = re.sub(timestamp_regex1, '', text)
    text = re.sub(timestamp_regex2, '', text)

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove numbers towards the end of the file.
    # Raw string: "\s" is a regex escape, not a Python string escape.
    text = re.sub(r'[0-9]+\s[0-9]+\s[0-9]+', '', text)

    # Split into tokens
    return word_tokenize(text.lower())

### Load the corpus

In [4]:
# The commented-out code below is the disabled one-off job that cleans every
# file under ./cacm/ and pickles the result to corpus.p.  It is kept for
# reference; the live code only loads that pickle.

# cwd = os.getcwd()+"/cacm/"
# list_dir = os.listdir(cwd)

# corpus_dict = {}

# print("Processing all files")

# for l in list_dir:
#     f = open(cwd+l, "r+")
#     doc_id = l.split(".html")
#     doc_content=clean_content(f.read())
#     corpus_dict[doc_id[0]]=doc_content

# print("Writing to pickle file")
# pickle.dump(corpus_dict, open(os.getcwd()+"/corpus.p", "wb"))

# SECURITY: unpickling can execute arbitrary code -- only load a corpus.p
# that was produced locally by this notebook.
# The context manager closes the file handle once the pickle is read.
with open(os.path.join(os.getcwd(), "corpus.p"), "rb") as corpus_file:
    corpus_dict = dict(pickle.load(corpus_file, encoding="utf-8"))
print("corpus loaded")

corpus loaded


### Generate the unigram inverted index (source of the document frequencies, DF)

In [5]:
# Disabled one-off job, kept for reference.  It builds, from corpus_dict:
#   1. unigram_dict_inverse: doc_id -> {word: occurrences in that document}
#   2. unigram_dict:         word   -> {doc_id: occurrences in that document}
# then pickles unigram_dict to unigrams.p and writes a text copy to
# Unigrams.txt.

# print("Constructing inverse map for unigrams for files ")
# unigram_dict_inverse={}
# unigram_dict={}
# for c, words in corpus_dict.items():
#     unigram_dict_inverse[c] = dict(collections.Counter(words))
# print("Inverse map constructed")

# print("Constructing unigrams index")
# for c, words in unigram_dict_inverse.items():
#     for word, freq in words.items():
#         if word not in unigram_dict:
#             unigram_dict[word] = {}
#         temp_dict = unigram_dict[word]
#         temp_dict[c] = freq
#         unigram_dict[word]=temp_dict

# print("Unigrams inverted index constructed!")
# pickle.dump(unigram_dict, open("unigrams.p","wb"))


# Write Unigrams Inverted Index to file
# print("Writing Unigrams inverted index to file")
# f = open("Unigrams.txt","w+")
# for k, v in unigram_dict.items():
#     f.write(k+" : "+str(v)+"\n")
# f.close()



Done


In [None]:
# Load the unigram inverted index (term -> {doc_id: occurrences in that doc}).
# A term's document frequency is the number of entries in its inner dict.
# SECURITY: unpickling can execute arbitrary code -- only load a unigrams.p
# that was produced locally by this notebook.

with open(os.path.join(os.getcwd(), "unigrams.p"), "rb") as unigrams_file:
    unigram_dict = dict(pickle.load(unigrams_file, encoding="utf-8"))
print("Unigrams loaded")

In [6]:
# Generate collection term frequencies: for every term, the total number of
# occurrences summed over all documents.  (This is a term-frequency table,
# not IDF.)  Disabled one-off job, kept for reference.

# unigram_freq_table={}
# print("Constructing an inverted index from unigrams")
# f=open("Unigrams_tf_table.txt","w+")

# # Constructing the term frequency table
# for k, values in unigram_dict.items():
#     unigram_freq_table[k]=sum(values.values())

# print("Frequencies calculated")
# for k, v in unigram_freq_table.items():
#     f.write(str((k, v))+"\n")

# f.close()


# NOTE(review): the dump below pickles unigram_dict, not unigram_freq_table,
# so unigrams_tf.p receives the inverted index rather than the totals
# computed above -- confirm which of the two is intended.
# pickle.dump(unigram_dict, open("unigrams_tf.p","wb"))

In [None]:
# Load Term Frequencies
# NOTE(review): the disabled generator cell pickles unigram_dict into
# unigrams_tf.p, so this table may actually hold the inverted index
# (term -> {doc_id: count}) rather than per-term totals -- verify the file.
# SECURITY: unpickling can execute arbitrary code -- only load a pickle that
# was produced locally by this notebook.
with open(os.path.join(os.getcwd(), "unigrams_tf.p"), "rb") as tf_file:
    unigram_freq_table = dict(pickle.load(tf_file, encoding="utf-8"))
print("Term Frequencies loaded")

In [12]:
# Calculate IDF
# IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

idf = {}
N = len(corpus_dict)
# The number of documents containing a term is the size of its postings dict
# in the inverted index.  Reading it from unigram_dict (instead of calling
# .keys() on unigram_freq_table entries) keeps this cell working when the
# term-frequency table holds plain integer totals.
for term, postings in unigram_dict.items():
    idf[term] = math.log(N / len(postings))

print("IDF calculated!")
#TODO : Write to PICKLE FILE and load

Done


In [13]:
# Calculate TF-IDF for every document per query
# Sample query - Automatic Implementation

query = "Automatic Implementation"
# Tokenise once; the same term list drives candidate selection and scoring.
query_terms = word_tokenize(query.lower())

# Candidate documents: every document on the inverted list of at least one
# query term.  dict.fromkeys de-duplicates in constant time per document
# while keeping first-seen order (the list-membership test it replaces
# rescanned the whole list for every posting).  A query term that is absent
# from the index contributes no documents instead of raising KeyError.
doc_list = list(dict.fromkeys(
    d for term in query_terms for d in unigram_dict.get(term, {})
))

ranked_documents = {}
for d in doc_list:
    doc_len = len(corpus_dict[d])
    score = 0.0
    for term in query_terms:
        postings = unigram_dict.get(term, {})
        if d in postings:
            # tf is the in-document count normalised by document length
            tf_term = postings[d] / doc_len
            idf_term = idf[term]
            score += tf_term * idf_term

    ranked_documents[d] = score


In [14]:
# sort by rank (highest score first)
rank_sort = sorted(ranked_documents.items(), key=operator.itemgetter(1), reverse=True)

# Keep at most the 100 best documents.  Slicing also works when fewer than
# 100 documents matched, where indexing rank_sort[i] for i in range(100)
# raised IndexError.
top_k = 100
output = "".join(
    str(doc) + " : " + str(score) + "\n" for doc, score in rank_sort[:top_k]
)

# The context manager closes the file even if the write fails.
with open("q_test.txt", "w") as f:
    f.write(output)
#output

'CACM-2415 : 0.3564881978383928\nCACM-0145 : 0.3327223179825\nCACM-0034 : 0.3327223179825\nCACM-0193 : 0.27726859831874995\nCACM-0273 : 0.27726859831874995\nCACM-0018 : 0.25594024460192305\nCACM-1624 : 0.2464609762833333\nCACM-2942 : 0.23397826052275494\nCACM-0098 : 0.2079514487390625\nCACM-0189 : 0.2079514487390625\nCACM-2239 : 0.2079514487390625\nCACM-2074 : 0.20164988968636363\nCACM-0329 : 0.2007807091273707\nCACM-0022 : 0.1901073366747384\nCACM-0987 : 0.18484573221249997\nCACM-2024 : 0.17511700946447367\nCACM-0080 : 0.16636115899125\nCACM-0274 : 0.16636115899125\nCACM-0400 : 0.15208586933979074\nCACM-1682 : 0.14717987355463616\nCACM-2585 : 0.14484368508551496\nCACM-0097 : 0.14466187738369565\nCACM-1044 : 0.14466187738369565\nCACM-1087 : 0.12797012230096153\nCACM-3184 : 0.11882939927946427\nCACM-1352 : 0.11674467297631577\nCACM-1457 : 0.11628024041313226\nCACM-0059 : 0.1109074393275\nCACM-3031 : 0.1041684036573909\nCACM-1252 : 0.10315067815367887\nCACM-2193 : 0.10139057955986047\nCA

### Task 3 

#### Removing stop words from the corpus

In [16]:
# Read one stop word per line, dropping only the trailing newline.
# The context manager closes the file once the list is built.
with open('common_words') as stop_file:
    stop_words = [word.rstrip('\n') for word in stop_file]
#stop_words

In [20]:
def diff(l1, l2):
    """Return the items of ``l1`` that are not contained in ``l2``.

    Order and duplicates of ``l1`` are preserved.

    Args:
        l1: Iterable of items to filter.
        l2: Container of items to exclude; tested with ``in``.

    Returns:
        list: Every ``x`` from ``l1`` for which ``x not in l2``.
    """
    return [x for x in l1 if x not in l2]

In [107]:
# Build the lookup once: set membership is a hash lookup, whereas testing
# each token against the stop_words list rescans the whole list per token.
stop_word_set = set(stop_words)

# doc_id -> that document's tokens with every stop word removed
stopped_corpus = {
    doc: diff(content, stop_word_set)
    for doc, content in corpus_dict.items()
}

print(len(stopped_corpus))

#stopped_corpus

3204


{'CACM-0270': ['techniques',
  'storage',
  'allocation',
  'algorithms',
  'cacm',
  'kelley',
  'jr',
  'ca611011',
  'jb'],
 'CACM-1898': ['regular',
  'coulomb',
  'wave',
  'functions',
  'algorithm',
  '292',
  's22',
  'cacm',
  'kolbig',
  'coulomb',
  'wave',
  'functions',
  'wave',
  'functions',
  'regular',
  'coulomb',
  'wave',
  'functions',
  '512',
  'ca690511',
  'jb'],
 'CACM-1932': ['logarithmic',
  'error',
  'newtons',
  'method',
  'square',
  'root',
  'problem',
  'obtaining',
  'optimal',
  'starting',
  'values',
  'calculation',
  'square',
  'root',
  'newtons',
  'method',
  'considered',
  'pointed',
  'relative',
  'error',
  'measure',
  'goodness',
  'fit',
  'optimal',
  'results',
  'obtained',
  'initial',
  'approximation',
  'fit',
  'shown',
  'socalled',
  'logarithmic',
  'error',
  'initial',
  'fit',
  'optimal',
  'types',
  'error',
  'logarithmic',
  'error',
  'appears',
  'simplify',
  'problem',
  'determining',
  'optimal',
  'initial

In [109]:
# The context manager flushes and closes the file, so the pickle is complete
# on disk before the confirmation is printed.
with open("stopped_corpus.p", "wb") as stopped_file:
    pickle.dump(stopped_corpus, stopped_file)
print("Stopped corpus written to file")

#TODO - load stopped corpus the next time

Stopped corpus written to file


#### Using stemmed corpus

In [100]:
# Reading and parsing the stemmed corpus
# provided in cacm_stem.txt
#
# A line starting with "# <digits>" opens a new document; every other line is
# stripped and appended to the document opened most recently.
pattern = r'#\s[0-9]+'
stemmed_corpus_temp = {}
# Reset explicitly so a doc_id left over from an earlier cell or run can
# never be mistaken for the current document.
doc_id = None
with open('cacm_stem.txt') as f:
    for item in f:
        if re.match(pattern, item):
            doc_id = re.split(r'#\s', item.strip())[1]
            stemmed_corpus_temp[doc_id] = []
        elif doc_id is None:
            # Text before the first header has no document to belong to.
            if item.strip():
                raise ValueError(
                    "cacm_stem.txt: content found before the first '# <id>' header"
                )
        else:
            stemmed_corpus_temp[doc_id].append(item.strip())
stemmed_corpus_temp

{'1': ['preliminari report intern algebra languag cacm decemb 1958 perli a',
  'j samelson k ca581203 jb march 22 1978 8 28',
  'pm 100 5 1 123 5 1 164 5 1',
  '1 5 1 1 5 1 1 5 1 205',
  '5 1 210 5 1 214 5 1 1982 5',
  '1 398 5 1 642 5 1 669 5 1',
  '1 6 1 1 6 1 1 6 1 1',
  '6 1 1 6 1 1 6 1 1 6',
  '1 1 6 1 1 6 1 1 6 1',
  '165 6 1 196 6 1 196 6 1 1273',
  '6 1 1883 6 1 324 6 1 43 6',
  '1 53 6 1 91 6 1 410 6 1',
  '3184 6 1'],
 '2': ['extract of root by repeat subtract for digit comput cacm',
  'decemb 1958 sugai i ca581202 jb march 22 1978 8',
  '29 pm 2 5 2 2 5 2 2 5',
  '2'],
 '3': ['techniqu depart on matrix program scheme cacm decemb 1958 friedman',
  'm d ca581201 jb march 22 1978 8 30 pm',
  '3 5 3 3 5 3 3 5 3'],
 '4': ['glossari of comput engin and program terminolog cacm novemb 1958',
  'ca581103 jb march 22 1978 8 32 pm 4 5',
  '4 4 5 4 4 5 4'],
 '5': ['two squar root approxim cacm novemb 1958 wadei w g',
  'ca581102 jb march 22 1978 8 33 pm 5 5',
  '5 5 5 5 5 5 5'],
 '6': [

In [108]:
# Process the content in every document
# i.e. remove those strings which contain ONLY numbers and timestamps

# Should we index terms like 'ca581001' ?

# A line made up solely of digits and spaces.
numeric_line = re.compile(r'^ *[0-9][0-9 ]*$')
# "<word> <n> <n> <n> <n> am|pm" plus any digits/whitespace that follow it.
# (?:am|pm) is a real alternation; the character class [am|pm]+ it replaces
# matched any run of the characters a, m, p and "|".
timestamp_tail = re.compile(
    r'[a-z]+\s[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+\s(?:am|pm)\b[\s0-9]*'
)

stemmed_corpus = {}
for doc_id, content in stemmed_corpus_temp.items():
    new_doc_id = "CACM-" + str(doc_id).zfill(4)

    # Drop the purely numeric lines, then join the rest into one string.
    kept_lines = [line for line in content if numeric_line.match(line) is None]
    all_content = " ".join(kept_lines)

    # Remove the timestamp and the numbers that trail it.  (The str.split
    # call that used to precede this received a regex as a literal separator
    # and therefore never split anything.)
    final_content = timestamp_tail.sub('', all_content)

    # Each document is stored as a one-element list holding its cleaned text.
    stemmed_corpus[new_doc_id] = [final_content]

print(len(stemmed_corpus))

stemmed_corpus


3204


{'CACM-0001': ['preliminari report intern algebra languag cacm decemb 1958 perli a j samelson k ca581203 jb '],
 'CACM-0002': ['extract of root by repeat subtract for digit comput cacm decemb 1958 sugai i ca581202 jb '],
 'CACM-0003': ['techniqu depart on matrix program scheme cacm decemb 1958 friedman m d ca581201 jb '],
 'CACM-0004': ['glossari of comput engin and program terminolog cacm novemb 1958 ca581103 jb '],
 'CACM-0005': ['two squar root approxim cacm novemb 1958 wadei w g ca581102 jb '],
 'CACM-0006': ['the us of comput in inspect procedur cacm novemb 1958 muller m e ca581101 jb '],
 'CACM-0007': ['glossari of comput engin and program terminolog cacm octob 1958 ca581003 jb '],
 'CACM-0008': ['on the equival and transform of program scheme cacm octob 1958 friedman m d ca581002 jb '],
 'CACM-0009': ['propos for an uncol cacm octob 1958 conwai m e ca581001 jb '],
 'CACM-0010': ['glossari of comput engin and program terminolog cacm septemb 1958 ca580903 jb '],
 'CACM-0011': ['th

In [103]:
# The context manager flushes and closes the file, so the pickle is complete
# on disk before the confirmation is printed.
with open("stemmed_corpus.p", "wb") as stemmed_file:
    pickle.dump(stemmed_corpus, stemmed_file)
print("Done")

#TODO - load stemmed corpus the next time

Done
