In [1]:
import glob
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
from collections import Counter
import numpy as np
from collections import OrderedDict

In [2]:
def give_path(fld_path):                             #give path of the folder containing all documents
    dic = {}
    file_names = glob.glob(fld_path)
    files_150 = file_names[0:10]
    for file in files_150:
        name = file.split('/')[-1]
        with open(file, 'r', errors='ignore') as f:
            data = f.read()
        dic[name] = data
    return dic

In [3]:
def wordList_removePuncs(doc_dict):
    stop = stopwords.words('english') + list(string.punctuation) + ['\n']
    wordList = []
    for doc in doc_dict.values():
        for word in word_tokenize(doc.lower().strip()): 
            if not word in stop:
                wordList.append(word)
    return wordList

In [4]:
def termFrequencyInDoc(vocab, doc_dict):
    tf_docs = {}
    for doc_id in doc_dict.keys():
        tf_docs[doc_id] = {}
    
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_docs[doc_id][word] = doc.count(word)
    return tf_docs

In [5]:
def wordDocFre(vocab, doc_dict):
    df = {}
    for word in vocab:
        frq = 0
        for doc in doc_dict.values():
#             if word in doc.lower().split():
            if word in word_tokenize(doc.lower().strip()):
                frq = frq + 1
        df[word] = frq
    return df

In [6]:
def inverseDocFre(vocab,doc_fre,length):
    idf= {} 
    for word in vocab:     
        idf[word] = np.log2((length+1) / doc_fre[word])
    return idf

In [7]:
def tfidf(vocab,tf,idf_scr,doc_dict):
    tf_idf_scr = {}
    for doc_id in doc_dict.keys():
        tf_idf_scr[doc_id] = {}
    for word in vocab:
        for doc_id,doc in doc_dict.items():
            tf_idf_scr[doc_id][word] = tf[doc_id][word] * idf_scr[word]
    return tf_idf_scr

In [30]:
def vectorSpaceModel(query, doc_dict,tfidf_scr):
    query_vocab = []
    for word in query.split():
        if word not in query_vocab:
            query_vocab.append(word)

    query_wc = {}
    for word in query_vocab:
        query_wc[word] = query.lower().split().count(word)
    print(query_vocab)
    relevance_scores = {}
    for doc_id in doc_dict.keys():
        score = 0
        for word in query_vocab:
            print(query_wc[word] , tfidf_scr[doc_id][word])
            score += query_wc[word] * tfidf_scr[doc_id][word]
        relevance_scores[doc_id] = score
    sorted_value = OrderedDict(sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True))
    top_5 = {k: sorted_value[k] for k in list(sorted_value)[:5]}
    return top_5

In [9]:
path = 'test/*.txt'
docs = give_path(path)                        #returns a dictionary of all docs
M = len(docs)                                 #number of files in dataset
w_List = wordList_removePuncs(docs)           #returns a list of tokenized words
vocab = list(set(w_List))                     #returns a list of unique words
tf_dict = termFrequencyInDoc(vocab, docs)     #returns term frequency
df_dict = wordDocFre(vocab, docs)             #returns document frequencies
idf_dict = inverseDocFre(vocab,df_dict,M)     #returns idf scores
tf_idf = tfidf(vocab,tf_dict,idf_dict,docs)   #returns tf-idf socres

In [10]:
tf_idf

{'business.txt': {'adolf': 0.0,
  'presented': 0.0,
  'aware': 2.0,
  'duty': 0.0,
  'innocence': 2.0,
  'began': 0.0,
  'co-ordinator': 0.0,
  'faces': 2.0,
  'declared': 2.0,
  'heart': 0.0,
  'office': 1.0,
  'stand': 0.0,
  'wants': 0.0,
  'drew': 0.0,
  'basic': 0.0,
  'boring': 0.0,
  'alliances': 2.0,
  'answers': 0.0,
  '189': 2.0,
  'assistant': 0.0,
  'westminster': 0.0,
  'exit': 2.0,
  'investor': 4.0,
  'cambridge': 0.0,
  'years': 0.8300749985576875,
  'case': 0.0,
  'responsible': 1.0,
  'finance': 0.0,
  'disappoint': 0.0,
  'growing': 0.0,
  'take': 6.0,
  'nominations': 0.0,
  'beginning': 0.0,
  '30': 0.0,
  'laboratory': 0.0,
  'boost': 1.0,
  'forced': 2.0,
  '2002.': 0.0,
  'reunited': 0.0,
  'version': 0.0,
  'third-largest': 2.0,
  'saying': 0.0,
  'telecoms': 4.0,
  'conclude': 2.0,
  'four-year': 0.0,
  'society': 0.0,
  'consumer': 4.0,
  'sophie': 0.0,
  'dubbed': 0.0,
  'met': 4.0,
  'initially': 0.0,
  'framework': 0.0,
  'parkinson': 0.0,
  'polar': 0.0,


In [18]:
docs

{'business.txt': '"worldcom boss  left books alone  former worldcom boss bernie ebbers  who is accused of overseeing an $11bn (£5.8bn) fraud  never made accounting decisions  a witness has told jurors.  david myers made the comments under questioning by defence lawyers who have been arguing that mr ebbers was not responsible for worldcom s problems. the phone company collapsed in 2002 and prosecutors claim that losses were hidden to protect the firm s shares. mr myers has already pleaded guilty to fraud and is assisting prosecutors.  on monday  defence lawyer reid weingarten tried to distance his client from the allegations. during cross examination  he asked mr myers if he ever knew mr ebbers  make an accounting decision  .  not that i am aware of   mr myers replied.  did you ever know mr ebbers to make an accounting entry into worldcom books   mr weingarten pressed.  no   replied the witness. mr myers has admitted that he ordered false accounting entries at the request of former worl

In [32]:
top5 = vectorSpaceModel("although below last year", docs,tf_idf)    #returns top 5 documents using VSM
print('Top 5 Documents for Query 1: \n', top5)
print('\n')

['although', 'below', 'last', 'year']
1 1.0


KeyError: 'below'