### Required imports

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [107]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

### Listing the available files

In [108]:
# Collect the corpus file names in alphabetical order so that document
# indices are stable from run to run.
file_list = sorted(os.listdir('./data/data'))

### Getting the documents

In [109]:
# Load the full text of every listed file, one string per document, in the
# same order as `file_list`.
documents = []
for file in file_list:
    with open(os.path.join('./data/data', file)) as f:
        # f.read() returns the whole file at once; this yields the same text
        # as concatenating readlines() without rebuilding the string per line.
        documents.append(f.read())

In [110]:
# Keep the raw texts as a NumPy array; the search helper later indexes into
# this array to print the original (un-preprocessed) articles.
documents = np.asarray(documents)


In [111]:
# Wrap the raw texts in a single-column frame named "documents".
documents_df = pd.DataFrame(documents, columns=["documents"])

In [112]:
# Display the frame of raw documents as a sanity check before preprocessing.
documents_df

Unnamed: 0,documents
0,Ad sales boost Time Warner profit\n\nQuarterly...
1,Dollar gains on Greenspan speech\n\nThe dollar...
2,Yukos unit buyer faces loan claim\n\nThe owner...
3,High fuel prices hit BA's profits\n\nBritish A...
4,Pernod takeover talk lifts Domecq\n\nShares in...
5,Japan narrowly escapes recession\n\nJapan's ec...
6,Jobs growth still slow in the US\n\nThe US cre...
7,"India calls for fair trade rules\n\nIndia, whi..."
8,Ethiopia's crop production up 24%\n\nEthiopia ...
9,Court rejects $280bn tobacco case\n\nA US gove...


### Defining the preprocessing function for our texts

In [113]:
import string
import re
def text_preprocessor(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Pipeline: flatten newlines, strip digits, lower-case, tokenise, drop
    punctuation-only tokens and English stopwords, then Snowball-stem and
    WordNet-lemmatise the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document (or search query) text.

    Returns
    -------
    str
        The surviving normalised tokens joined by single spaces.
    """
    stemmer_ss = SnowballStemmer("english")
    wnl = WordNetLemmatizer()
    # Fetch the stopword list once per call and keep it as a set, instead of
    # calling stopwords.words() again for every token.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    text = text.replace("\n", " ")
    text = re.sub(r"[0-9]", "", text)
    text = text.lower()

    tokens = word_tokenize(text)
    # Filter BEFORE stemming/lemmatising: the stopword list holds surface
    # forms, so mangled forms (e.g. "very" -> "veri", "was" -> "wa") would
    # slip through a filter applied afterwards.
    # A token is dropped as punctuation only when every character is
    # punctuation; this also catches multi-character tokens such as the
    # tokenizer's `` and '' quote marks.
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]
    tokens = [wnl.lemmatize(stemmer_ss.stem(token)) for token in tokens]

    return ' '.join(tokens)
    
    

### Applying the function

In [114]:
# Replace each raw document with its preprocessed form.
# NOTE: this overwrites the column, so re-running the cell preprocesses the
# already-processed text again; rebuild documents_df first if re-running.
documents_df['documents'] = documents_df['documents'].map(text_preprocessor)

### Vocab creation for all the documents

In [115]:
# Build one token-frequency table per preprocessed document; vocabs[i]
# corresponds to row i of documents_df.
from nltk import FreqDist
sentences = documents_df['documents']
vocabs = [FreqDist(sentence.split()) for sentence in sentences]

### Utility function to get the top 5 articles for a search query

In [116]:
def returnTop5(sentence):
    """Print the (up to) five articles that best match a search query.

    Each document is scored by how many of the query's tokens appear in its
    vocabulary; repeated query tokens count once per occurrence. The raw
    texts of the highest-scoring documents are printed, best match first.

    Parameters
    ----------
    sentence : str
        Free-text search query.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Normalise the query with the same pipeline as the indexed documents;
    # otherwise capitalised, inflected or quoted query words never match the
    # stemmed, lower-cased vocabulary.
    query_terms = text_preprocessor(sentence).split()

    # Size the score table from the corpus rather than a hard-coded count.
    match_counts = np.zeros(len(vocabs), dtype=int)
    for term in query_terms:
        for doc_index, vocab in enumerate(vocabs):
            if term in vocab:
                match_counts[doc_index] += 1

    # A stable sort on negated scores gives descending order and breaks ties
    # deterministically in favour of the earlier document.
    top5 = np.argsort(-match_counts, kind="stable")[:5]

    print("These are the top 5 results based on your search query")
    print("")
    # Iterate over what was actually found, so fewer than five documents
    # does not raise an IndexError.
    for rank, doc_index in enumerate(top5, start=1):
        print("================")
        print("Article No: {}".format(rank))
        print(documents[doc_index])


### Test

In [117]:
# Interactive smoke test: prompt for a query on stdin, then print the
# best-matching articles.
print("Enter your search query")
search = input()
# Blank line separating the echoed query from the results.
print()
returnTop5(search)

Enter your search query
"Ms Bradburn said there was a big queue at the start of the sale but it had now gone down"

These are the top 5 results based on your search query

Article No: 1
Artists' secret postcards on sale

Postcards by artists including Damien Hirst and Tracey Emin have sold just hours after the opening of the Royal Academy of Arts annual Secrets sale.

The identity of the artist remains unknown until each work is bought and the signature is revealed on the back. "There are still some big names left, such as Mario Testino," said RCA spokeswoman Sue Bradburn. All postcards are priced at £35. The sale opened at 8am on Friday and will close at 6pm on Saturday. Ms Bradburn said there was a big queue at the start of the sale but it had now gone down.

She said the people that had bought the famous name postcards had arrived early and had spent time studying each work. "They would have known what to look for." The exhibition has been open for viewing since 19 November. Film di