### Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to /home/ananthan2k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ananthan2k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ananthan2k/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Read from file

In [8]:
files = os.listdir('./data')
files.sort()


In [9]:
files = files[0:5]
print(files)

['001.txt', '002.txt', '003.txt', '004.txt', '005.txt']


In [10]:
# store each of the text in an array
doc = []

for file in files:
    with open('./data/{}'.format(file)) as fd:
        # each line in the file is read
        lines = fd.readlines()
        # each line in the file is append to doc_lines
        doc_lines = ""
        for line in lines:
            doc_lines+=line
        doc.append(doc_lines)

In [11]:
docs = np.array(doc)
print(docs)

['Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to si

In [15]:
docs_df = pd.DataFrame({"texts":docs})
docs_df["texts"]

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: texts, dtype: object

In [16]:
import string
import re

def preProcessing(text):
    stemm_snow = SnowballStemmer("english")
    lem = WordNetLemmatizer()
    text = text.replace("\n", " ")
    text = re.sub(r"[0-9]", "", text)
    text = text.lower()
    text = word_tokenize(text)
    text = [stemm_snow.stem(word) for word in text]
    text = [lem.lemmatize(word) for word in text]
    text = [char for char in text if char not in string.punctuation]
    text = [word for word in text if word not in stopwords.words('english')]
    text = ' '.join(text)
    
    return text

In [17]:
docs_df['texts'] = docs_df['texts'].apply(preProcessing)

In [18]:
sentences = docs_df['texts']
from nltk.probability import FreqDist
vocabs = []
for sentence in sentences:
    word_dist = FreqDist()
    # update with news words frequency on each iteration
    word_dist.update(sentence.split())
    vocabs.append(word_dist)

In [26]:
print(vocabs[1].keys())

dict_keys(['dollar', 'gain', 'greenspan', 'speech', 'ha', 'hit', 'highest', 'level', 'euro', 'almost', 'three', 'month', 'feder', 'reserv', 'head', 'said', 'u', 'trade', 'deficit', 'set', 'stabilis', 'alan', 'highlight', 'govern', "'s", 'willing', 'curb', 'spend', 'rise', 'household', 'save', 'factor', 'may', 'help', 'reduc', 'late', 'new', 'york', 'reach', 'thursday', 'market', 'concern', 'greenback', 'recent', 'friday', 'chairman', 'mr', 'london', 'ahead', 'meet', 'g', 'financ', 'minist', 'sent', 'higher', 'earlier', 'tumbl', 'back', 'worse-than-expect', 'job', 'data', '``', 'think', 'take', 'much', 'sanguin', 'view', 'current', 'account', 'taken', 'time', "''", 'robert', 'sinch', 'currenc', 'strategi', 'bank', 'america', 'longer-term', 'lay', 'condit', 'improv', 'year', 'next', 'worri', 'china', 'howev', 'remain', 'peg', 'sharp', 'fall', 'therefor', 'made', 'chine', 'export', 'price', 'high', 'competit', 'call', 'shift', 'beij', 'polici', 'fallen', 'deaf', 'ear', 'despit', 'comment'

In [33]:
def Top5(sentence):
    match_list = [0]*5
    for word in sentence.split(" "):
        for i in range(len(vocabs)):
            if word in vocabs[i].keys():
                match_list[i]+=1
    print(match_list)
    top5 = np.array(match_list).argsort()[::-1]    
    print(top5)
    print("These are the top 5 results based on your search query")
    print("")
    for index in range(4,-1,-1):
        print("================")
        print("Article No: {}".format(5 - index))
        print(docs[top5[index]])

In [34]:
print("Enter your search query")
search = input()
print()
Top5(search)

Enter your search query
What is the rate of job

[0, 2, 0, 1, 0]
[1 3 4 2 0]
These are the top 5 results based on your search query

Article No: 1
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet adve