In [1]:
import pandas as pd
from newspaper import Article
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from collections import Counter,OrderedDict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [30]:
f = open("news articles/articles.txt","r")
LOA = [line.rstrip() for line in f]
df = pd.DataFrame(columns = ['Title','Text','URL','Cleaned_text'])
for url in LOA: 
    try:
        news_article = Article(url, language="en")     
        news_article.download() 
        news_article.parse() 
        news_article.nlp() 
        df2={'Title':news_article.title,'Text':news_article.text,'URL':url}
        df=df.append(df2,ignore_index=True)
    
    except Exception:
        print(url)
        
        
df.to_csv("news articles/parsed_news_articles.csv",encoding="utf-8")

https://www.ndtv.com/india-news/unemployment-rate-highest-in-45-years-reveals-stalled-report-on-jobs-10-points-1985931
https://www.ndtv.com/business/13-million-jobs-created-in-2017-member-of-economic-advisory-council-1882942




In [31]:
def preprocessor(df):
    lemmatizer = WordNetLemmatizer() 
    stop_words = stopwords.words('english')
    text=df['Text']
    titles=df['Title']
    cleanedArticles=[]
    count=0
    spcl= '[-,;@_!#$%^&*()<>?/\|}{~:''.+]'
    for i in text:
        i = i.lower()
        i = re.sub(r'[^\x00-\x7F]+',' ', i)
        i = re.sub('^a-z0-9',' ',i)
        i = re.sub(spcl,' ',i)
        i = re.sub('[0-9]',' ',i)
        i = re.sub('\n\n',' ',i)
        i = re.sub('\s\s',' ',i)
        v = i.split(' ')
        v = [word for word in v if word not in stop_words]
        v = [lemmatizer.lemmatize(word) for word in v]
        temp = ' '.join(v)
        df['Cleaned_text'][count]=temp
        cleanedArticles.append(temp)
        count = count + 1
    return cleanedArticles

In [32]:
cleanedArticles=preprocessor(df)

In [44]:
print(len(cleanedArticles))

713


In [43]:
print (df['Cleaned_text'])

0      since employment unemployment figure always es...
1      suggested india unemployment woe solved boosti...
2      deeper level acceptance self exploitation form...
3      much labour force shifting agriculture        ...
4      amazon invest usd  bn digitising indian smbs j...
5      modi government assumed problem crystallised t...
6      know nature incidence unemployment example dif...
7      investment measured gross fixed capital format...
8      state wise breakup year age group showed femal...
9       slow growth scarcity non farm job rising open...
10     unemployment rate india rose   percent februar...
11     said limit big industry offer job well service...
12     came first editorial sena mouthpiece saamana p...
13     industry sick youth looking govt job industry ...
14     working class may share concern even vote acco...
15     decline actual number occurred first time sinc...
16     prime minister narendra modi attends first day...
17     clear charge policy chan

In [35]:
#Testing
f = open("news articles/test_articles.txt","r")
LOA = [line.rstrip() for line in f]
test_df = pd.DataFrame(columns = ['Title','Text','URL','Cleaned_text'])
for url in LOA: 
    news_article = Article(url, language="en")     
    news_article.download() 
    news_article.parse() 
    news_article.nlp() 
    df2={'Title':news_article.title,'Text':news_article.text}
    test_df=test_df.append(df2,ignore_index=True)

test_df.to_csv("news articles/parsed_test_articles.csv",encoding="utf-8")
cleanedTestArticles=preprocessor(test_df)

In [63]:
def vectorize_sim_search(query, data):
    #Inserting query in list position 0
    cos_sim=[]
    #data = query + cleanedArticles
    tfidf_vectorizer = TfidfVectorizer()
    dataMatrixAll = []
  
    for i in range(len(query)):
        query_i = query[i]
        print("MAIN ARTICLE:\n",query_i[0:len(query_i)//20],"....","\n\n\n")
        print("RELATED ARTICLES:\n")
        data = [query[i]]+cleanedArticles
        dataMatrix = {}
        tfidf_matrix = tfidf_vectorizer.fit_transform(data)
        vals = cosine_similarity(tfidf_matrix[i], tfidf_matrix)[0]
        
        for j in range(len(data)-1):
            dataMatrix[df['URL'][j]]=vals[j+1]
        key_list = list(dataMatrix.keys()) 
        val_list = list(dataMatrix.values()) 
        sorted_vals = sorted(vals,reverse=True)[1:5]
        for i in sorted_vals:
            temp=key_list[val_list.index(i)]
            print(temp,"....","\tsimilarity score:",i,"\n")
        print("-------------------------------------------------------------------------------------------")

In [37]:
print(cleanedTestArticles)

['nsso\'s annual household survey     first demonetisation   highlight unemployment rate       highest  year reveals survey report released despite statistical body go ahead allege ex member survey centre row quitting  member statistical body  point cheat sheet big story according report made public accessed business standard unemployment rate highest level since     report say     unemployment rate stood   per cent youth unemployment "astronomically high" level   per cent joblessness higher urban area   per cent rural area   per cent  people withdrawing workforce labour force participation rate lower previous year nsso\'s annual household survey     first november   demonetisation prime minister narendra modi declared overnight ban high value note report core controversy quitting two member national statistical commission including acting chairman pc mohanan pc mohanan confirmed one reason quit delay release national sample survey office\'s periodic labour force survey report pc mohan

In [64]:
vectorize_sim_search(cleanedTestArticles,cleanedArticles)

MAIN ARTICLE:
 nsso's annual household survey     first demonetisation   highlight unemployment rate   .... 



RELATED ARTICLES:

http://www.jantakareporter.com/india/unemployment-rate-in-india-highest-for-45-years-reason-why-modi-government-refused-to-release-nsso-report/229882/ .... 	similarity score: 0.4853332211066227 

https://www.thestatesman.com/india/nsso-job-report-high-unemployment-rate-not-final-niti-aayog-1502729133.html .... 	similarity score: 0.44440793210561436 

https://scroll.in/latest/925501/centre-releases-report-that-showed-unemployment-rose-to-45-year-high-of-6-1-in-2017-18 .... 	similarity score: 0.41614316831116593 

https://www.deccanchronicle.com/business/in-other-news/310119/unemployment-rate-highest-in-45-years-shows-centres-buried.html .... 	similarity score: 0.40584870713619015 

-------------------------------------------------------------------------------------------
