In [1]:
import pandas as pd
from newspaper import Article
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
f = open("news articles/articles.txt","r")
LOA = [line.rstrip() for line in f]
df = pd.DataFrame(columns = ['Title','Text'])
for url in LOA: 
    news_article = Article(url, language="en")     
    news_article.download() 
    news_article.parse() 
    news_article.nlp() 
    df2={'Title':news_article.title,'Text':news_article.text}
    df=df.append(df2,ignore_index=True)

df.to_csv("news articles/parsed_news_articles.csv",encoding="utf-8")

In [3]:
def preprocessor(df):
    lemmatizer = WordNetLemmatizer() 
    stop_words = stopwords.words('english')
    text=df['Text']
    titles=df['Title']
    cleanedArticles=[]
    spcl= '[-,;@_!#$%^&*()<>?/\|}{~:''.+]'
    for i in text:
        i = i.lower()
        i = re.sub(r'[^\x00-\x7F]+',' ', i)
        i = re.sub('^a-z0-9',' ',i)
        i = re.sub(spcl,' ',i)
        i = re.sub('[0-9]',' ',i)
        i = re.sub('\n\n',' ',i)
        i = re.sub('\s\s',' ',i)
        v = i.split(' ')
        v = [word for word in v if word not in stop_words]
        v = [lemmatizer.lemmatize(word) for word in v]
        temp = ' '.join(v)
        cleanedArticles.append(temp)
    return cleanedArticles

In [4]:
cleanedArticles=preprocessor(df)

In [8]:
cleanedArticles

['since employment unemployment figure always estimated considering principal subsidiary status employment bd employment estimate based principal status incomplete misleading  representational image since employment unemployment figure always estimated considering principal subsidiary status employment bd employment estimate based principal status incomplete misleading  representational image report paper ie november  various employment estimate     based plf periodic labour force survey data received considerable attention received several query policy maker regarding difference employment estimate study laveesh bhandari amaresh dubey bd  himanshu quoted report article clarify reason difference highlight major finding study first estimate different bd based usual principal status ups employment study based usual principal subsidiary status upss employment since himanshu estimate also based upss figure much different except     plf figure himanshu estimate employment         almost neg

In [5]:
#Testing
f = open("news articles/test_articles.txt","r")
LOA = [line.rstrip() for line in f]
test_df = pd.DataFrame(columns = ['Title','Text'])
for url in LOA: 
    news_article = Article(url, language="en")     
    news_article.download() 
    news_article.parse() 
    news_article.nlp() 
    df2={'Title':news_article.title,'Text':news_article.text}
    test_df=test_df.append(df2,ignore_index=True)

test_df.to_csv("news articles/parsed_test_articles.csv",encoding="utf-8")
cleanedTestArticles=preprocessor(test_df)

In [9]:
def vectorize_sim_search(query, data):
    """
    query: `str`
    data: `list` of strings
    stop_words: `list` of stop words (default is "english" from SKlearn)
    
    Returns:
    cos_sim: first element from `cosine_similarity` array
    tfidf_vectorizer: `TfidfVectorizer object`
    data: updated `data` with `query` in position 0
    """
    #Inserting query in list position 0
    cos_sim=[]
    data = query + cleanedArticles
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    for i in range(len(query)):
        val = cosine_similarity(tfidf_matrix[i], tfidf_matrix)
        val_test = val[0:len(query)-1]
        val_train = val[len(query):len(data)]
            
        cos_sim.append(val)
    return cos_sim

In [7]:
cos_sim=vectorize_sim_search(cleanedTestArticles,cleanedArticles)
cos_sim

[array([[1.        , 0.30548439, 0.18277862, 0.17734823, 0.27672863,
         0.14220616, 0.12398679, 0.0991266 , 0.14394114, 0.12642703,
         0.12468664, 0.13314884, 0.13440954, 0.07858604, 0.15427149,
         0.03643391, 0.07003645, 0.09964714, 0.07929736, 0.0510094 ,
         0.05322503, 0.13762544, 0.08013122, 0.10760972, 0.1263274 ,
         0.0553735 , 0.09752202, 0.1395882 , 0.17087047, 0.11405449,
         0.18282743, 0.11047014, 0.17973017, 0.05138733, 0.11176524,
         0.10161186, 0.16456283, 0.13380753, 0.09478133, 0.08212599,
         0.07482601, 0.09478133, 0.10282336, 0.14226769, 0.07486797,
         0.11204784, 0.05748401, 0.14621115, 0.16238994, 0.13361442,
         0.14155236, 0.11138767, 0.14765135, 0.10485777, 0.08675983,
         0.07528824]]),
 array([[0.30548439, 1.        , 0.32015444, 0.23299514, 0.38200283,
         0.23175892, 0.15606056, 0.10215284, 0.17452393, 0.13546372,
         0.12132284, 0.17108243, 0.14104554, 0.12279883, 0.19914654,
         0