In [66]:
import pandas as pd
from newspaper import Article
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [58]:
f = open("news articles/articles.txt","r")
LOA = [line.rstrip() for line in f]
df = pd.DataFrame(columns = ['Title','Text'])
for url in LOA: 
    news_article = Article(url, language="en")     
    news_article.download() 
    news_article.parse() 
    news_article.nlp() 
    df2={'Title':news_article.title,'Text':news_article.text}
    df=df.append(df2,ignore_index=True)

df.to_csv("news articles/parsed_news_articles.csv",encoding="utf-8")

In [72]:
def preprocessor(df):
    lemmatizer = WordNetLemmatizer() 
    stop_words = stopwords.words('english')
    text=df['Text']
    titles=df['Title']
    cleanedArticles=[]
    spcl= '[-,;@_!#$%^&*()<>?/\|}{~:''.+]'
    for i in text:
        i = i.lower()
        i = re.sub(r'[^\x00-\x7F]+',' ', i)
        i = re.sub('^a-z0-9',' ',i)
        i = re.sub(spcl,' ',i)
        i = re.sub('[0-9]',' ',i)
        i = re.sub('\n\n',' ',i)
        i = re.sub('\s\s',' ',i)
        v = i.split(' ')
        v = [word for word in v if word not in stop_words]
        v = [lemmatizer.lemmatize(word) for word in v]
        temp = ' '.join(v)
        cleanedArticles.append(temp)
    return cleanedArticles

In [80]:
cleanedArticles=preprocessor(df)

In [81]:
#Testing
f = open("news articles/test_articles.txt","r")
LOA = [line.rstrip() for line in f]
test_df = pd.DataFrame(columns = ['Title','Text'])
for url in LOA: 
    news_article = Article(url, language="en")     
    news_article.download() 
    news_article.parse() 
    news_article.nlp() 
    df2={'Title':news_article.title,'Text':news_article.text}
    test_df=test_df.append(df2,ignore_index=True)

test_df.to_csv("news articles/parsed_test_articles.csv",encoding="utf-8")
cleanedTestArticles=preprocessor(test_df)

In [82]:
def vectorize_sim_search(query, data):
    """
    query: `str`
    data: `list` of strings
    stop_words: `list` of stop words (default is "english" from SKlearn)
    
    Returns:
    cos_sim: first element from `cosine_similarity` array
    tfidf_vectorizer: `TfidfVectorizer object`
    data: updated `data` with `query` in position 0
    """
    #Inserting query in list position 0
    cos_sim=[]
    data = query + cleanedArticles
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    for i in range(len(query)):
        val = cosine_similarity(tfidf_matrix[i], tfidf_matrix)
        cos_sim.append(val)
    return cos_sim

In [84]:
cos_sim=vectorize_sim_search(cleanedTestArticles,cleanedArticles)
cos_sim

[array([[1.        , 0.04273455, 0.06167823]])]