In [1]:
#Import necessary packages
from sklearn.metrics import pairwise_distances
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import  NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

#Import created pipeline class
from nlp_pipeline import nlp_pipeline

In [26]:
#Load in pickled objects
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files when they come from a trusted source.

# Context managers make sure each file handle is closed as soon as its
# object has been read.
with open('full_news_docs','rb') as docs_file:
    full_news_docs = pickle.load(docs_file)
with open('topic_model','rb') as model_file:
    topic_model = pickle.load(model_file)

In [27]:
#Create dataframe with article sentiments

analyzer = SentimentIntensityAnalyzer()

# One (title, compound score) record per entry of full_news_docs, in iteration
# order. Building the rows from a list rather than a title-keyed dict keeps
# repeated titles as separate rows, so row i still describes the i-th entry
# (article_opposite looks rows up by position with .iloc).
# NOTE(review): this assumes topic_model.topics has one row per entry of
# full_news_docs, in the same order -- confirm against the pipeline.
sentiment_records = [
    (title, analyzer.polarity_scores(title)['compound'])
    for title in full_news_docs
]

# Title -> compound score lookup, retained in case other cells reference it.
sent_score_dict = dict(sentiment_records)

# Passing the column names to the constructor also yields a valid (empty)
# two-column frame when full_news_docs has no entries.
sent_score_df = pd.DataFrame(sentiment_records, columns=['article','vader_compound'])

#Add general categories
def compound_sorter(score):
    """Collapse a compound sentiment score into a coarse sentiment label.

    Parameters
    ----------
    score : number
        Compound score to classify.

    Returns
    -------
    int or None
        -1 for a negative score, 0 for a score of exactly zero, 1 for a
        positive score. A value that compares false against zero in every
        direction (e.g. NaN) yields None.
    """
    if score < 0:
        return -1
    if score == 0:
        return 0
    if score > 0:
        return 1
    # None of the comparisons held (e.g. NaN): no label.
    return None

# Label every article's compound score with compound_sorter and store the
# result in a new 'sentiment' column.
sent_score_df['sentiment'] = sent_score_df['vader_compound'].map(compound_sorter)

In [28]:
#Recommender function

def article_opposite(input_query):
    """Recommend the closest articles whose sentiment differs from the query's.

    The query is labelled with compound_sorter (-1 negative, 0 neutral,
    1 positive). Articles are then scanned from smallest to largest cosine
    distance between the query's topic vector and topic_model.topics, and the
    first article found for each of the two *other* sentiment labels is kept.

    Parameters
    ----------
    input_query
        Passed unchanged to both SentimentIntensityAnalyzer.polarity_scores
        and topic_model.transform_new.

    Returns
    -------
    list
        Positional row indices into sent_score_df, in the order they were
        encountered (nearest first). Normally two entries; fewer (possibly
        none) when the collection holds no article with one of the wanted
        sentiments. A list is always returned, never None.

    Notes
    -----
    NOTE(review): assumes row i of topic_model.topics corresponds to row i
    of sent_score_df -- confirm against how both were built.
    """
    all_sentiments = {-1, 0, 1}

    #Find sentiment
    analyzer = SentimentIntensityAnalyzer()
    new_sentiment = compound_sorter(analyzer.polarity_scores(input_query)['compound']) #-1 negative, +1 positive
    if new_sentiment not in all_sentiments:
        # The query could not be labelled, so there is no "opposite" to find.
        return []

    # The two labels that differ from the query's own sentiment.
    wanted_sentiments = all_sentiments - {new_sentiment}

    #Find topic
    new_topic = topic_model.transform_new(input_query)
    # argsort orders article positions by increasing distance; row 0 is the
    # ranking for the query vector.
    ranked_articles = pairwise_distances(new_topic,topic_model.topics,metric='cosine').argsort()[0]

    # Pull the label column out once instead of materialising a full row
    # with .iloc for every candidate.
    article_sentiments = sent_score_df['sentiment'].to_numpy()

    articles_to_return = []
    for article in ranked_articles:
        article_sentiment = article_sentiments[article]
        if article_sentiment in wanted_sentiments:
            articles_to_return.append(article)
            wanted_sentiments.discard(article_sentiment)
            if not wanted_sentiments:
                # Both other sentiments are covered; stop scanning.
                break

    # Returned unconditionally so that running out of candidates (or finding
    # the second match on the very last candidate) still gives the caller a list.
    return articles_to_return

In [30]:
#Test run

# Ask for recommendations for a sample query, then look up the titles of the
# first two returned row positions and print them.
recommended_articles = article_opposite(['vaccines'])
article_1 = sent_score_df['article'].iloc[recommended_articles[0]]
article_2 = sent_score_df['article'].iloc[recommended_articles[1]]
print('Article 1: {}'.format(article_1))
print('Article 2: {}'.format(article_2))

[ 9129 56185 56186 ... 40576 40665     0]
60438
Article 1: A COVID-19 Hero
Article 2: Graphic Australian Covid vaccine advertisement sparks outrage


In [31]:
#Pickle objects for easier transition to web app

# The context manager flushes and closes the file once the dump completes,
# so the pickle on disk is complete before anything else reads it.
with open('sent_score_df','wb') as sent_score_file:
    pickle.dump(sent_score_df, sent_score_file)