In [35]:
#Import necessary packages
from sklearn.metrics import pairwise_distances
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import  NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import numpy as np

#Import created pipeline class
from nlp_pipeline import nlp_pipeline

In [31]:
#Load in pickled objects
#NOTE: pickle.load can execute arbitrary code - only load files this project produced itself
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

news_df = _load_pickle('news_df')
word2vec_model = _load_pickle('word2vec_model')
doc_vectors = _load_pickle('doc_vectors')

In [10]:
#Score every article title with VADER and keep the compound score per title
news_titles = news_df['title'].to_list()

analyzer = SentimentIntensityAnalyzer()

#Keyed by title, so a title that appears more than once ends up as a single entry.
#NOTE(review): the recommender looks rows up by position taken from the doc_vectors
#ranking - confirm titles in news_df are unique so the positions stay aligned.
sent_score_dict = {
    headline: analyzer.polarity_scores(headline)['compound']
    for headline in news_titles
}

#One row per title: the title moves out of the index into the first column
sent_score_df = pd.DataFrame.from_dict(sent_score_dict, orient='index').reset_index()
sent_score_df.columns = ['article','vader_compound']

#Add general categories
def compound_sorter(score):
    """Bucket a compound sentiment score by its sign.

    Returns -1 for a negative score, 0 for a score equal to zero and 1 for a
    positive score. A value that satisfies none of the comparisons (e.g. NaN)
    yields None.
    """
    if score < 0:
        return -1
    if score == 0:
        return 0
    if score > 0:
        return 1
    return None

#Label each article with the sign bucket (-1 / 0 / 1) of its compound score
sent_score_df['sentiment'] = sent_score_df['vader_compound'].map(compound_sorter)

In [19]:
#Function to clean/tokenize words
def clean_and_tokenize(document):
    """Split `document` into tokens and normalise each one.

    Every token produced by word_tokenize has digits and non-word characters
    removed and is lower-cased. Underscores are not matched by the pattern and
    are therefore kept. Tokens that end up empty (pure punctuation or numbers)
    are dropped.

    Args:
        document: Text to tokenize.

    Returns:
        List of cleaned, lower-cased tokens in their original order.
    """
    cleaned_words = []
    for word in word_tokenize(document):
        #Raw string: \d and \W must reach the regex engine untouched; the non-raw
        #form is an invalid escape sequence and triggers a warning at compile time
        low_word = re.sub(r'[\d\W]', '', word).lower()
        if low_word:
            cleaned_words.append(low_word)
    return cleaned_words

#Function to create document vectors
#Module-level log of tokens that were missing from the model vocabulary;
#it grows on every call to vectorize_document and is never cleared here
not_in_model = []

def vectorize_document(cleaned_title_words, model):
    """Average the model's vectors for the given tokens.

    Tokens found in `model.wv.vocab` contribute `model[token]` to the average;
    all other tokens are appended to the module-level `not_in_model` list and
    skipped.

    Args:
        cleaned_title_words: Iterable of tokens.
        model: Object exposing `wv.vocab` membership and `model[token]` lookup.
            NOTE(review): presumably a gensim (pre-4.0) word2vec object - confirm.

    Returns:
        The element-wise mean (axis 0) of the collected vectors. When no token
        is in the vocabulary the mean is taken over an empty list, which numpy
        evaluates to NaN.
    """
    word_vectors = []
    for word in cleaned_title_words:
        if word not in model.wv.vocab:
            not_in_model.append(word)
            continue
        word_vectors.append(model[word])
    return np.mean(word_vectors, axis=0)

In [37]:
#Recommender function

def article_opposite(input_query, model, doc_vectors):
    """Recommend the closest articles whose sentiment differs from the query's.

    The query is bucketed into a sentiment of -1, 0 or 1 and turned into a
    document vector. Articles are then walked from smallest to largest cosine
    distance, and the first article found for each of the two *other*
    sentiment buckets is kept.

    Relies on the module-level `sent_score_df`: its 'sentiment' column is read
    by position, so row i must describe the same article as `doc_vectors[i]`.

    Args:
        input_query: Text (e.g. a headline) to find contrasting articles for.
        model: Word-vector model passed through to `vectorize_document`.
        doc_vectors: 2-D array-like of document vectors to rank against.

    Returns:
        List of positional indices into `doc_vectors` / `sent_score_df`, in
        order of increasing distance. Holds two entries when both other
        sentiment buckets are represented, fewer (possibly none) otherwise.

    Raises:
        ValueError: If no finite topic vector can be built for the query,
            e.g. because none of its tokens are in the model vocabulary.
    """
    #Find sentiment
    analyzer = SentimentIntensityAnalyzer()
    new_sentiment = compound_sorter(analyzer.polarity_scores(input_query)['compound']) #-1 negative, 0 neutral, +1 positive

    #Find topic
    new_topic = np.asarray(vectorize_document(clean_and_tokenize(input_query), model))
    if not np.all(np.isfinite(new_topic)):
        #An all-unknown query averages to NaN; fail here with a readable message
        #instead of an opaque validation error inside pairwise_distances
        raise ValueError('Could not build a finite topic vector for input_query '
                         '(no tokens found in the model vocabulary?)')
    new_topic = new_topic.reshape(1, -1)

    #Positions of all documents, nearest first
    ranked_articles = pairwise_distances(new_topic, doc_vectors, metric='cosine').argsort()[0]

    #The two sentiment buckets the query does NOT belong to
    wanted_sentiments = {-1, 0, 1} - {new_sentiment}
    article_sentiments = sent_score_df['sentiment']

    articles_to_return = []
    for article in ranked_articles:
        article_sentiment = article_sentiments.iloc[article]
        if article_sentiment in wanted_sentiments:
            articles_to_return.append(article)
            #Only the closest article per bucket is kept
            wanted_sentiments.discard(article_sentiment)
            if not wanted_sentiments:
                break

    #Always return the list, even when the ranking ran out before both
    #buckets were filled (previously this path fell through and returned None)
    return articles_to_return

In [41]:
#Test run
#Smoke test: look up contrasting articles for one sample headline and print their titles

query_headline = 'Wives of alleged Haiti assassins left in the dark, desperate for word or to repatriate bodies'
recommended_articles = article_opposite(query_headline, word2vec_model, doc_vectors)

#The recommender returns row positions; translate them back into article titles
article_column = sent_score_df['article']
article_1 = article_column.iloc[recommended_articles[0]]
article_2 = article_column.iloc[recommended_articles[1]]

print('Article 1: {art1}'.format(art1 = article_1))
print('Article 2: {art2}'.format(art2 = article_2))

Article 1: How supply-chain innovation can bolster U.S. security
Article 2: U.S. sending FBI, DHS law enforcement to Haiti


  if token in model.wv.vocab:


In [31]:
#Pickle objects for easier transition to web app

#Context manager guarantees the file is flushed and closed once the dump finishes
with open('sent_score_df','wb') as sent_score_file:
    pickle.dump(sent_score_df, sent_score_file)