In [91]:
import math
import os

import nltk
import numpy as np
from gensim.models import KeyedVectors
from nltk.sentiment import util
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Location of the word-vector file (judging by the file name, presumably GloVe
# 6B/100d vectors converted to word2vec text format).  Set the WORDVEC_PATH
# environment variable to point at your own copy; the fallback is the path
# that was previously hard-coded, so existing setups keep working unchanged.
WORDVEC_PATH = os.environ.get(
    'WORDVEC_PATH',
    r'C:\Users\Barry Li\Documents\nlp\project\vec\glove.6B.100d.w2v.txt',
)

# binary=False: the file is read as a text-format word2vec file.
model = KeyedVectors.load_word2vec_format(WORDVEC_PATH, binary=False)

In [None]:
# Analogy-style query against the loaded vectors: words close to 'woman' and
# 'king' but far from 'man', top 5 results.  Presumably a sanity check that the
# embeddings loaded correctly - verify the result on a fresh run.
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

In [138]:
# Sample passage that is scored later together with the other test strings;
# it contains a negation ("not the usual").
test = 'Wow. This is definitely not the usual North Korea bluffing. Consider me cautiously optimistic.'

def neg_tokenize(text):
    """Split ``text`` into NLTK word tokens and tag negated ones.

    The token list produced by ``nltk.word_tokenize`` is passed through
    ``nltk.sentiment.util.mark_negation``, which (per the NLTK documentation)
    appends ``_NEG`` to tokens that follow a negation word.

    text [str]: raw passage to tokenize.

    Returns the list of (possibly ``_NEG``-suffixed) tokens.
    """
    return util.mark_negation(nltk.word_tokenize(text))

In [137]:
# Seed words for the positive pole of the lexicon.  The query below displays
# their nearest neighbours in the embedding space for a quick visual check.
positive_keywords = ['hopeful', 'sincere', 'calm', 'attitude']
model.most_similar(positive=positive_keywords)

[('confident', 0.7792400121688843),
 ('optimistic', 0.7275913953781128),
 ('cautious', 0.7191028594970703),
 ('hope', 0.7072612047195435),
 ('satisfied', 0.6978720426559448),
 ('feeling', 0.6955770254135132),
 ('good', 0.6901206970214844),
 ('feel', 0.6896106004714966),
 ('pleased', 0.6865102648735046),
 ('happy', 0.6839694976806641)]

In [136]:
# Seed words for the negative pole of the lexicon.  An earlier, broader seed
# list (scared, anxious, stressful, alert, emotion, ...) used to be assigned
# first and was immediately overwritten by this one; that dead assignment has
# been removed.
negative_keywords = ['cautious', 'untrusting', 'fearful', 'attitude']
model.most_similar(positive=negative_keywords)

[('wary', 0.76902174949646),
 ('anxious', 0.7585034370422363),
 ('hesitant', 0.7413669228553772),
 ('timid', 0.7065054774284363),
 ('impatient', 0.7036861181259155),
 ('ambivalent', 0.6842986941337585),
 ('skeptical', 0.6806875467300415),
 ('apprehensive', 0.679796576499939),
 ('skittish', 0.6772457957267761),
 ('pessimistic', 0.6748073101043701)]

In [145]:
# Expand each seed list to its 200 nearest neighbours in the embedding space.
pos_200 = model.most_similar(positive=positive_keywords, topn=200)
neg_200 = model.most_similar(positive=negative_keywords, topn=200)

# Unigram lexicon: word -> signed weight (similarity for the positive pole,
# negated similarity for the negative pole).
uni_features = {}
for word, sim in pos_200:
    uni_features[word] = uni_features.get(word, 0) + sim

for word, sim in neg_200:
    score = uni_features.get(word, 0)
    # An existing weight survives only when it is strictly larger than the
    # word's similarity to the negative seeds; otherwise the word turns negative.
    uni_features[word] = score if score > sim else -1 * sim

# Remove English stop words from the lexicon.
for stopword in ENGLISH_STOP_WORDS:
    uni_features.pop(stopword, None)

print('pos', sum(1 for weight in uni_features.values() if weight > 0))
print('neg', sum(1 for weight in uni_features.values() if weight < 0))

pos 161
neg 159


In [146]:
class SentAnalyzer():
    """Lexicon-based sentiment scorer.

    Each passage is turned into a bag-of-words count vector over a fixed
    vocabulary and scored as the dot product of those counts with per-token
    weights.  For every feature ``w`` a mirrored ``w_NEG`` feature with the
    opposite sign is added, so tokens that the tokenizer tags as negated
    contribute with flipped polarity.
    """

    def __init__(self, features):
        """Build the vocabulary, the vectorizer and the weight vector.

        features [dict]: maps a token to its signed sentiment weight.  The
            mapping itself is not modified.
        """
        # Start from the negated variants and overlay the originals, so that an
        # explicitly supplied entry wins if the two key sets ever collide.
        lexicon = self.duplicate_negation(features)
        lexicon.update(features)
        self.features = lexicon
        # Column index of every token in the count matrix.
        self.idxs = {feat: i for i, feat in enumerate(lexicon)}
        self.vectorizer = CountVectorizer(vocabulary=self.idxs, tokenizer=neg_tokenize)
        # The indices follow the dict's iteration order, so its values in that
        # same order are already aligned with the vectorizer's columns.
        self.weights = np.array(list(lexicon.values()))

    def duplicate_negation(self, features):
        """Return a new dict holding a sign-flipped ``<token>_NEG`` entry per feature.

        features [dict]: maps a token to its signed sentiment weight.
        """
        return {
            '{}_NEG'.format(feat): -1 * score for feat, score in features.items()
        }

    def normalize(self, score, alpha=15):
        """Normalizing function as described by VADER Sentiment analyzer.

        Computes ``score / sqrt(score**2 + alpha)``; works element-wise on
        NumPy arrays as well as on scalars.

        score [float | array]: raw polarity value(s).
        alpha [float]: added under the square root; larger values flatten the curve.
        """
        return score / np.sqrt((score * score) + alpha)

    def polarize(self, passages):
        """Calculate the raw polarity of each passage.

        passages [list of str | str]: raw, untokenized passages; tokenization
            is done by the vectorizer.  A single string is treated as one
            passage.

        Returns a 1-D array with one score per passage.
        """
        # A bare string is an iterable of characters, not of documents; wrap it
        # so that it is scored as exactly one passage.
        if isinstance(passages, str):
            passages = [passages]
        bow_vec = self.vectorizer.transform(passages)
        return bow_vec.dot(self.weights)

    def predict(self, passage, normalized=False):
        """Score one passage or a list of passages.

        passage [str | list of str]: passage(s) to score.
        normalized [bool]: when True, pass the scores through ``normalize``
            (with its default alpha).  The default returns the raw polarity,
            exactly as before this flag existed.

        Returns a 1-D array with one score per passage.
        """
        scores = self.polarize(passage)
        return self.normalize(scores) if normalized else scores
    
# Hand-picked sample passages used as a smoke test for the scorer.
test2 = "I'm still really skeptical myself. This is all just too good to be true. \nWhat the fuck is Kim actually planning? How the fuck is North Korea going to unify with South Korea when NK has the most fucked up and brainwashed population in existence?"
test3 = "I'm really optimistic. I hope this succeeds."
test4 = "Is it really happening? There is no obvious reason to believe him and this is all happening so fast? Could something amazing actually be happening here?"
test5 = "Got the warning on my phone. I was just talking to friends on discord then heard the weird note from my phone that I hear for the first time. I checked it says missile. I was terrified checking every news source and calling friends. Then got another saying it passed over. That was terrifying."
test6 = "I've been living in northern Japan for some time and have never seen this kind of response to a missile launch before. The alerts, the news broadcasts, it was not taken lightly. Regardless of where the missile ended up, I think the message it sent is one that will have some lasting implications on how we perceive North Korea's aggression. It's only a matter of time before they end their war with Poseidon and make the mistake of targeting actual human lives. Stay safe out there, everyone."
test7 = "Their targeting systems and missiles are far from foolproof. A missile failure could have dropped this missile on an inhabited area in Japan. That would likely have provoked a full scale war. The level of irresponsibility here is just staggering."
test8 = "Man, that must be scary as shit. The other day I just arrived in Brussels and sat down in the Grand Place (a big open square, right in the center of Brussels) and then got a BBC alert 'Machete-wielding man attacks soldiers in central Brussels'. My heart skipped a beat. It was just a couple of streets away. I felt kinda sick. I did hear police sirens earlier but didn't think much of it. There were also soldiers and police all over the place."

# Build the analyzer from the unigram lexicon and score all samples in one batch.
SA = SentAnalyzer(uni_features)
sample_passages = [test, test2, test3, test4, test5, test6, test7, test8]
print(SA.predict(sample_passages))

[ 0.79297072  1.82907706  2.68077487 -1.13360691  0.59415591 -0.01073837
  0.         -0.00861579]
