## Sentence reformulation

In [1]:
import nltk
import numpy as np
from gensim.models import KeyedVectors

Download FastText pretrained vectors for English: 
[cc.en.300.vec.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz)

And download Yelp! dataset composed of reviews: 
[Yelp.train.text](https://drive.google.com/file/d/1TAcfL091lKb2LipaUELFteZqJjQu-gMa/view?usp=sharing)

In [2]:
# !wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz 
# !gunzip -k cc.en.300.vec.gz

Load the downloaded pretrained FastText vectors with the gensim library:

In [3]:
# Load the FastText vectors from 'cc.en.300.vec' into `fast`; the cells below
# use `fast` for similarity, distance and nearest-neighbour lookups.
fast = KeyedVectors.load_word2vec_format('cc.en.300.vec')

Compute similarity of two words using gensim

In [4]:
# Put the "king"/"queen" similarity into context by comparing it with the
# scores for the related pairs built from "king", "queen", "man" and "woman".
word_pairs = [
    ("king", "queen"),
    ("king", "man"),
    ("king", "woman"),
    ("queen", "woman"),
    ("queen", "man"),
    ("woman", "man"),
]

for first, second in word_pairs:
    similarity = fast.similarity(first, second)
    distance = fast.distance(first, second)
    print(f'The similiarity between the vectors for the words "{first}" and "{second}" is {similarity}.')
    print(f'The distance between "{first}" and "{second}" is {distance}')
    # Printing the sum makes the relation between the two measures visible
    # (the recorded output shows 1.0 for every pair).
    print(distance + similarity)

The similiarity between the vectors for the words "king" and "queen" is 0.7069182991981506.
The distance between "king" and "queen" is 0.29308170080184937
1.0
The similiarity between the vectors for the words "king" and "man" is 0.341844379901886.
The distance between "king" and "man" is 0.658155620098114
1.0
The similiarity between the vectors for the words "king" and "woman" is 0.22299695014953613.
The distance between "king" and "woman" is 0.7770030498504639
1.0
The similiarity between the vectors for the words "queen" and "woman" is 0.3607560992240906.
The distance between "queen" and "woman" is 0.6392439007759094
1.0
The similiarity between the vectors for the words "queen" and "man" is 0.22957992553710938.
The distance between "queen" and "man" is 0.7704200744628906
1.0
The similiarity between the vectors for the words "woman" and "man" is 0.7658364176750183.
The distance between "woman" and "man" is 0.2341635823249817
1.0


We can see that the similarity between the word vectors for "queen" and "king" is high, approximately 0.71 (about 70%).
The distance between the two words is approximately 0.29; in fact, for every pair above, distance = 1 - similarity (each pair sums to 1.0).

We can also see that the distance "queen"-"woman" (0.64) is almost the same as the distance "king"-"man" (0.66), and the same holds for "queen"-"man" (0.77) and "king"-"woman" (0.78), while the distance "man"-"woman" (0.23) is close to the distance "queen"-"king" (0.29).

Sentence tokenization. Split Yelp! texts into separate tokens (words and punctuation marks) by space

In [5]:
# Read the whole Yelp corpus into `yelp_data` and split it into tokens.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open("Yelp.train.text", encoding="utf-8") as text:
    yelp_data = text.read()

# split() without an argument splits on any run of whitespace. Splitting on a
# single ' ' would keep line breaks inside tokens (the last word of one review
# glued to the first word of the next) and would produce empty tokens for
# consecutive spaces.
yelp_space_split = yelp_data.split()

Try part-of-speech tagging using the [NLTK POS-tagger](https://www.nltk.org/book/ch05.html).
The function returns a list of tuples (word, POS_tag).

In [6]:
# Tokenize the full corpus text with NLTK's word tokenizer, as an alternative
# to the whitespace split stored in `yelp_space_split`.
yelp_tokens = nltk.word_tokenize(yelp_data)

In [7]:
# Tag every whitespace-separated token with its part of speech.
pos_yelp_space = nltk.pos_tag(yelp_space_split)
# Alternative kept for reference: tag only the first 1000 NLTK tokens.
# pos_yelp_tokens = nltk.pos_tag(yelp_tokens[:1000])

Can you find the words most similar to a given word? Can you write a method that returns a list of tuples (word, similarity) in order of decreasing similarity?

In [8]:
def find_most_similar(word, n=10):
    """Return the `n` nearest neighbours of `word` in the loaded FastText vectors.

    Thin wrapper around ``fast.similar_by_word``; its result is passed back
    unchanged. The callers in this notebook treat each entry as a
    ``(word, similarity)`` pair.

    Args:
        word: The query word.
        n: How many neighbours to request (forwarded as ``topn``).
    """
    return fast.similar_by_word(word, topn=n)

Let's do the simplest reformulation task. We just want to reformulate some sentences by replacing each adjective with a similar one.

In [9]:
def reformulate_sentence(sentence):
    """Rewrite `sentence`, swapping each adjective for a similar word.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Every token tagged ``JJ``, ``JJR`` or ``JJS`` is replaced
    by the first of its 10 nearest neighbours (from ``find_most_similar``) that
    is not a case-insensitive substring of the original word. This filter skips
    trivial variants such as the same word in a different case. If no neighbour
    qualifies, or the word is missing from the vocabulary, the original word is
    kept.

    Args:
        sentence: The text to reformulate.

    Returns:
        The tokens of the reformulated sentence joined by single spaces.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    reformulated_sentence_words = []
    for word, pos_tag in tagged_tokens:
        if pos_tag in adjective_tags:
            try:
                neighbours = find_most_similar(word, n=10)
            except KeyError:
                # Out-of-vocabulary word: report it and keep the original token
                # so the sentence does not silently lose a word.
                print(f'There is no {word} word in FastText dictionary! ...')
            else:
                original_lower = word.lower()
                for neighbour in neighbours:
                    candidate = neighbour[0]
                    if candidate.lower() not in original_lower:
                        word = candidate
                        break
        reformulated_sentence_words.append(word)

    return ' '.join(reformulated_sentence_words)

## Sentiment analysis

In [10]:
import random

In [11]:
# Download the 'vader_lexicon' NLTK package, then import the VADER analyzer class.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Adam/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER sentiment classifier from NLTK library. The range of sentiment is from -1 to 1 where -1 is negative, 0 is neutral and 1 is positive

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

In [12]:
# One shared analyzer instance; later cells read the 'compound' entry of its
# polarity_scores() result.
sentiment_analyzer = SentimentIntensityAnalyzer()

Read the dataset text file line by line and put lines into the list

In [13]:
# Read the dataset line by line; each list entry is one line of the file
# (readlines() keeps the trailing newline). The encoding is stated explicitly
# so the result does not depend on the platform's default locale.
with open("Yelp.train.text", encoding="utf-8") as text:
    yelp_lines = text.readlines()

Read Yelp dataset from text file and get 1000 random sentences

In [14]:
# Draw the evaluation sample from the dataset lines.
# random.sample picks without replacement, so no line is counted twice in the
# averages below (random.choices could return the same line several times).
# A dedicated seeded generator makes the sample reproducible across kernel
# restarts; min() keeps the cell working if the file has fewer lines than
# SAMPLE_SIZE.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
yelp_sample = random.Random(SAMPLE_SEED).sample(
    yelp_lines, k=min(SAMPLE_SIZE, len(yelp_lines))
)

Compute the average sentiment of the 1000-sentence sample with the VADER sentiment classifier

In [15]:
# Mean VADER compound score of the unmodified sample. This is the baseline the
# reformulation experiments below are compared against.
# `np` is numpy, imported in the notebook's import cell.
original_compound_scores = [
    sentiment_analyzer.polarity_scores(line)['compound'] for line in yelp_sample
]
avg_sentiment = np.mean(original_compound_scores)
avg_sentiment

<IPython.core.display.Javascript object>

Reformulate the sentences and compute the average sentiment again. Try to come up with ways to make the sentences more positive on average. What about more negative? Can you come up with some interesting experiment on this data with POS-tagged reformulations?

In [16]:
def reformulate_sentence_mode(sentence, mode='pos'):
    """Rewrite `sentence`, steering its adjectives towards a sentiment.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. For every token tagged ``JJ``, ``JJR`` or ``JJS`` the 10
    nearest neighbours from ``find_most_similar`` are scored as
    ``similarity * compound``, where ``compound`` is the neighbour's VADER
    compound score. In ``'pos'`` mode the neighbour with the highest score
    above 0 replaces the adjective; in ``'neg'`` mode the one with the lowest
    score below 0 does. Neighbours that are a case-insensitive substring of the
    original adjective are ignored. If nothing qualifies, or the adjective is
    missing from the vocabulary, the original word is kept.

    Args:
        sentence: The text to reformulate.
        mode: ``'pos'`` to make the sentence more positive, ``'neg'`` to make
            it more negative.

    Returns:
        The tokens of the reformulated sentence joined by single spaces.

    Raises:
        ValueError: If `mode` is neither ``'pos'`` nor ``'neg'``.
    """
    if mode not in ('pos', 'neg'):
        raise ValueError(f"mode must be 'pos' or 'neg', got {mode!r}")

    adjective_tags = ('JJ', 'JJR', 'JJS')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    reformulated_sentence_words = []
    for word, pos_tag in tagged_tokens:
        if pos_tag in adjective_tags:
            try:
                neighbours = find_most_similar(word, n=10)
            except KeyError:
                # Out-of-vocabulary word: report it and keep the original token
                # so the sentence does not silently lose a word.
                print(f'There is no {word} word in FastText dictionary! ...')
            else:
                # Filter against the ORIGINAL adjective; `word` may be
                # overwritten with the current best candidate in the loop.
                original_lower = word.lower()
                best_score = 0
                for candidate, similarity in neighbours:
                    if candidate.lower() in original_lower:
                        continue
                    compound = sentiment_analyzer.polarity_scores(candidate)['compound']
                    score = similarity * compound
                    if (mode == 'pos' and score > best_score) or (mode == 'neg' and score < best_score):
                        best_score = score
                        word = candidate
        reformulated_sentence_words.append(word)

    return ' '.join(reformulated_sentence_words)

In [17]:
import time

In [18]:
# Try the sentiment-steered reformulation on one short sentence and report
# the elapsed wall-clock time in seconds.
now = time.time()
sentence = 'i like happy dags'
print('ROW : ', sentence)
print(sentiment_analyzer.polarity_scores(sentence))
pos_sentence = reformulate_sentence_mode(sentence, mode='pos')
print('Positive : ', pos_sentence)
print(sentiment_analyzer.polarity_scores(pos_sentence))
neg_sentence = reformulate_sentence_mode(sentence, mode='neg')
# This line shows the negative reformulation, so it is labelled accordingly
# (it was previously printed under the label 'Positive : ').
print('Negative : ', neg_sentence)
print(sentiment_analyzer.polarity_scores(neg_sentence))
print(time.time() - now)

ROW :  i like happy dags
{'neg': 0.0, 'neu': 0.139, 'pos': 0.861, 'compound': 0.7351}
Positive :  i like happier dags
{'neg': 0.0, 'neu': 0.145, 'pos': 0.855, 'compound': 0.7096}
Positive :  i like unhappy dags
{'neg': 0.444, 'neu': 0.159, 'pos': 0.397, 'compound': -0.0772}
714.8599119186401


In [19]:
from tqdm import tqdm

In [None]:
# Average VADER compound score over the sample for three variants of each
# sentence: unchanged ("Avg"), steered positive ("Pos") and steered negative
# ("Neg").
sum_compound = 0
n = len(yelp_sample)
avg = pos = neg = 0
for sentence in tqdm(yelp_sample):
    more_positive = reformulate_sentence_mode(sentence, mode='pos')
    more_negative = reformulate_sentence_mode(sentence, mode='neg')
    avg += sentiment_analyzer.polarity_scores(sentence)['compound']
    pos += sentiment_analyzer.polarity_scores(more_positive)['compound']
    neg += sentiment_analyzer.polarity_scores(more_negative)['compound']

# Turn the running totals into means.
avg, pos, neg = avg / n, pos / n, neg / n
for label, value in (('Pos : ', pos), ('Avg : ', avg), ('Neg : ', neg)):
    print(label, value)

In [None]:
def reformulate_sentence_all_option(sentence, word_similar=0.7):
    """List, for every token of `sentence`, the words that may stand in its place.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token yields a list that starts with the token
    itself. For tokens tagged ``JJ``, ``JJR`` or ``JJS`` the list also contains
    those of the 10 nearest neighbours (from ``find_most_similar``) whose
    similarity is strictly greater than `word_similar`.

    Args:
        sentence: The text to analyse.
        word_similar: Similarity threshold a neighbour must exceed to be offered
            as an alternative.

    Returns:
        A list with one inner list of candidate words per token, in token order.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    reformulated_sentence_words = []
    for word, pos_tag in tagged_tokens:
        options = [word]
        if pos_tag in adjective_tags:
            try:
                neighbours = find_most_similar(word, n=10)
            except KeyError:
                # Only the out-of-vocabulary case is handled here; any other
                # error is a real problem and is allowed to propagate.
                print(f'There is no {word} word in FastText dictionary! ...')
            else:
                options.extend(
                    candidate for candidate, similarity in neighbours
                    if similarity > word_similar
                )
        reformulated_sentence_words.append(options)

    return reformulated_sentence_words

def rate(x, i=0):
    """Expand per-position word options into space-joined sentences.

    `x` is a list whose entries are lists of alternative words, one entry per
    sentence position. Starting at position `i`, every combination that takes
    one word from each position is built and joined with single spaces. Words
    of earlier positions vary slowest.

    Args:
        x: List of lists of words.
        i: Index of the first position to include; expected to be at most
            ``len(x)``.

    Returns:
        A list of sentence strings. When only one position contributes, its
        option list is returned as-is; when none does, an empty list.
    """
    combinations = []
    # Build the sentences from the last position backwards, so each step only
    # has to prefix the tails that are already assembled.
    for position in range(len(x) - 1, i - 1, -1):
        options = x[position]
        if combinations:
            combinations = [
                head + ' ' + tail for head in options for tail in combinations
            ]
        else:
            # Nothing has accumulated to the right of this position yet.
            combinations = options
    return combinations

def pos_neg_sentiment(sentence):
    """Find the most positive and most negative reformulation of `sentence`.

    All candidate sentences are produced by ``rate`` from the options returned
    by ``reformulate_sentence_all_option`` and scored with the VADER compound
    score. A candidate replaces the current 'pos' entry only if its score is
    above 0 and above the current best; the 'neg' entry works the same way
    with scores below 0.

    Args:
        sentence: The text to analyse.

    Returns:
        A dict with keys 'pos', 'neg' and 'normal', each mapping to a
        ``(compound_score, sentence)`` tuple. 'normal' is always the original
        sentence, which is also the starting value for 'pos' and 'neg'.
    """
    def compound(text):
        return sentiment_analyzer.polarity_scores(text)['compound']

    options_per_token = reformulate_sentence_all_option(sentence)
    baseline = (compound(sentence), sentence)
    most_positive = most_negative = baseline

    for candidate in rate(options_per_token):
        value = compound(candidate)
        if value > 0 and value > most_positive[0]:
            most_positive = (value, candidate)
        if value < 0 and value < most_negative[0]:
            most_negative = (value, candidate)

    return {'pos': most_positive, 'neg': most_negative, 'normal': baseline}

In [None]:
# Average VADER compound score over the sample when, for each sentence, the
# most positive / most negative of all candidate reformulations is chosen
# ("Pos" / "Neg"), next to the score of the unchanged sentences ("Avg").
avg = 0
pos = 0
neg = 0
# Divide by the number of sentences actually processed. The previous version
# divided by len(res), but `res` is never defined, so the cell raised NameError.
n = len(yelp_sample)
for sentence in tqdm(yelp_sample):
    scores = pos_neg_sentiment(sentence)
    avg += scores['normal'][0]
    pos += scores['pos'][0]
    neg += scores['neg'][0]
avg /= n
pos /= n
neg /= n
print('Pos : ', pos)
print('Avg : ', avg)
print('Neg : ', neg)