# Sentiment Analysis by hand and with NLTK 

# Setup

## Install dependencies

In [None]:
# Who is familiar with Jupyter notebooks?

In [None]:
import sys
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install xlrd

## Import dependencies

In [None]:
import nltk
import csv
import pandas as pd
import numpy as np

#gives us the list of punctuations
import string

# for stopword removal. stopwords are like "the", "and", "over"
from nltk.corpus import stopwords
nltk.download('stopwords')

# gives us functionality to stem a word, e.g. running -> run
from nltk.stem import PorterStemmer

# gives us functionality of a blackbox sentiment analysis analyser (classes: positive, negative, neutral)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon') #pre-trained lexicon similar to AFINN

# Punkt tokenizer models, required by word_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# for synonym translation
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
#import nltk.data
nltk.download('averaged_perceptron_tagger')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# calculating metrics of AFINN
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# in order to be able to get the precision/recall explanation from wikipedia
from IPython.display import Image

## Loading the dataset with user feedbacks

In [None]:
# Filepath to dataset
fpDataset = './data/customer-feedback_full_cleaned_1000.xlsx'

#Load Excel file into a DataFrame
dfData = pd.read_excel(fpDataset, sheet_name='Sheet1')
dfData_backup = dfData.copy()

In [None]:
dfData

In [None]:
# first feedback
dfData['FEEDBACK'].loc[0]

In [None]:
# filter for only positive ratings
dfData[dfData['RATING'] == 1]

## Loading the AFINN-111 Mappings

In [None]:
#loading the AFINN mapping
# Read the tab-separated AFINN file into a list of [term, score] rows.
# The context manager closes the file handle as soon as the rows are read;
# newline='' is what the csv module expects from the file object.
with open('data/AFINN-111.txt', 'r', encoding='utf-8', newline='') as afinnFile:
    lol = list(csv.reader(afinnFile, delimiter='\t'))
# term -> integer score; blank rows (parsed as empty lists) are skipped
afinn = {d[0]: int(d[1]) for d in lol if d}

def afinnScore(word):
    """Return the AFINN score of ``word`` (looked up lower-cased), or 0 if it has no entry."""
    return afinn.get(word.lower(), 0)

In [None]:
afinn

## Getting the sentiment score for the Feedbacks

In [None]:
#get the afinn scores for one example
sampleSentence = dfData['FEEDBACK'].loc[990]

wordList = sampleSentence.split(' ')
wordList_scores = [afinnScore(word) for word in wordList]

print(sampleSentence)
print(wordList_scores)

In [None]:
#which words got scored?

#get all scores in a dictionary
scoredWords = dict(zip(wordList,wordList_scores))
#get only the ones with value != 0
scoredWords = {key: val for key, val in scoredWords.items() if val != 0}
print(scoredWords)

In [None]:
#The word OUTSTANDING is also in AFINN. but it has some punctuation on it. let's remove the punctuation

## Remove punctuation 

In [None]:
def removePunctuation(sentence):
    """Return ``sentence`` without any character contained in ``string.punctuation``."""
    punctuationChars = frozenset(string.punctuation)
    keptChars = []
    for char in sentence:
        if char in punctuationChars:
            continue
        keptChars.append(char)
    return ''.join(keptChars)

print(removePunctuation(sampleSentence))

In [None]:
# lets apply to the whole dataset
dfData['FEEDBACK'] = dfData['FEEDBACK'].apply(removePunctuation)
sampleSentence = dfData['FEEDBACK'].loc[990]

In [None]:
#let's put the afinn score for a sentence into a function and then apply to our sample sentence again 

def getAfinnScores(sentence):
    """Score ``sentence`` word by word with AFINN.

    The sentence is split on single spaces. Returns a tuple
    ``(sentenceScore, scoredWords)``: the sum of all word scores, and a dict
    mapping every word with a non-zero score to that score.
    """
    tokens = sentence.split(' ')
    tokenScores = [afinnScore(token) for token in tokens]

    # only keep the words that actually contribute to the score
    contributing = {}
    for token, score in zip(tokens, tokenScores):
        if score != 0:
            contributing[token] = score

    # the sum runs over the full list, so a repeated word counts every time
    return sum(tokenScores), contributing

In [None]:
getAfinnScores(sampleSentence)

In [None]:
#Now let's apply this function to the whole dataset and add the information to the frame
dfData['AFINN-score'] = dfData['FEEDBACK'].apply(getAfinnScores)

In [None]:
dfData

## Removal of Stopwords using NLTK

In [None]:
# now we remove stopwords that are not required in the dataset, just to clean it up
def removeStopWords(sentence):
    """Return ``sentence`` without its English stop words.

    The sentence is split on single spaces. A word is dropped when its
    lower-cased, punctuation-free form is in NLTK's English stop word list;
    the remaining words are joined with single spaces, unchanged.
    """
    # A set gives constant-time membership tests; the corpus list would be
    # scanned from the start for every single word.
    stopwordSet = set(stopwords.words("english"))
    wordList = [word for word in sentence.split(' ') if removePunctuation(word.lower()) not in stopwordSet]
    return ' '.join(wordList)

#see example:
print(removePunctuation(sampleSentence), end='\n-----------\n')
print(removeStopWords(removePunctuation(sampleSentence)))

In [None]:
# apply it to the whole dataset
dfData['FEEDBACK'] = dfData['FEEDBACK'].apply(removeStopWords)
sampleSentence = dfData['FEEDBACK'].loc[990]

## Normalizing with Synonyms

In [None]:
# we define a function that takes a sentence and replaces each word by a canonical synonym
def replace_synonyms(sentence):
    """Replace each word of ``sentence`` by a canonical WordNet synonym.

    The sentence is split with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. A noun, verb, adjective or adverb is replaced by the
    name of its first WordNet synset with the same part of speech. Proper
    nouns, determiners, words with any other tag and words without a matching
    synset are kept as they are.

    Returns the resulting words joined by single spaces.
    """
    # Penn Treebank tag prefix -> part-of-speech codes used by WordNet synsets.
    # WordNet files adjectives under 'a' and satellite adjectives under 's'.
    wordnet_pos_by_tag_prefix = {
        'NN': ('n',),
        'VB': ('v',),
        'JJ': ('a', 's'),
        'RB': ('r',),
    }

    words = word_tokenize(sentence)
    # Identify the parts of speech
    tagged = nltk.pos_tag(words)

    replaced = []
    for word, tag in tagged:
        replacement = word  # fall back to the original word
        # Proper nouns (NNP/NNPS) are never replaced; determiners and all
        # other tags have no entry in the mapping and are skipped as well.
        allowed_pos = () if tag.startswith('NNP') else wordnet_pos_by_tag_prefix.get(tag[:2], ())
        if allowed_pos:
            # Only replace nouns with nouns, verbs with verbs etc.
            for syn in wn.synsets(word):
                if syn.pos() in allowed_pos:
                    # synset names look like "<word>.<pos>.<nn>": keep the word only
                    synset_name = syn.name()
                    replacement = synset_name[:synset_name.find(".")]
                    break
        replaced.append(replacement)

    # join() avoids the leading blank that repeated string concatenation produced
    return ' '.join(replaced)


In [None]:
#lets see the difference
print(sampleSentence)
print(replace_synonyms(sampleSentence))

In [None]:
#apply to whole dataset
dfData['FEEDBACK'] = dfData['FEEDBACK'].apply(replace_synonyms)
sampleSentence = dfData['FEEDBACK'].loc[990]

In [None]:
# let's update the afinn score based on cleaned feedbacks
dfData['AFINN-score'] = dfData['FEEDBACK'].apply(getAfinnScores)

In [None]:
dfData

# Measuring the performance of an approach

In [None]:
# first we need to normalize AFINN scores to make them comparable to the RATING scores
def normalize_afinn(affinScore):
    """Map an AFINN sentence score to a RATING-style label: 1 if positive, otherwise 0."""
    return 1 if affinScore > 0 else 0

# An AFINN score of 0 is neutral; this class does not exist in the RATING column.
# Neutral rows therefore get NULL/NaN instead of a label
# and are filtered out later.
def normalize_afinn_scores(dfData):
    """Add (or overwrite) the 'AFINN-score-normalized' column and return ``dfData``.

    Every cell of the 'AFINN-score' column must be a pair
    ``(sentence_score, word_scores)``. The new column holds 1.0 for a positive
    sentence score, 0.0 for a negative one and NaN for a neutral score of zero.
    ``dfData`` is modified in place.
    """
    # Take the sentence score explicitly: assigning the whole two-column score
    # frame to a single column is rejected by current pandas versions.
    sentence_scores = pd.Series(
        [sentence_score for sentence_score, _word_scores in dfData['AFINN-score']],
        index=dfData.index,  # stay aligned even if the index is not 0..n-1
        dtype=float,
    )
    labels = sentence_scores.map(normalize_afinn).astype(float)
    # keep the label only for non-neutral rows; all others become NaN
    dfData['AFINN-score-normalized'] = labels.where(sentence_scores != 0)
    return dfData
    
dfData = normalize_afinn_scores(dfData)

In [None]:
dfData

In [None]:
Image(url='https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg')
# Precision = TP / allSelected = TP / (TP+FP) 
    #-> "Of all items identified as positive, how much % are correctly identified as positive?" 
    # high precision means that an algorithm returned substantially more relevant results than irrelevant ones
    
# Recall = TP / allRelevant = TP / (TP+FN) 
    #-> "of all positive items, how much %  are correctly identified as positive?"
    # high recall means that an algorithm returned most of the relevant results.

In [None]:
def get_afinn_performance(df=None):
    """Compute precision, recall and F1 of the normalized AFINN labels against RATING.

    Args:
        df: frame with the columns 'AFINN-score-normalized' and 'RATING'.
            Defaults to the notebook-level ``dfData``.

    Returns:
        ``(precision, recall, f1, dfScoring)`` where ``dfScoring`` holds the
        rows of ``df`` whose normalized score is not null. A ratio whose
        denominator is zero is reported as 0.0.
    """
    if df is None:
        df = dfData

    # lets filter out all NULL/NAN values (where the AFINN score was neutral at Zero)
    dfScoring = df[df['AFINN-score-normalized'].notnull()]

    predicted = dfScoring['AFINN-score-normalized']
    actual = dfScoring['RATING']
    true_positive = int(((predicted == 1) & (actual == 1)).sum())   # Algo identified as 1, and groundtruth is 1
    false_positive = int(((predicted == 1) & (actual == 0)).sum())  # Algo identified as 1, but it is 0
    false_negative = int(((predicted == 0) & (actual == 1)).sum())  # Algo didn't identify as 1, but it is 1

    def ratio(numerator, denominator):
        # no predicted/actual positives at all: report 0.0 instead of dividing by zero
        return numerator / denominator if denominator else 0.0

    precision = ratio(true_positive, true_positive + false_positive)
    recall = ratio(true_positive, true_positive + false_negative)
    f1 = ratio(2 * precision * recall, precision + recall)  # Harmonic average of the precision and recall. Range [0,1]

    return precision, recall, f1, dfScoring


# Unpack into `f1` so that this statement does not rebind the name `f1_score`,
# which the notebook imports from sklearn.metrics.
precision, recall, f1, dfScoring = get_afinn_performance()

print('{} precision'.format(precision))
print('{} recall'.format(recall))
print('{} f1 score'.format(f1)) # Harmonic average of the precision and recall. Range [0,1]

In [None]:
# ....or use libraries for this purpose
y_test = dfScoring['RATING']
y_pred = dfScoring['AFINN-score-normalized']
print('{} precision'.format(precision_score(y_test, y_pred)))
print('{} recall'.format(recall_score(y_test, y_pred)))
# The name `f1_score` may have been rebound to a float by the manual
# calculation, so fetch sklearn's function under an alias before calling it.
from sklearn.metrics import f1_score as sklearn_f1_score
print('{} f1 score'.format(sklearn_f1_score(y_test, y_pred)))

In [None]:
#Now lets compare without having cleaned the data
# score the untouched feedback texts from the backup copy
dfData['AFINN-score'] = dfData_backup['FEEDBACK'].apply(getAfinnScores)
#update the normalized values
dfData = normalize_afinn_scores(dfData)

# Unpack into `f1` so that this statement does not rebind the name `f1_score`,
# which the notebook imports from sklearn.metrics.
precision, recall, f1, dfScoring = get_afinn_performance()

print('{} precision'.format(precision))
print('{} recall'.format(recall))
print('{} f1 score'.format(f1))

## Word stemming using NLTK's PorterStemmer

In [None]:
#Word stemming means trimming the word to its lexicographical stem.
# this way multiple derivatives of words can be categorized as one. e.g. running -> run

# In general it is good practice to stem all the words,
# as some algorithms assume that the words have been stemmed with a specific algorithm

def stemWords(sentence):
    """Return ``sentence`` with every space-separated word replaced by its Porter stem."""
    stemmer = PorterStemmer()
    stems = map(stemmer.stem, sentence.split(' '))
    return ' '.join(stems)

In [None]:
# this stemmer is rule-based rather than dictionary-based. It follows the principle of:
# "the purpose of stemming is to bring variant forms of a word together, not to map a word onto its ‘paradigm’ form."

print(sampleSentence)
print(stemWords(sampleSentence))

In [None]:
#let's NOT apply to the whole dataset just yet

#dfData['FEEDBACK'] = dfData['FEEDBACK'].apply(stemWords)
#sampleSentence = dfData['FEEDBACK'].loc[990]

In [None]:
#one word of caution: there are several algorithms that assume different initial conditions.
# an example is here: the goal of stemming was to make words more generalizeable. but the AFINN mapping assumes no stemming

# for example for words in the "family" of "affect", AFINN has scores for the following:
afinn_affect = [word for word in afinn if (word.startswith('affect'))]
print([getAfinnScores(word) for word in afinn_affect], end ="\n----------\n")

# the stemmed versions are as follows:
ps_affect = [PorterStemmer().stem(word) for word in afinn_affect]
print(ps_affect, end ="\n----------\n")

#and these are the scores after the stemming
print([getAfinnScores(word) for word in ps_affect], end ="\n----------\n")

In [None]:
#the afinn score after stemming is negatively affected
print(getAfinnScores(stemWords(sampleSentence)), end ="\n-------\n")

# This is because the stemmer changed the word OUTSTAND to something that is not mapped in AFINN
print([PorterStemmer().stem(word) for word in sampleSentence.split(' ')]) 

## Getting scores from pre-trained black box model with NLTK 

In [None]:
sid = SentimentIntensityAnalyzer()

# prints the scores for some feedbacks
for sentence in dfData['FEEDBACK'].loc[4:5]:
    print(sid.polarity_scores(sentence))
    print(sentence, end='\n------------\n')
    

# here the vader model is already a trained model. 
# we use the model to calculate the sentiment score for the sentences in our dataset

#source: https://opensourceforu.com/2016/12/analysing-sentiments-nltk/

In [None]:
# as we do not know on which basis this blackbox algorithm works, 
# one has to test whether stemming, removing punctuations etc gives a better or worse result

#here we use the unmodified dfData_backup
for sentence in dfData_backup['FEEDBACK'].loc[4:5]:
    print(sid.polarity_scores(sentence))
    print(sentence, end='\n------------\n')

## Training own generic classifier 

In [None]:
#now we use the labels from the dataset

# here we do not have a pre-trained model. we use our own model to train a generic classifier
    
# Step 1 – Training data: (feedback text, rating) pairs
#labels = ['neg','pos','neg','pos','pos']
dataset = list(zip(dfData["FEEDBACK"],dfData["RATING"]))

# Tokenize every feedback exactly once. The tokens are lower-cased so that they
# are comparable with the lower-cased vocabulary built in step 2.
tokenized = [({token.lower() for token in word_tokenize(feedback)}, rating) for feedback, rating in dataset]

# Step 2 – vocabulary: every lower-cased token that occurs in any feedback
dictionary = set().union(*(tokens for tokens, _ in tokenized))

# Step 3 – one feature dict per feedback: vocabulary word -> is it in this feedback?
t = [({word: (word in tokens) for word in dictionary}, rating) for tokens, rating in tokenized]

# Step 4 – the classifier is trained with sample data
classifier = nltk.NaiveBayesClassifier.train(t)

In [None]:
classifier.show_most_informative_features()

In [None]:
test_data = sampleSentence
print(sampleSentence)

# Tokenize the sample once instead of once per vocabulary word.
test_tokens = set(word_tokenize(test_data.lower()))
# feature dict in the shape used for training: vocabulary word -> is it in the sample?
test_data_features = {word.lower(): (word in test_tokens) for word in dictionary}

# print the probability the classifier assigns to each label
distribution = classifier.prob_classify(test_data_features)
for label in distribution.samples():
    print("%s: %f" % (label, distribution.prob(label)))

In [None]:
test_data_features