## Lexicon-based approach

In [1]:
import json
import pandas as pd
from nltk import word_tokenize
import string
import numpy as np
from nltk.corpus import sentiwordnet as swn
import nltk
from tqdm import tqdm
from sklearn.metrics import classification_report

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Bart/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load data

In [3]:
# get list of all paths to the json-files of english episodes given subset number (bart: 0 , juno: 1, joris: 2)

def get_paths_for_en_episodes(subset_number):
    """
    Function returns list of all paths to the json-files of english 
    episodes given subset number (bart: 0 , juno: 1, joris: 2)
    
    """

    path1 = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    folders = listdir(path1)

    if '.DS_Store' in folders:
        folders.remove('.DS_Store')

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):    
        path2 = path1 + '/' + letter_or_number


        for show_uri in listdir(path2):
            path3 = path2 + '/' + show_uri

            # select english shows only
            show_metadata = metadata_df.loc[metadata_df['show_filename_prefix'] == show_uri]

            if len(show_metadata['language'].unique()) > 0:
                if 'en' in show_metadata['language'].unique()[0]:
                    for episode_uri in listdir(path3):
                        path4 = path3 + '/' + episode_uri

                        if '.json' in path4:
                            podcast_episodes_paths.append(path4)

                
        
    return len(podcast_episodes_paths), podcast_episodes_paths

## NLTK SentiWordNet

In [4]:
def SentiWordNet_sentiment(sentence, binary = True):
    """
    Returns sentiment score for a podcast utterance with tagged tokens 
    using SentiWordNet
    """
    
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(sentence)
    
    
    tagged_tokens = nltk.pos_tag(tokens)
    
    tokens_sentiment_scores = []
    for token in tagged_tokens:
        tag = ''
        lemma = lemmatizer.lemmatize(token[0])
        if token[1].startswith('N'):
            tag = 'n'
        elif token[1].startswith('J'):
            tag = 'a'
        elif token[1].startswith('V'):
            tag = 'v'
        elif token[1].startswith('R'):
            tag = 'r'
        if tag != '':
            synonyms = list(swn.senti_synsets(lemma, tag)) 
            token_sentiment = 0
            if len(synonyms) > 0:
                for synonym in synonyms:
                    token_sentiment += synonym.pos_score() - synonym.neg_score()
                tokens_sentiment_scores.append(token_sentiment/len(synonyms))      
                
       
    if binary == False:
    
        if tokens_sentiment_scores != []:
            
            sentiment_score = sum(tokens_sentiment_scores)/len(tokens_sentiment_scores) 

            if sentiment_score >= 0.001:
                return 1
            elif sentiment_score < -0.001:
                return -1
            else:
                return 0

        else:   
            return 0
        
    elif binary == True:
        if tokens_sentiment_scores != []:
            sentiment_score = sum(tokens_sentiment_scores)/len(tokens_sentiment_scores) 

            if sentiment_score >= 0:
                return 1
            elif sentiment_score < 0:
                return 0

        else:   
            return 1
        

### Binary Validation

In [14]:
val_df = pd.read_csv('labeled_datasets/binary/binary_val.csv', sep='\t')
val_df.head(5)

Unnamed: 0,text,sentiment_score
0,Just so people can understand what he just sai...,1.0
1,"Yeah, I mean small businesses tough enough. So...",1.0
2,"I think Jughead needs to go, you know, it need...",0.0
3,I don't know it was like I hated it when I was...,1.0
4,"Yeah, so and that is that is also really based...",0.0


In [15]:
target_labels = val_df['sentiment_score'].values
predicted_labels = []
for sample in tqdm(val_df['text']):
    predicted_sentiment = SentiWordNet_sentiment(sample, binary=True)
    predicted_labels.append(predicted_sentiment)

predicted_labels = np.array(predicted_labels)  
print(classification_report(target_labels, predicted_labels))

100%|██████████| 1292/1292 [00:27<00:00, 46.16it/s]

              precision    recall  f1-score   support

         0.0       0.48      0.42      0.45       405
         1.0       0.75      0.80      0.77       887

    accuracy                           0.68      1292
   macro avg       0.62      0.61      0.61      1292
weighted avg       0.67      0.68      0.67      1292






### Non-binary validation

In [7]:
nb_val_df = pd.read_csv('labeled_datasets/nonbinary/nonbinary_val.csv', sep='\t')
nb_val_df.head(5)

Unnamed: 0,text,sentiment_score
0,So I'm sure that's probably where he started g...,0.0
1,living room we get living room was like that a...,-1.0
2,Thank you every Every single person who's neve...,1.0
3,"Yeah. Yeah, right then we will now move on we ...",1.0
4,I do I do. That's fucking disgusting dude. You...,1.0


In [8]:
nb_target_labels = list(nb_val_df['sentiment_score'].values)
nb_predicted_labels = []
for sample in tqdm(nb_val_df['text']):
    nb_predicted_sentiment = SentiWordNet_sentiment(sample, binary=False)
    nb_predicted_labels.append(nb_predicted_sentiment)

print(classification_report(nb_target_labels, nb_predicted_labels))

100%|██████████| 1292/1292 [00:18<00:00, 71.67it/s]

              precision    recall  f1-score   support

        -1.0       0.52      0.42      0.47       436
         0.0       0.24      0.08      0.12       197
         1.0       0.59      0.78      0.67       659

    accuracy                           0.55      1292
   macro avg       0.45      0.43      0.42      1292
weighted avg       0.51      0.55      0.52      1292






In [9]:
# Visualize
import matplotlib.pyplot as plt
plt.figure(figsize=(16,5))
plt.xlabel('Utterance number')
plt.ylabel('Sentiment')
plt.plot(range(0, len(dialogue_sentiment_SentiWordNet)), dialogue_sentiment_SentiWordNet)

NameError: name 'dialogue_sentiment_SentiWordNet' is not defined

## VADER Sentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyser = SentimentIntensityAnalyzer()

def VADER_sentiment(sentence):
    score = analyser.polarity_scores(sentence)
    return score['pos']-score['neg']

In [None]:
dialogue_sentiment_vader = []
for sentence in sentences: 
    sentiment = VADER_sentiment(sentence)
    print(sentence, sentiment)
    dialogue_sentiment_vader.append(sentiment)
    
print(sum(dialogue_sentiment_vader)/len(dialogue_sentiment_vader))

In [None]:
# Visualize
import matplotlib.pyplot as plt
plt.figure(figsize=(16,5))
plt.xlabel('Utterance number')
plt.ylabel('Sentiment')
plt.plot(range(0, len(dialogue_sentiment_vader)), dialogue_sentiment_vader)

## flair

In [None]:
# do on colab

In [None]:
print("apple" < "banana")
print("Unicode of a", ord("a"))
print("Unicode of b", ord("b"))
print("Unicode of A", ord("A"))
print("Unicode of B", ord("B"))

## SentiWordNet context-based (manually)

In [None]:
# create manual context based sentiwordnet analyzer, usingn phrases as well

## Train a classifier with VADER or SentiWordNet-sentiment labeled

## Useful links to explore

https://github.com/AntoinePassemiers/Lexicon-Based-Sentiment-Analysis

https://medium.com/@datamonsters/sentiment-analysis-tools-overview-part-1-positive-and-negative-words-databases-ae35431a470c