## Lexicon-based approach

In [31]:
import json
import pandas as pd
from nltk import word_tokenize
import string
import numpy as np
from nltk.corpus import sentiwordnet as swn
import nltk
from tqdm import tqdm
from os import listdir
from sklearn.metrics import classification_report

In [32]:
# Fetch every NLTK resource used further down, not only WordNet:
#   - 'wordnet' / 'sentiwordnet' back swn.senti_synsets and the WordNet lemmatizer
#   - 'punkt' backs nltk.word_tokenize
#   - 'averaged_perceptron_tagger' backs nltk.pos_tag
# nltk.download is a no-op for packages that are already up to date.
# NOTE(review): recent NLTK releases may expect differently named tokenizer /
# tagger packages (e.g. 'punkt_tab') — confirm against the installed version.
for nltk_resource in ('wordnet', 'sentiwordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package wordnet to /Users/Bart/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load data

In [33]:
# Get the list of paths to the JSON files of all English episodes for a given subset number (bart: 0, juno: 1, joris: 2)

def get_paths_for_en_episodes(subset_number, metadata=None):
    """
    Collect the paths of all JSON transcript files that belong to English
    shows inside one subset folder (bart: 0, juno: 1, joris: 2).

    The directory layout walked is
    ``podcast_data_no_audio/podcasts-transcripts/<subset>/<folder>/<show>/<episode>``.

    Parameters
    ----------
    subset_number : int or str
        Name of the subset folder to scan.
    metadata : pandas.DataFrame, optional
        Show metadata with the columns 'show_filename_prefix' and 'language'.
        When omitted, the notebook-level ``metadata_df`` is used, which keeps
        existing calls working (and raises NameError if it was never loaded).

    Returns
    -------
    tuple
        ``(number_of_paths, list_of_paths)``.
    """
    if metadata is None:
        # Backward-compatible fallback to the global frame the original relied on.
        metadata = metadata_df

    path1 = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    # '.DS_Store' is a stray file, not a folder that can be listed.
    folders = [name for name in listdir(path1) if name != '.DS_Store']

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):
        path2 = path1 + '/' + letter_or_number

        for show_uri in listdir(path2):
            # Languages recorded for this show; empty when the show is unknown.
            is_this_show = metadata['show_filename_prefix'] == show_uri
            show_languages = metadata.loc[is_this_show, 'language'].unique()

            # Keep English shows only, judged by the first language listed.
            if len(show_languages) == 0 or 'en' not in show_languages[0]:
                continue

            path3 = path2 + '/' + show_uri
            for episode_uri in listdir(path3):
                # Test the file extension itself: a substring check on the full
                # path would also accept names such as 'episode.json.bak'.
                if episode_uri.endswith('.json'):
                    podcast_episodes_paths.append(path3 + '/' + episode_uri)

    return len(podcast_episodes_paths), podcast_episodes_paths

In [34]:
# Collect the English episode transcript paths for subset 0.
# NOTE(review): the recorded run of this cell ends in
# "NameError: name 'metadata_df' is not defined" — the show metadata frame has
# to be loaded before this call.
get_paths_for_en_episodes(0)

  0%|          | 0/36 [00:00<?, ?it/s]


NameError: name 'metadata_df' is not defined

## NLTK SentiWordNet

In [35]:
def SentiWordNet_sentiment(utterance, binary=True, nb_margin=0.001):
    """
    Classify the sentiment of a podcast utterance using SentiWordNet.

    The utterance is tokenized and POS-tagged. Every token tagged as noun,
    adjective, verb or adverb is looked up in SentiWordNet and scored as the
    mean of (positive score - negative score) over all senses found for it.
    The utterance score is the mean of those token scores, or 0.0 when no
    token could be scored.

    Parameters
    ----------
    utterance : str
        Text to classify.
    binary : bool, default True
        If truthy, return 1 when the score is >= 0 and 0 otherwise.
        If falsy, return a three-way label using ``nb_margin``.
    nb_margin : float, default 0.001
        Half-width of the neutral band in non-binary mode: a score
        >= nb_margin gives 1, a score < -nb_margin gives -1, anything
        else gives 0. Ignored in binary mode.

    Returns
    -------
    int
        1 / 0 in binary mode, 1 / 0 / -1 in non-binary mode.
    """
    # First letter of the tag produced by nltk.pos_tag -> WordNet POS code.
    wordnet_pos_by_initial = {'N': 'n', 'J': 'a', 'V': 'v', 'R': 'r'}

    lemmatizer = nltk.WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(utterance))

    # One averaged sentiment value per token that SentiWordNet knows about.
    tokens_sentiment_scores = []
    for word, pos_tag in tagged_tokens:
        wordnet_pos = wordnet_pos_by_initial.get(pos_tag[:1])
        if wordnet_pos is None:
            # Other parts of speech are not scored, so skip lemmatising them.
            continue

        # NOTE(review): the lemmatizer is called without a POS argument, as in
        # the original — confirm that its default is what is wanted here.
        lemma = lemmatizer.lemmatize(word)

        # Average over every sense SentiWordNet lists for this lemma/POS.
        senses = list(swn.senti_synsets(lemma, wordnet_pos))
        if not senses:
            continue
        sense_total = sum(sense.pos_score() - sense.neg_score() for sense in senses)
        tokens_sentiment_scores.append(sense_total / len(senses))

    if tokens_sentiment_scores:
        sentiment_score = sum(tokens_sentiment_scores) / len(tokens_sentiment_scores)
    else:
        # No evidence either way: treat as exactly neutral, which maps to 1 in
        # binary mode just like the original fallback.
        sentiment_score = 0.0

    if binary:
        return 1 if sentiment_score >= 0 else 0

    if sentiment_score >= nb_margin:
        return 1
    if sentiment_score < -nb_margin:
        return -1
    return 0


### Binary Validation

In [37]:
# load binary validation dataset (tab-separated file)
val_df = pd.read_csv('labeled_datasets/binary/binary_val.csv', sep='\t')
# preview the first five rows
val_df.head(5)

# number of rows in the binary validation set
len(val_df)

1292

In [39]:
# Score every validation utterance with the SentiWordNet classifier and
# report precision / recall / F1 against the gold labels.
target_labels = val_df['sentiment_score'].values
predicted_labels = np.array(
    [SentiWordNet_sentiment(sample) for sample in tqdm(val_df['text'])]
)
print(classification_report(target_labels, predicted_labels))

100%|██████████| 1292/1292 [00:17<00:00, 72.59it/s] 

              precision    recall  f1-score   support

         0.0       0.48      0.42      0.45       405
         1.0       0.75      0.80      0.77       887

    accuracy                           0.68      1292
   macro avg       0.62      0.61      0.61      1292
weighted avg       0.67      0.68      0.67      1292






### Non-binary validation

In [42]:
# load non-binary dataset (tab-separated file)
nb_val_df = pd.read_csv('labeled_datasets/nonbinary/nonbinary_val.csv', sep='\t')
# preview the first five rows
nb_val_df.head(5)

Unnamed: 0,text,sentiment_score
0,Just so people can understand what he just sai...,1.0
1,"Yeah, I mean small businesses tough enough. So...",1.0
2,"I think Jughead needs to go, you know, it need...",-1.0
3,I don't know it was like I hated it when I was...,1.0
4,"Yeah, so and that is that is also really based...",-1.0


In [44]:
# Print one classification report per candidate value of the non-binary margin.
# NOTE(review): `margin` is not passed to SentiWordNet_sentiment in the call
# below, so every iteration classifies the samples in exactly the same way and
# the sweep cannot show any effect of the margin — confirm the intended call.
nb_target_labels = list(nb_val_df['sentiment_score'].values)
margin_values = np.linspace(0, 0.02, 21)
for margin in margin_values:
    nb_predicted_labels = [
        SentiWordNet_sentiment(sample) for sample in tqdm(nb_val_df['text'])
    ]
    print('Metrics for margin = {}'.format(margin))
    print(classification_report(nb_target_labels, nb_predicted_labels))

 24%|██▎       | 305/1292 [00:02<00:09, 102.69it/s]


KeyboardInterrupt: 

## VADER Sentiment

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
def _get_vader_analyser():
    """Return one shared SentimentIntensityAnalyzer, creating it on first use."""
    analyser = getattr(_get_vader_analyser, '_instance', None)
    if analyser is None:
        analyser = SentimentIntensityAnalyzer()
        _get_vader_analyser._instance = analyser
    return analyser


def VADER_sentiment_classifier(utterance, binary=True, nb_margin=0.001):
    """
    Classify the sentiment of an utterance from VADER's compound score.

    Parameters
    ----------
    utterance : str
        Text to classify.
    binary : bool, default True
        If truthy, return 1 when the compound score is >= 0 and 0 otherwise.
        If falsy, return a three-way label using ``nb_margin``.
    nb_margin : float, default 0.001
        Half-width of the neutral band in non-binary mode: a score
        >= nb_margin gives 1, a score < -nb_margin gives -1, anything
        else gives 0. Ignored in binary mode.

    Returns
    -------
    int
        1 / 0 in binary mode, 1 / 0 / -1 in non-binary mode.
    """
    # Reuse a single analyser: building a new one for every utterance repeats
    # its set-up work once per sample and per margin value in the sweeps.
    analyser = _get_vader_analyser()
    score = analyser.polarity_scores(utterance)['compound']

    if binary:
        return 1 if score >= 0 else 0

    if score >= nb_margin:
        return 1
    if score < -nb_margin:
        return -1
    return 0



### Binary VADER validation

In [40]:
# Score every validation utterance with the binary VADER classifier and
# report precision / recall / F1 against the gold labels.
target_labels = val_df['sentiment_score'].values
predicted_labels = np.array(
    [VADER_sentiment_classifier(sample, binary=True) for sample in tqdm(val_df['text'])]
)
print(classification_report(target_labels, predicted_labels))

  0%|          | 0/1292 [00:00<?, ?it/s]


NameError: name 'VADER_sentiment_classifier' is not defined

### Non-binary VADER validation

In [12]:
# Sweep the neutral-band margin of the non-binary VADER classifier and print
# one classification report per candidate value.
nb_target_labels = list(nb_val_df['sentiment_score'].values)
margin_values = np.linspace(0, 0.02, 21)
for margin in margin_values:
    nb_predicted_labels = [
        VADER_sentiment_classifier(sample, binary=False, nb_margin=margin)
        for sample in tqdm(nb_val_df['text'])
    ]
    print('Metrics for margin = {}'.format(margin))
    print(classification_report(nb_target_labels, nb_predicted_labels))

100%|██████████| 1292/1292 [00:31<00:00, 40.44it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  0%|          | 3/1292 [00:00<00:55, 23.19it/s]

Metrics for margin = 0.0
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.00      0.00      0.00       206
         1.0       0.57      0.95      0.71       681

    accuracy                           0.58      1292
   macro avg       0.42      0.40      0.36      1292
weighted avg       0.51      0.58      0.49      1292



100%|██████████| 1292/1292 [00:34<00:00, 42.31it/s]
  0%|          | 5/1292 [00:00<00:31, 41.17it/s]

Metrics for margin = 0.001
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:31<00:00, 40.43it/s]
  0%|          | 5/1292 [00:00<00:28, 44.98it/s]

Metrics for margin = 0.002
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:31<00:00, 44.33it/s]
  0%|          | 5/1292 [00:00<00:31, 41.02it/s]

Metrics for margin = 0.003
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:54<00:00, 39.15it/s]
  0%|          | 3/1292 [00:00<00:47, 26.95it/s]

Metrics for margin = 0.004
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:45<00:00, 25.82it/s]
  0%|          | 4/1292 [00:00<00:38, 33.17it/s]

Metrics for margin = 0.005
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [01:07<00:00, 13.25it/s]
  0%|          | 0/1292 [00:00<?, ?it/s]

Metrics for margin = 0.006
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [01:27<00:00, 14.81it/s]
  0%|          | 3/1292 [00:00<00:58, 22.14it/s]

Metrics for margin = 0.007
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [01:00<00:00, 30.44it/s]
  0%|          | 4/1292 [00:00<00:36, 35.06it/s]

Metrics for margin = 0.008
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:46<00:00, 27.52it/s]
  0%|          | 5/1292 [00:00<00:27, 46.65it/s]

Metrics for margin = 0.009000000000000001
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:53<00:00, 23.96it/s]
  0%|          | 3/1292 [00:00<00:50, 25.37it/s]

Metrics for margin = 0.01
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:56<00:00, 23.01it/s]
  0%|          | 4/1292 [00:00<00:47, 27.15it/s]

Metrics for margin = 0.011
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:46<00:00, 27.78it/s]
  0%|          | 5/1292 [00:00<00:31, 41.27it/s]

Metrics for margin = 0.012
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [01:01<00:00, 20.85it/s]
  0%|          | 5/1292 [00:00<00:30, 42.69it/s]

Metrics for margin = 0.013000000000000001
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:37<00:00, 47.72it/s]
  0%|          | 5/1292 [00:00<00:27, 47.04it/s]

Metrics for margin = 0.014
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:33<00:00, 46.68it/s]
  0%|          | 5/1292 [00:00<00:27, 46.88it/s]

Metrics for margin = 0.015
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:38<00:00, 33.28it/s]
  0%|          | 2/1292 [00:00<01:27, 14.78it/s]

Metrics for margin = 0.016
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:54<00:00, 23.60it/s]
  0%|          | 1/1292 [00:00<03:13,  6.67it/s]

Metrics for margin = 0.017
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:44<00:00, 55.91it/s]
  0%|          | 5/1292 [00:00<00:29, 43.15it/s]

Metrics for margin = 0.018000000000000002
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:40<00:00, 28.76it/s]
  0%|          | 3/1292 [00:00<00:43, 29.55it/s]

Metrics for margin = 0.019
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.38       405
         0.0       0.55      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292



100%|██████████| 1292/1292 [00:47<00:00, 19.56it/s]

Metrics for margin = 0.02
              precision    recall  f1-score   support

        -1.0       0.68      0.26      0.37       405
         0.0       0.54      0.41      0.47       206
         1.0       0.62      0.90      0.74       681

    accuracy                           0.62      1292
   macro avg       0.62      0.52      0.53      1292
weighted avg       0.63      0.62      0.58      1292




