### Import libraries

In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

### Load Data

In [8]:
# Load the JSON file into a DataFrame
file_path = 'data/sarcasm_data.json'
df = pd.read_json(file_path).transpose()

# Reset the index to turn the first element into a new column
df = df.reset_index()

### Valder sentiment Dataframe context overall + utterance

In [12]:
# Load VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    return analyzer.polarity_scores(text)

# Initialize dataframe
df_v = pd.DataFrame()

# Add columns to df_v
df_v['sentiment_utterance'] = df['utterance'].apply(get_sentiment)
df_v['sentiment_context_all'] = df['context'].apply(get_sentiment)
df_v['sentiment_context_per_sentence'] = df['context'].apply(lambda context: [get_sentiment(sentence) for sentence in context])

# Converts sarcasm True to 1 and False to 0
y = df['sarcasm'].astype(int).tolist()

context_all = df_v["sentiment_context_all"]
context_sentences = df_v["sentiment_context_per_sentence"]
utterance = df_v["sentiment_utterance"]

# Extracting 'neg', 'neu', and 'pos' per sentence and add index to keep context 
X = np.array([ [context_all[i]['neg'], context_all[i]['neu'], context_all[i]['pos'], utterance[i]['neg'], utterance[i]['neu'], utterance[i]['pos']] for i in range(context_all.shape[0]) ])


pd.DataFrame(X, columns=['context_neg', 'context_neu', 'context_pos', 'utterance_neg', 'utterance_neu', 'utterance_pos'])


Unnamed: 0,context_neg,context_neu,context_pos,utterance_neg,utterance_neu,utterance_pos
0,0.000,1.000,0.000,0.000,0.783,0.217
1,0.000,0.871,0.129,0.180,0.820,0.000
2,0.143,0.857,0.000,0.000,1.000,0.000
3,0.000,0.906,0.094,0.058,0.851,0.091
4,0.097,0.815,0.088,0.000,1.000,0.000
...,...,...,...,...,...,...
685,0.000,1.000,0.000,0.102,0.898,0.000
686,0.062,0.751,0.187,0.000,0.858,0.142
687,0.506,0.494,0.000,0.000,0.763,0.237
688,0.000,1.000,0.000,0.000,0.781,0.219


## Valder 3 context sentences + utterance

In [13]:
# Define the keys for the floats
keys = ['neg', 'neu', 'pos']

# Initialize a list to hold the resulting arrays
X_sentences = []

# Process each array (list of dictionaries)
for idx, sentences in enumerate(context_sentences):
    # Ensure each list has at least 3 dictionaries, padding if necessary
    sentences.extend([{'neg': 0.0, 'neu': 0.0, 'pos': 0.0}] * (3 - len(sentences)))

    # Extract the relevant values from the last 3 context sentences and the corresponding utterance
    context_values = [value for sentence in sentences[-3:] for value in (sentence['neg'], sentence['neu'], sentence['pos'])]
    utterance_values = [utterance[idx][key] for key in keys]

    # Combine the context and utterance values
    X_sentences.append(context_values + utterance_values)

# Create a DataFrame with appropriate column names
columns = [f"Ctx_{i}_{key}" for i in range(3, 0, -1) for key in keys] + [f"Utr_{key}" for key in keys]
pd.DataFrame(X_sentences, columns=columns)

Unnamed: 0,Ctx_3_neg,Ctx_3_neu,Ctx_3_pos,Ctx_2_neg,Ctx_2_neu,Ctx_2_pos,Ctx_1_neg,Ctx_1_neu,Ctx_1_pos,Utr_neg,Utr_neu,Utr_pos
0,0.000,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.00,0.000,0.783,0.217
1,0.000,1.000,0.000,0.000,0.426,0.574,0.000,1.000,0.00,0.180,0.820,0.000
2,0.268,0.732,0.000,0.000,1.000,0.000,0.116,0.884,0.00,0.000,1.000,0.000
3,0.000,1.000,0.000,0.000,0.649,0.351,0.000,1.000,0.00,0.058,0.851,0.091
4,0.349,0.651,0.000,0.000,0.435,0.565,0.000,1.000,0.00,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
685,0.000,1.000,0.000,0.000,1.000,0.000,0.000,0.000,0.00,0.102,0.898,0.000
686,0.000,1.000,0.000,0.105,0.702,0.193,0.000,0.750,0.25,0.000,0.858,0.142
687,0.000,1.000,0.000,0.506,0.494,0.000,0.000,0.000,0.00,0.000,0.763,0.237
688,0.000,1.000,0.000,0.000,0.000,1.000,0.000,1.000,0.00,0.000,0.781,0.219


### Hartman sentiments context overall

In [14]:
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
def hartmann_sentiment(text):
    text = classifier(text)
    # Assuming 'text' is already defined and contains the necessary data
    sentiments = [text[0][i]['score'] for i in range(6)]
    sentiment = np.argmax(sentiments)
    return sentiment



In [16]:
# Initialize data frame
df_h = pd.DataFrame()

# Apply sentiment analysis to the 'utterance' column
df_h['sentiment_utterance'] = df['utterance'].apply(hartmann_sentiment)

# Apply sentiment analysis to the 'context' column
df_h['sentiment_context_all'] = df['context'].apply(hartmann_sentiment)

df_h

# anger = 0, disgust = 1, fear = 2,  joy = 3, neutral = 4, sadness =  5, surprise = 6

Unnamed: 0,sentiment_utterance,sentiment_context_all
0,4,4
1,4,4
2,4,4
3,4,4
4,4,2
...,...,...
685,0,0
686,1,0
687,4,4
688,1,4


### Hartmann sentiment 3 context sentences + utterance

In [23]:
# Apply sentiment analysis to each sentence in the 'context' column
sentiment_context_per_sentence = df['context'].apply(lambda context: [hartmann_sentiment(sentence) for sentence in context])

# inialize array
X_sentences = []

# Fill array with sentiments
for idx, sentence in enumerate(sentiment_context_per_sentence):
    # Ensure each list has at least 3 context sequences
    sentence.extend([4] * (3 - len(sentence)))# Classify as 4 (Neutral) if no context sentence
    sentence = sentence[:3]
    X_sentences.append(sentence)

# Format dataframe
columns = ["Ctx_1", "Ctx_2", "Ctx_3"]
df_h_3 = pd.DataFrame(X_sentences, columns=columns)
df_h_3["Utt"] = df_h['sentiment_utterance']

# Display dataframe
df_h_3

# anger = 0, disgust = 1, fear = 2,  joy = 3, neutral = 4, sadness =  5, surprise = 6, no sentence = 4

Unnamed: 0,Ctx_1,Ctx_2,Ctx_3,Utt
0,4,5,4,4
1,4,4,4,4
2,4,4,4,4
3,4,0,4,4
4,2,1,1,4
...,...,...,...,...
685,0,4,4,0
686,0,4,1,1
687,4,0,4,4
688,4,4,3,1
