In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
def get_data_Valder():
    # Load VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    # Load the JSON file into a DataFrame
    file_path = 'data/sarcasm_data.json'
    df = pd.read_json(file_path).transpose()

    # Reset the index to turn the first element into a new column
    df = df.reset_index()

    # Define a function to apply sentiment analysis to a text
    def get_sentiment(text):
        return analyzer.polarity_scores(text)

    # Apply sentiment analysis to the 'utterance' column
    df['sentiment_utterance'] = df['utterance'].apply(get_sentiment)

    # Apply sentiment analysis to the 'context' column
    df['sentiment_context_all'] = df['context'].apply(get_sentiment)

    # Apply sentiment analysis to each sentence in the 'context' column
    df['sentiment_context_per_sentence'] = df['context'].apply(lambda context: [get_sentiment(sentence) for sentence in context])

    return df

# Add one extra predictor as an array call "sentiment_features" (logic then is One-Hot encoded)

In [20]:
# U : Uterance sentiment
# oC : Overall context sentiment
# pC : Per sentence context sentiment

# Add 1 with compound // Add 2 without it

## Valder

### Uterance sentiments with pos, neu, neg and compound

In [None]:
df = get_data_Valder()

# Transform the dictionaries into arrays of scores
df['sentiment_features'] = df['sentiment_utterance'].apply(lambda x: list(x.values()))

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]

# Transpose the dataframe to get the orignal json format
df = df.transpose()

df.to_json('sarcasm_data_sentiment_U1.json')

df.iloc[-1], utterance_keys

(0        [0.0, 0.783, 0.217, 0.3612]
 1          [0.18, 0.82, 0.0, -0.296]
 2               [0.0, 1.0, 0.0, 0.0]
 3      [0.058, 0.851, 0.091, 0.4215]
 4               [0.0, 1.0, 0.0, 0.0]
                    ...              
 685     [0.102, 0.898, 0.0, -0.5106]
 686      [0.0, 0.858, 0.142, 0.3595]
 687      [0.0, 0.763, 0.237, 0.4215]
 688      [0.0, 0.781, 0.219, 0.4215]
 689       [0.0, 0.527, 0.473, 0.659]
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg', 'utterance_neu', 'utterance_pos', 'utterance_compound'])

### Uterance sentiments with pos, neu, neg

In [None]:
df = get_data_Valder()

def remove_compound(dict):
    return {key: value for key, value in dict.items() if key != 'compound'}

# Apply the function to the 'sentiment_utterance' columns
df['sentiment_utterance'] = df['sentiment_utterance'].apply(remove_compound)
df['sentiment_features'] = df['sentiment_utterance'].apply(lambda x: list(x.values())) 

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]

# Transpose the dataframe to get the original json format
df = df.transpose()

df.to_json('sarcasm_data_sentiment_U2.json')

df.iloc[-1], utterance_keys

(0        [0.0, 0.783, 0.217]
 1          [0.18, 0.82, 0.0]
 2            [0.0, 1.0, 0.0]
 3      [0.058, 0.851, 0.091]
 4            [0.0, 1.0, 0.0]
                ...          
 685      [0.102, 0.898, 0.0]
 686      [0.0, 0.858, 0.142]
 687      [0.0, 0.763, 0.237]
 688      [0.0, 0.781, 0.219]
 689      [0.0, 0.527, 0.473]
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg', 'utterance_neu', 'utterance_pos'])

### Valder sentiment context overall + utterance with pos, neu, neg and compound

In [15]:
df = get_data_Valder()

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]
context_keys = ['context_' + key for key in df['sentiment_context_all'].apply(lambda x: list(x.keys())).iloc[0]]
keys_list = utterance_keys + context_keys

# Transform the dictionaries into arrays of scores
df['sentiment_features'] = df.apply(lambda row: list(row['sentiment_utterance'].values()) + list(row['sentiment_context_all'].values()), axis=1)

# Transpose the dataframe to get the original json format
df = df.transpose()

# Save the DataFrame to a new JSON file
df.to_json('sarcasm_data_sentiment_UoC1.json')

df.iloc[-1], keys_list

(0        [0.0, 0.783, 0.217, 0.3612, 0.0, 1.0, 0.0, 0.0]
 1      [0.18, 0.82, 0.0, -0.296, 0.0, 0.871, 0.129, 0...
 2       [0.0, 1.0, 0.0, 0.0, 0.143, 0.857, 0.0, -0.4874]
 3      [0.058, 0.851, 0.091, 0.4215, 0.0, 0.906, 0.09...
 4      [0.0, 1.0, 0.0, 0.0, 0.097, 0.815, 0.088, 0.1513]
                              ...                        
 685     [0.102, 0.898, 0.0, -0.5106, 0.0, 1.0, 0.0, 0.0]
 686    [0.0, 0.858, 0.142, 0.3595, 0.062, 0.751, 0.18...
 687    [0.0, 0.763, 0.237, 0.4215, 0.506, 0.494, 0.0,...
 688      [0.0, 0.781, 0.219, 0.4215, 0.0, 1.0, 0.0, 0.0]
 689    [0.0, 0.527, 0.473, 0.659, 0.159, 0.736, 0.105...
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg',
  'utterance_neu',
  'utterance_pos',
  'utterance_compound',
  'context_neg',
  'context_neu',
  'context_pos',
  'context_compound'])

### Valder sentiment context overall + utterance with pos, neu, neg 

In [14]:
df = get_data_Valder()

def remove_compound(dict):
    return {key: value for key, value in dict.items() if key != 'compound'}

# Apply the function to the 'sentiment_utterance' and 'sentiment_context_all' columns
df['sentiment_utterance'] = df['sentiment_utterance'].apply(remove_compound)
df['sentiment_context_all'] = df['sentiment_context_all'].apply(remove_compound)

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]
context_keys = ['context_' + key for key in df['sentiment_context_all'].apply(lambda x: list(x.keys())).iloc[0]]
keys_list = utterance_keys + context_keys

# Transform the dictionaries into arrays of scores
df['sentiment_features'] = df['sentiment_utterance'].apply(lambda x: list(x.values())) + df['sentiment_context_all'].apply(lambda x: list(x.values()))

# Transpose the dataframe to get the original json format
df = df.transpose()

df.to_json('sarcasm_data_sentiment_UoC2.json')

df.iloc[-1] , keys_list

(0            [0.0, 0.783, 0.217, 0.0, 1.0, 0.0]
 1          [0.18, 0.82, 0.0, 0.0, 0.871, 0.129]
 2            [0.0, 1.0, 0.0, 0.143, 0.857, 0.0]
 3      [0.058, 0.851, 0.091, 0.0, 0.906, 0.094]
 4          [0.0, 1.0, 0.0, 0.097, 0.815, 0.088]
                          ...                   
 685          [0.102, 0.898, 0.0, 0.0, 1.0, 0.0]
 686    [0.0, 0.858, 0.142, 0.062, 0.751, 0.187]
 687      [0.0, 0.763, 0.237, 0.506, 0.494, 0.0]
 688          [0.0, 0.781, 0.219, 0.0, 1.0, 0.0]
 689    [0.0, 0.527, 0.473, 0.159, 0.736, 0.105]
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg',
  'utterance_neu',
  'utterance_pos',
  'context_neg',
  'context_neu',
  'context_pos'])

### Valder sentiment utterance + contex_per_sentence with pos, neu, neg and compound

In [29]:
# Define a function to extract the 'neg', 'neu', 'pos', and 'compound' values from the first three dictionaries in a list
def extract_values(sentences):
    # Ensure the list has at least 3 dictionaries, padding if necessary
    sentences.extend([{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}] * (3 - len(sentences)))
    # Extract the values from the first 3 dictionaries
    return [value for sentence in sentences[:3] for key, value in sentence.items()]

df = get_data_Valder()

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]
context_per_sentence_keys = [f"context_{i}_{key}" for i in range(1, 4) for key in ['neg', 'neu', 'pos', 'compound']]
keys_list = utterance_keys + context_per_sentence_keys

# Transform the dictionaries into arrays of scores
df['sentiment_features'] = df.apply(lambda row: list(row['sentiment_utterance'].values()) + extract_values(row['sentiment_context_per_sentence']), axis=1)

# Transpose the dataframe to get the original json format
df = df.transpose()

# Save the DataFrame to a new JSON file
df.to_json('sarcasm_data_sentiment_UpC1.json')

df.iloc[-1], keys_list

(0      [0.0, 0.783, 0.217, 0.3612, 0.0, 1.0, 0.0, 0.0...
 1      [0.18, 0.82, 0.0, -0.296, 0.0, 0.705, 0.295, 0...
 2      [0.0, 1.0, 0.0, 0.0, 0.268, 0.732, 0.0, -0.296...
 3      [0.058, 0.851, 0.091, 0.4215, 0.0, 1.0, 0.0, 0...
 4      [0.0, 1.0, 0.0, 0.0, 0.202, 0.439, 0.36, 0.421...
                              ...                        
 685    [0.102, 0.898, 0.0, -0.5106, 0.0, 1.0, 0.0, 0....
 686    [0.0, 0.858, 0.142, 0.3595, 0.0, 1.0, 0.0, 0.0...
 687    [0.0, 0.763, 0.237, 0.4215, 0.0, 1.0, 0.0, 0.0...
 688    [0.0, 0.781, 0.219, 0.4215, 0.0, 1.0, 0.0, 0.0...
 689    [0.0, 0.527, 0.473, 0.659, 0.159, 0.736, 0.105...
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg',
  'utterance_neu',
  'utterance_pos',
  'utterance_compound',
  'context_1_neg',
  'context_1_neu',
  'context_1_pos',
  'context_1_compound',
  'context_2_neg',
  'context_2_neu',
  'context_2_pos',
  'context_2_compound',
  'context_3_neg',
  'context_3_neu',
  'context_3_pos',
  'cont

### Valder sentiment utterance + contex_per_sentence with pos, neu, neg

In [37]:
# Define a function to extract the 'neg', 'neu', 'pos', and 'compound' values from the first three dictionaries in a list
def extract_values(sentences):
    # Ensure the list has at least 3 dictionaries, padding if necessary
    sentences.extend([{'neg': 0.0, 'neu': 0.0, 'pos': 0.0}] * (3 - len(sentences)))
    # Extract the values from the first 3 dictionaries
    return [value for sentence in sentences[:3] for key, value in sentence.items()]

def remove_compound_1(dict):
    return {key: value for key, value in dict.items() if key != 'compound'}
def remove_compound_2(sentences):
    return [{key: value for key, value in sentence.items() if key != 'compound'} for sentence in sentences]

df = get_data_Valder()

df['sentiment_utterance'] = df['sentiment_utterance'].apply(remove_compound_1)
df['sentiment_context_per_sentence'] = df['sentiment_context_per_sentence'].apply(remove_compound_2)

# Get the keys from the dictionaries and add a prefix
utterance_keys = ['utterance_' + key for key in df['sentiment_utterance'].apply(lambda x: list(x.keys())).iloc[0]]
context_per_sentence_keys = [f"context_{i}_{key}" for i in range(1, 4) for key in ['neg', 'neu', 'pos']]
keys_list = utterance_keys + context_per_sentence_keys

# Transform the dictionaries into arrays of scores
df['sentiment_features'] = df.apply(lambda row: list(row['sentiment_utterance'].values()) + extract_values(row['sentiment_context_per_sentence']), axis=1)

# Transpose the dataframe to get the original json format
df = df.transpose()

# Save the DataFrame to a new JSON file
df.to_json('sarcasm_data_sentiment_UpC2.json')

df.iloc[-1], keys_list

(0      [0.0, 0.783, 0.217, 0.0, 1.0, 0.0, 0.0, 1.0, 0...
 1      [0.18, 0.82, 0.0, 0.0, 0.705, 0.295, 0.0, 0.30...
 2      [0.0, 1.0, 0.0, 0.268, 0.732, 0.0, 0.0, 1.0, 0...
 3      [0.058, 0.851, 0.091, 0.0, 1.0, 0.0, 0.0, 0.58...
 4      [0.0, 1.0, 0.0, 0.202, 0.439, 0.36, 0.356, 0.6...
                              ...                        
 685    [0.102, 0.898, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0...
 686    [0.0, 0.858, 0.142, 0.0, 1.0, 0.0, 0.0, 1.0, 0...
 687    [0.0, 0.763, 0.237, 0.0, 1.0, 0.0, 0.506, 0.49...
 688    [0.0, 0.781, 0.219, 0.0, 1.0, 0.0, 0.0, 1.0, 0...
 689    [0.0, 0.527, 0.473, 0.159, 0.736, 0.105, 0.0, ...
 Name: sentiment_features, Length: 690, dtype: object,
 ['utterance_neg',
  'utterance_neu',
  'utterance_pos',
  'context_1_neg',
  'context_1_neu',
  'context_1_pos',
  'context_2_neg',
  'context_2_neu',
  'context_2_pos',
  'context_3_neg',
  'context_3_neu',
  'context_3_pos'])

## Harmann

classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
def hartmann_sentiment(text):
    text = classifier(text)
    # Assuming 'text' is already defined and contains the necessary data
    sentiments = [text[0][i]['score'] for i in range(6)]
    sentiment = np.argmax(sentiments)
    return sentiment