## Load and format data

In [33]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Load VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Load the JSON file into a DataFrame
file_path = 'data/sarcasm_data.json'
df = pd.read_json(file_path).transpose()

# Reset the index to turn the first element into a new column
df = df.reset_index()

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    return analyzer.polarity_scores(text)

# Apply sentiment analysis to the 'utterance' column
df['sentiment_utterance'] = df['utterance'].apply(get_sentiment)

# Apply sentiment analysis to the 'context' column
df['sentiment_context_all'] = df['context'].apply(get_sentiment)

# Apply sentiment analysis to each sentence in the 'context' column
df['sentiment_context_per_sentence'] = df['context'].apply(lambda context: [get_sentiment(sentence) for sentence in context])

# Visualize the DataFrame
df

Unnamed: 0,index,utterance,speaker,context,context_speakers,show,sarcasm,sentiment_utterance,sentiment_context_all,sentiment_context_per_sentence
0,160,It's just a privilege to watch your mind at work.,SHELDON,[I never would have identified the fingerprint...,"[LEONARD, SHELDON]",BBT,True,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
1,170,I don't think I'll be able to stop thinking ab...,PENNY,[This is one of my favorite places to kick bac...,"[HOWARD, PENNY, HOWARD, HOWARD, HOWARD, PENNY,...",BBT,True,"{'neg': 0.18, 'neu': 0.82, 'pos': 0.0, 'compou...","{'neg': 0.0, 'neu': 0.871, 'pos': 0.129, 'comp...","[{'neg': 0.0, 'neu': 0.705, 'pos': 0.295, 'com..."
2,180,"Since it's not bee season, you can have my epi...",SHELDON,"[Here we go. Pad thai, no peanuts., But does i...","[LEONARD, HOWARD, LEONARD]",BBT,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.143, 'neu': 0.857, 'pos': 0.0, 'comp...","[{'neg': 0.268, 'neu': 0.732, 'pos': 0.0, 'com..."
3,190,"Lois Lane is falling, accelerating at an initi...",SHELDON,[A marathon? How many Superman movies are ther...,"[PENNY, SHELDON, PENNY, SHELDON, SHELDON, PENN...",BBT,False,"{'neg': 0.058, 'neu': 0.851, 'pos': 0.091, 'co...","{'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'comp...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
4,1105,I'm just inferring this is a couch because the...,SHELDON,"[Great Caesar's ghost, look at this place., So...","[SHELDON, LEONARD, SHELDON, SHELDON, SHELDON, ...",BBT,True,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.097, 'neu': 0.815, 'pos': 0.088, 'co...","[{'neg': 0.202, 'neu': 0.439, 'pos': 0.36, 'co..."
...,...,...,...,...,...,...,...,...,...,...
685,2169,"Hes not right for the part, and if I suggest h...",CHANDLER,"[What am I gonna do now?, Just pass the tape a...","[CHANDLER, RACHEL]",FRIENDS,True,"{'neg': 0.102, 'neu': 0.898, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
686,2235,"Oh yeah he has a caretaker his older brother, ...",CHANDLER,"[Helo! Anybody in there order a celebrity?, Wh...","[JOEY, PERSON, CHANDLER, PERSON]",FRIENDS,False,"{'neg': 0.0, 'neu': 0.858, 'pos': 0.142, 'comp...","{'neg': 0.062, 'neu': 0.751, 'pos': 0.187, 'co...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
687,234,Is it me or the greetings gone downhill around...,CHANDLER,"[Hey, You son of a bitch!]","[CHANDLER, JOEY]",FRIENDS,True,"{'neg': 0.0, 'neu': 0.763, 'pos': 0.237, 'comp...","{'neg': 0.506, 'neu': 0.494, 'pos': 0.0, 'comp...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
688,2608,"You are right, by saying nice, I am virtually ...",CHANDLER,"[Did I go to this school?, Hey, there's Missy ...","[CHANDLER, ROSS, CHANDLER, ROSS]",FRIENDS,True,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."


### Visualisation format SA context and utterance

In [3]:
df_context = df['sentiment_context_all']

print("context all sequences:", df['sentiment_context_all'][0])
df['sentiment_context_all'][0]
print("context per sequence:", df['sentiment_context_per_sentence'][0])
print("utterance:", df['sentiment_utterance'][0] )

context all sequences: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
context per sequence: [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}]
utterance: {'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'compound': 0.3612}


# Format Data for Training

In [119]:
# Converts sarcasm True to 1 and False to 0
y = df['sarcasm'].astype(int).tolist()

context_all = df["sentiment_context_all"]
context_sentences = df["sentiment_context_per_sentence"]
utterance = df["sentiment_utterance"]

# Extracting 'neg', 'neu', and 'pos' per sentence and add index to keep context 
X = np.array([ [context_all[i]['neg'], context_all[i]['neu'], context_all[i]['pos'], utterance[i]['neg'], utterance[i]['neu'], utterance[i]['pos']] for i in range(context_all.shape[0]) ])

X_df = pd.DataFrame(X, columns=['context_neg', 'context_neu', 'context_pos', 'utterance_neg', 'utterance_neu', 'utterance_pos'])

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Visualize dataset

In [120]:
X_df

Unnamed: 0,context_neg,context_neu,context_pos,utterance_neg,utterance_neu,utterance_pos
0,0.000,1.000,0.000,0.000,0.783,0.217
1,0.000,0.871,0.129,0.180,0.820,0.000
2,0.143,0.857,0.000,0.000,1.000,0.000
3,0.000,0.906,0.094,0.058,0.851,0.091
4,0.097,0.815,0.088,0.000,1.000,0.000
...,...,...,...,...,...,...
685,0.000,1.000,0.000,0.102,0.898,0.000
686,0.062,0.751,0.187,0.000,0.858,0.142
687,0.506,0.494,0.000,0.000,0.763,0.237
688,0.000,1.000,0.000,0.000,0.781,0.219


In [121]:
# Extracted keys for the floats
keys = ['neg', 'neu', 'pos']

# Initialize a list to hold the resulting arrays
X_sentences = []

# Update sarcasm labels
y_sentences = []

# Process each array (list of dictionaries)
for idx, sentences in enumerate(context_sentences):
    for entry in sentences:
        # Extract values for each key and concatenate with the index
        values = [entry.get(key, 0) for key in keys]

        # Create entries like so: index, negative score, neutral score, positive score 
        X_sentences.append([idx] + values + [utterance[idx]['neg'] , utterance[idx]['neu'] , utterance[idx]['pos']])

        # Update labels
        y_sentences.append(y[idx])
    
print("The size of the y and x match:",len(X_sentences)==len(y_sentences))

# Split the dataset into 80% training and 20% testing
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X, y, test_size=0.2, random_state=42)

The size of the y and x match: True


## Visualize Final Dataset

In [118]:
pd.DataFrame(X_sentences, columns=['utterance idx', 'sentence_neg', 'sentence_neu' ,'sentence_pos', 'utterance_neg', 'utterance_neu', 'utterance_pos'])

Unnamed: 0,utterance idx,sentence_neg,sentence_neu,sentence_pos,utterance_neg,utterance_neu,utterance_pos
0,0,0.000,1.000,0.000,0.00,0.783,0.217
1,0,0.000,1.000,0.000,0.00,0.783,0.217
2,1,0.000,0.705,0.295,0.18,0.820,0.000
3,1,0.000,0.303,0.697,0.18,0.820,0.000
4,1,0.000,0.732,0.268,0.18,0.820,0.000
...,...,...,...,...,...,...,...
2256,688,0.000,1.000,0.000,0.00,0.781,0.219
2257,688,0.000,1.000,0.000,0.00,0.781,0.219
2258,688,0.000,0.000,1.000,0.00,0.781,0.219
2259,688,0.000,1.000,0.000,0.00,0.781,0.219


# Training Classifier

## Overal context score

In [34]:
# Create a Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(report)

Accuracy: 0.49
Confusion Matrix:
[[42 11]
 [60 25]]
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.79      0.54        53
           1       0.69      0.29      0.41        85

    accuracy                           0.49       138
   macro avg       0.55      0.54      0.48       138
weighted avg       0.59      0.49      0.46       138



# Sentence specific context score

In [98]:
# Create a Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(Xs_train, ys_train)

# Predict on the test data
ys_pred = model.predict(Xs_test)

# Evaluate the model
accuracy = accuracy_score(ys_test, ys_pred)
conf_matrix = confusion_matrix(ys_test, ys_pred)
report = classification_report(ys_test, ys_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(report)

Accuracy: 0.54
Confusion Matrix:
[[34 19]
 [45 40]]
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.64      0.52        53
           1       0.68      0.47      0.56        85

    accuracy                           0.54       138
   macro avg       0.55      0.56      0.54       138
weighted avg       0.58      0.54      0.54       138

