In [None]:
# Mount Google Drive into the Colab runtime so that the project files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/My Drive/NLP/Project/MUStARD/data

/content/drive/My Drive/NLP/Project/MUStARD/data


## Data Load

In [None]:
import json

# Read the dataset through a context manager so the file handle is closed
# deterministically (the previous open(...).read() never closed it).
with open('sarcasm_data.json', 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)

# Number of top-level records in the JSON document.
print(len(data))

690


In [None]:
import pandas as pd
import numpy as np

# from_dict() turns every top-level JSON key into a column; transposing makes
# those keys the row index (one row per record, one column per field).
df = pd.DataFrame.from_dict(data).T

# Quick look at the first record and at the first few rows.
print(df.iloc[0])
print(df.head())

utterance           It's just a privilege to watch your mind at work.
speaker                                                       SHELDON
context             [I never would have identified the fingerprint...
context_speakers                                   [LEONARD, SHELDON]
show                                                              BBT
sarcasm                                                          True
Name: 1_60, dtype: object
                                               utterance  speaker  ... show sarcasm
1_60   It's just a privilege to watch your mind at work.  SHELDON  ...  BBT    True
1_70   I don't think I'll be able to stop thinking ab...    PENNY  ...  BBT    True
1_80   Since it's not bee season, you can have my epi...  SHELDON  ...  BBT   False
1_90   Lois Lane is falling, accelerating at an initi...  SHELDON  ...  BBT   False
1_105  I'm just inferring this is a couch because the...  SHELDON  ...  BBT    True

[5 rows x 6 columns]


## Word n-gram features


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
# NOTE(review): 'all' downloads every NLTK package, which is slow and produces
# a very long log. The cells visible here only use word_tokenize, the stopwords
# corpus and WordNetLemmatizer — consider downloading just the resources those
# need, after confirming nothing else in the notebook relies on the rest.
nltk.download('all')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Single lemmatizer instance shared by preproc().
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [None]:
def preproc(text):
    """Normalise raw text before it is handed to the n-gram vectorizers.

    The text is tokenized with ``word_tokenize``; every token is lower-cased,
    English stopwords are dropped, and the remaining tokens are lemmatized
    with the notebook-level ``lemmatizer``.

    Args:
        text: Raw string to clean.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces.
    """
    # Build the stopword lookup once per call. The previous version called
    # stopwords.words('english') and scanned its result for every single
    # token, which is loop-invariant work; a frozenset also gives O(1)
    # membership tests.
    stop_words = frozenset(stopwords.words('english'))

    n_text = []
    for w in word_tokenize(text):
        w = w.lower()
        if w in stop_words:
            continue
        n_text.append(lemmatizer.lemmatize(w))
    return " ".join(n_text)


# print(preproc("It's just a privilege to watch your minds at work."))


In [None]:

def get_word_ngram_features(df):
    """Build word n-gram count features from the ``utterance`` column.

    A ``CountVectorizer`` over 1- to 3-grams (raw counts, ``min_df=3``) is
    fitted on ``df['utterance']`` using ``preproc`` as its preprocessor.

    Args:
        df: DataFrame with an ``utterance`` column of strings.

    Returns:
        A sparse-backed DataFrame with one column per n-gram and one row per
        input row. The row index is NOT aligned to ``df.index``; it is the
        default index produced by ``from_spmatrix``.
    """
    utt_n_gram = CountVectorizer(ngram_range=(1,3), binary=False, min_df=3, preprocessor=preproc)
    output = utt_n_gram.fit_transform(df['utterance'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(utt_n_gram, 'get_feature_names_out'):
        feature_names = utt_n_gram.get_feature_names_out()
    else:
        feature_names = utt_n_gram.get_feature_names()

    ngram_df = pd.DataFrame.sparse.from_spmatrix(output, columns=feature_names)
    return ngram_df


## Speaker features

In [None]:

def get_speaker_features(df):
    """Build binary bag-of-words features from the ``speaker`` column.

    Args:
        df: DataFrame with a ``speaker`` column of strings.

    Returns:
        A sparse-backed DataFrame with one binary column per token found in
        the speaker names and one row per input row. The row index is the
        default one produced by ``from_spmatrix``, not ``df.index``.
    """
    speaker_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)
    output = speaker_feat.fit_transform(df['speaker'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(speaker_feat, 'get_feature_names_out'):
        feature_names = speaker_feat.get_feature_names_out()
    else:
        feature_names = speaker_feat.get_feature_names()

    speakers_df = pd.DataFrame.sparse.from_spmatrix(output, columns=feature_names)
    return speakers_df


## Context Features

In [None]:
def join_context(l):
    """Collapse a list of strings into one space-separated string."""
    separator = " "
    return separator.join(l)

In [None]:
def get_context_features(df):
    """Build word n-gram count features from the conversational context.

    Side effect: adds two columns to ``df`` in place —
    ``joint_context`` (the ``context`` list joined with spaces) and
    ``joint_context_speakers`` (the ``context_speakers`` list joined with
    spaces).

    Args:
        df: DataFrame with ``context`` and ``context_speakers`` columns, each
            holding a list of strings per row.

    Returns:
        A sparse-backed DataFrame of 1- to 3-gram counts (``min_df=3``,
        preprocessed with ``preproc``) over ``joint_context``. The row index
        is the default one produced by ``from_spmatrix``, not ``df.index``.
    """
    df['joint_context'] = df['context'].apply(join_context)
    df['joint_context_speakers'] = df['context_speakers'].apply(join_context)

    context_n_gram = CountVectorizer(ngram_range=(1,3), binary=False, min_df=3, preprocessor=preproc)
    output = context_n_gram.fit_transform(df['joint_context'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(context_n_gram, 'get_feature_names_out'):
        feature_names = context_n_gram.get_feature_names_out()
    else:
        feature_names = context_n_gram.get_feature_names()

    context_df = pd.DataFrame.sparse.from_spmatrix(output, columns=feature_names)
    return context_df

# features_df = pd.concat([features_df, context_df], axis=1)

## Context Speakers


In [None]:
def get_context_speaker_features(df):
    """Build binary bag-of-words features from the context speakers.

    Uses the ``joint_context_speakers`` column when it is already present on
    ``df``; otherwise the space-joined text is derived from the
    ``context_speakers`` list column, so this function no longer fails with a
    ``KeyError`` when ``get_context_features`` has not been run first.
    ``df`` itself is not modified here.

    Args:
        df: DataFrame with either a ``joint_context_speakers`` string column
            or a ``context_speakers`` column holding a list of strings per row.

    Returns:
        A sparse-backed DataFrame with one binary column per speaker token and
        one row per input row. The row index is the default one produced by
        ``from_spmatrix``, not ``df.index``.
    """
    if 'joint_context_speakers' in df.columns:
        joint_speakers = df['joint_context_speakers']
    else:
        joint_speakers = df['context_speakers'].apply(lambda names: " ".join(names))

    context_speaker_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)
    output = context_speaker_feat.fit_transform(joint_speakers)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(context_speaker_feat, 'get_feature_names_out'):
        feature_names = context_speaker_feat.get_feature_names_out()
    else:
        feature_names = context_speaker_feat.get_feature_names()

    context_speakers_df = pd.DataFrame.sparse.from_spmatrix(output, columns=feature_names)
    return context_speakers_df

# features_df = pd.concat([features_df, context_speakers_df], axis=1)

## Show

In [None]:
def get_show_features(df):
    """Build binary bag-of-words features from the ``show`` column.

    Args:
        df: DataFrame with a ``show`` column of strings.

    Returns:
        A sparse-backed DataFrame with one binary column per show token and
        one row per input row. The row index is the default one produced by
        ``from_spmatrix``, not ``df.index``.
    """
    show_feat = CountVectorizer(ngram_range=(1,1), binary=True, min_df=1)
    output = show_feat.fit_transform(df['show'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(show_feat, 'get_feature_names_out'):
        feature_names = show_feat.get_feature_names_out()
    else:
        feature_names = show_feat.get_feature_names()

    shows_df = pd.DataFrame.sparse.from_spmatrix(output, columns=feature_names)
    return shows_df

## Classification

In [None]:
def get_labels(text):
    """Convert a sarcasm flag into an integer class label.

    Returns 1 when ``text`` compares equal to ``True`` and 0 for anything else.
    """
    return 1 if text == True else 0

# Add an integer `label` column (1 / 0) derived from the `sarcasm` column;
# this is the classification target used later in the notebook.
df['label'] = df['sarcasm'].apply(get_labels)
print(df.head())

                                               utterance  ... label
1_60   It's just a privilege to watch your mind at work.  ...     1
1_70   I don't think I'll be able to stop thinking ab...  ...     1
1_80   Since it's not bee season, you can have my epi...  ...     0
1_90   Lois Lane is falling, accelerating at an initi...  ...     0
1_105  I'm just inferring this is a couch because the...  ...     1

[5 rows x 7 columns]


In [84]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, classification_report, recall_score, f1_score, accuracy_score
import pickle

In [None]:
def concat(df1, df2):
    """Append the columns of ``df2`` to ``df1``.

    ``df1`` may be ``None`` (nothing accumulated yet), in which case ``df2``
    is returned unchanged.
    """
    if df1 is not None:
        return pd.concat([df1, df2], axis=1)
    return df2

%cd /content/drive/My Drive/NLP/Project/models

/content/drive/My Drive/NLP/Project/models


In [94]:
# Feature groups to include in this run (utterance n-grams, speaker, context
# n-grams, context speakers, show).
b_utt, b_speaker, b_context, b_context_speakers, b_show = True, True, False, False, False

# Fixed seed so that the fold assignment and the MLP initialisation — and
# therefore the reported scores — are reproducible across runs.
SEED = 42

# NOTE(review): every vectorizer below is fitted on the full dataset before
# the folds are split, so vocabulary / min_df statistics also see the test
# rows of each fold. Fit per fold if a leakage-free estimate is needed.
features_df = None

if(b_utt):
    utt_df = get_word_ngram_features(df)
    features_df = concat(features_df, utt_df)

if(b_speaker):
    speaker_df = get_speaker_features(df)
    features_df = concat(features_df, speaker_df)

if(b_context):
    context_df = get_context_features(df)
    features_df = concat(features_df, context_df)

if(b_context_speakers):
    context_speakers_df = get_context_speaker_features(df)
    features_df = concat(features_df, context_speakers_df)

if(b_show):
    show_df = get_show_features(df)
    features_df = concat(features_df, show_df)

# With every flag off there is nothing to train on; fail with a clear message
# instead of an AttributeError on None further down.
if features_df is None:
    raise ValueError("No feature group selected: enable at least one b_* flag.")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
y = df['label'].to_numpy()
X = features_df.to_numpy()
precisions = []
recalls = []
f1s = []
accuracies = []

# 5-fold stratified cross-validation: train a fresh classifier per fold and
# collect weighted precision / recall / F1 plus accuracy on the held-out part.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # clf = SVC(kernel='rbf', C=1.0)
    clf = MLPClassifier(hidden_layer_sizes=(100, 50, 10, ), max_iter=500, random_state=SEED)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    precision = precision_score(y_test, y_pred, average='weighted')
    precisions.append(precision)
    recall = recall_score(y_test, y_pred, average='weighted')
    recalls.append(recall)
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1s.append(f1)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # if(f1>=0.76):
    #     with open('word_ngram(utt+speaker+context+context_speakers)', 'wb') as f:
    #         pickle.dump(clf, f)
    #         f.close()
    #     print('Pickled')


    print(classification_report(y_test, y_pred))

# Average each metric over the folds.
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1s)
avg_accuracy = np.mean(accuracies)

print(f"Avg accuracy: {avg_accuracy:.3f}")
print(f"Avg weighted precision: {avg_precision:.3f} :: Avg weighted recall: {avg_recall:.3f} :: Avg weighted F1: {avg_f1:.3f}")

              precision    recall  f1-score   support

           0       0.63      0.70      0.66        69
           1       0.66      0.59      0.63        69

    accuracy                           0.64       138
   macro avg       0.65      0.64      0.64       138
weighted avg       0.65      0.64      0.64       138

              precision    recall  f1-score   support

           0       0.74      0.65      0.69        69
           1       0.69      0.77      0.73        69

    accuracy                           0.71       138
   macro avg       0.71      0.71      0.71       138
weighted avg       0.71      0.71      0.71       138

              precision    recall  f1-score   support

           0       0.60      0.57      0.58        69
           1       0.59      0.62      0.61        69

    accuracy                           0.59       138
   macro avg       0.59      0.59      0.59       138
weighted avg       0.59      0.59      0.59       138

              preci