In [1]:
# Mount Google Drive into the Colab filesystem; the data directory entered in
# the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the MUStARD data directory so that relative paths such as
# 'sarcasm_data.json' resolve against it.
%cd /content/drive/MyDrive/NLP_Project/MUStARD/data

/content/drive/MyDrive/NLP_Project/MUStARD/data


In [3]:
!pip install sent2vec

Collecting sent2vec
  Downloading https://files.pythonhosted.org/packages/4e/c6/1f57065edbc772d9529e4a5f75cb812f29bcc2bf59b8e4c34c8ecfd83fe3/sent2vec-0.2.0-py3-none-any.whl
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 5.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 36.1MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 42.0MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading htt

## Data Load

In [4]:
import json

# Load the sarcasm annotations from the current working directory. The context
# manager closes the file handle deterministically, and the encoding is stated
# explicitly rather than left to the platform default.
with open('sarcasm_data.json', 'r', encoding='utf-8') as data_file:
    data = json.load(data_file)

# Number of top-level entries in the JSON object.
print(len(data.keys()))

690


In [5]:
import pandas as pd
import numpy as np

# from_dict() turns each top-level key of `data` into a column; transposing
# makes every key a row and every field (utterance, speaker, ...) a column.
df = pd.DataFrame.from_dict(data).T

# Sanity check: show the first example in full, then the first few rows.
first_example = df.iloc[0]
print(first_example)
print(df.head())

utterance           It's just a privilege to watch your mind at work.
speaker                                                       SHELDON
context             [I never would have identified the fingerprint...
context_speakers                                   [LEONARD, SHELDON]
show                                                              BBT
sarcasm                                                          True
Name: 1_60, dtype: object
                                               utterance  speaker  ... show sarcasm
1_60   It's just a privilege to watch your mind at work.  SHELDON  ...  BBT    True
1_70   I don't think I'll be able to stop thinking ab...    PENNY  ...  BBT    True
1_80   Since it's not bee season, you can have my epi...  SHELDON  ...  BBT   False
1_90   Lois Lane is falling, accelerating at an initi...  SHELDON  ...  BBT   False
1_105  I'm just inferring this is a couch because the...  SHELDON  ...  BBT    True

[5 rows x 6 columns]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import pickle
import nltk
# Fetch the NLTK resources used by the preprocessing cell: the punkt tokenizer
# models, the WordNet corpus and the stop-word lists.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# Built once at cell execution. Calling stopwords.words('english') for every
# token repeats the lookup and tests membership against a list; a frozenset is
# created a single time and gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preproc(text):
    """Normalise a piece of text for the n-gram vectorizers.

    The text is tokenized with ``word_tokenize``; every token is lower-cased,
    dropped if it is an English stop word, and otherwise lemmatized.

    Args:
        text: The raw string to preprocess.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in _ENGLISH_STOPWORDS:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

In [8]:
# Bag of 1- to 3-grams over the target utterances. Every utterance goes through
# `preproc` first, and an n-gram must appear in at least 3 utterances to be kept.
utt_n_gram = CountVectorizer(
    preprocessor=preproc,
    ngram_range=(1, 3),
    min_df=3,
    binary=False,
)

# Learn the vocabulary and build the count matrix in one pass.
output = utt_n_gram.fit_transform(df['utterance'])

# The frame keeps its default positional index; it is not re-indexed to df.index.
features_df = pd.DataFrame.sparse.from_spmatrix(
    output,
    columns=utt_n_gram.get_feature_names(),
)
print(features_df.head())
print(utt_n_gram.get_feature_names())

   able  actually  ago  ah  almost  alone  ...  world  would  wow  yeah  year  yes
0     0         0    0   0       0      0  ...      0      0    0     0     0    0
1     1         0    0   0       0      0  ...      0      0    0     0     0    0
2     0         0    0   0       0      0  ...      0      0    0     0     0    0
3     0         0    0   0       0      0  ...      0      0    0     0     0    0
4     0         0    0   0       0      0  ...      0      0    0     0     0    0

[5 rows x 331 columns]
['able', 'actually', 'ago', 'ah', 'almost', 'alone', 'always', 'amy', 'anymore', 'anything', 'anyway', 'apparently', 'around', 'ask', 'asked', 'back', 'bad', 'beer', 'behind', 'believe', 'bernadette', 'better', 'big', 'bit', 'blanche', 'body', 'book', 'book store', 'box', 'boy', 'buy', 'ca', 'call', 'called', 'came', 'car', 'care', 'chandler', 'character', 'coffee', 'come', 'comic', 'comic book', 'comic book store', 'coming', 'contest', 'cool', 'corner', 'couch', 'could', '

## Speaker Features

In [9]:
# Binary unigram indicators for the speaker of each target utterance.
speaker_feat = CountVectorizer(
    min_df=1,
    binary=True,
    ngram_range=(1, 1),
)

# Learn the speaker vocabulary and encode the column in one pass.
output = speaker_feat.fit_transform(df['speaker'])

# Default positional index is kept; the frame is not re-indexed to df.index.
speakers_df = pd.DataFrame.sparse.from_spmatrix(
    output,
    columns=speaker_feat.get_feature_names(),
)
print(speakers_df.head())
print(speaker_feat.get_feature_names())

# Append the speaker indicators as extra columns of the running feature table.
features_df = pd.concat([features_df, speakers_df], axis=1)

   amy  bernadette  boy  chandler  dorothy  ...  rachel  raj  rose  ross  sheldon
0    0           0    0         0        0  ...       0    0     0     0        1
1    0           0    0         0        0  ...       0    0     0     0        0
2    0           0    0         0        0  ...       0    0     0     0        1
3    0           0    0         0        0  ...       0    0     0     0        1
4    0           0    0         0        0  ...       0    0     0     0        1

[5 rows x 22 columns]
['amy', 'bernadette', 'boy', 'chandler', 'dorothy', 'girl', 'howard', 'joey', 'leonard', 'member', 'moderator', 'monica', 'penny', 'person', 'person1', 'person3', 'phoebe', 'rachel', 'raj', 'rose', 'ross', 'sheldon']


## Context Features

In [10]:
# Peek at the first example to see how the context fields are stored.
first_row = df.iloc[0]
print(type(first_row['context']))
print(first_row['context'])
print(first_row['context_speakers'])

def join_context(l):
    """Return the strings in ``l`` concatenated with single spaces between them."""
    separator = " "
    return separator.join(l)

# Flatten each list-valued context column into a single space-joined string
# column so it can be fed to a CountVectorizer.
for source_col, joined_col in (
    ('context', 'joint_context'),
    ('context_speakers', 'joint_context_speakers'),
):
    df[joined_col] = df[source_col].apply(join_context)

# Show the joined values for the first example.
first_row = df.iloc[0]
print(first_row['joint_context'])
print(first_row['joint_context_speakers'])

<class 'list'>
['I never would have identified the fingerprints of string theory in the aftermath of the Big Bang.', "My apologies. What's your plan?"]
['LEONARD', 'SHELDON']
I never would have identified the fingerprints of string theory in the aftermath of the Big Bang. My apologies. What's your plan?
LEONARD SHELDON


In [11]:
# Bag of 1- to 3-grams over the joined context text, preprocessed with `preproc`;
# an n-gram must appear in at least 3 contexts to be kept.
context_n_gram = CountVectorizer(
    preprocessor=preproc,
    ngram_range=(1, 3),
    min_df=3,
    binary=False,
)

# Learn the vocabulary and build the count matrix in one pass.
output = context_n_gram.fit_transform(df['joint_context'])

# Default positional index is kept; the frame is not re-indexed to df.index.
context_df = pd.DataFrame.sparse.from_spmatrix(
    output,
    columns=context_n_gram.get_feature_names(),
)
print(context_df.head())
print(context_n_gram.get_feature_names())

# Append the context n-gram counts as extra columns of the running feature table.
features_df = pd.concat([features_df, context_df], axis=1)

   00  20  able  account  actually  ...  yes  yes know  yet  you  you you
0   0   0     0        0         0  ...    0         0    0    0        0
1   0   0     0        0         0  ...    0         0    0    0        0
2   0   0     0        0         0  ...    0         0    0    0        0
3   0   0     0        0         0  ...    1         1    0    0        0
4   0   0     0        0         0  ...    0         0    0    0        0

[5 rows x 799 columns]
['00', '20', 'able', 'account', 'actually', 'ago', 'ah', 'almost', 'alone', 'alone know', 'along', 'already', 'alright', 'also', 'although', 'always', 'amazing', 'amazon', 'amy', 'another', 'answer', 'anybody', 'anymore', 'anyone', 'anything', 'anyway', 'apartment', 'apologize', 'approach', 'aquaman', 'arm', 'around', 'ask', 'asked', 'attack', 'aw', 'away', 'baby', 'back', 'bad', 'bag', 'ball', 'bar', 'barely', 'bath', 'bathroom', 'batman', 'bear', 'beautiful', 'bed', 'beer', 'believe', 'bernadette', 'besides', 'best', 'bet', 

In [12]:
# Binary unigram indicators for every speaker that appears in the context.
context_speaker_feat = CountVectorizer(
    min_df=1,
    binary=True,
    ngram_range=(1, 1),
)

# Learn the speaker vocabulary and encode the joined speaker strings in one pass.
output = context_speaker_feat.fit_transform(df['joint_context_speakers'])

# Default positional index is kept; the frame is not re-indexed to df.index.
context_speakers_df = pd.DataFrame.sparse.from_spmatrix(
    output,
    columns=context_speaker_feat.get_feature_names(),
)
print(context_speakers_df.head())
print(context_speaker_feat.get_feature_names())

# Append the context-speaker indicators as extra columns of the feature table.
features_df = pd.concat([features_df, context_speakers_df], axis=1)

   amy  bernadette  blanche  boy  chandler  ...  rose  ross  scott  sheldon  sophia
0    0           0        0    0         0  ...     0     0      0        1       0
1    0           0        0    0         0  ...     0     0      0        0       0
2    0           0        0    0         0  ...     0     0      0        0       0
3    0           0        0    0         0  ...     0     0      0        1       0
4    0           0        0    0         0  ...     0     0      0        1       0

[5 rows x 25 columns]
['amy', 'bernadette', 'blanche', 'boy', 'chandler', 'dorothy', 'girl', 'howard', 'joey', 'leonard', 'member', 'moderator', 'monica', 'penny', 'person', 'person1', 'person2', 'phoebe', 'rachel', 'raj', 'rose', 'ross', 'scott', 'sheldon', 'sophia']


## Classification

In [13]:
def get_labels(text):
    """Map a value to an integer class label: 1 if it equals ``True``, else 0."""
    return 1 if text == True else 0

# Encode the `sarcasm` column as the integer 0/1 target used for training.
labels = df['sarcasm'].apply(get_labels)
df['label'] = labels

# Preview the annotated examples and then the assembled feature table.
for frame in (df, features_df):
    print(frame.head())

                                               utterance  ... label
1_60   It's just a privilege to watch your mind at work.  ...     1
1_70   I don't think I'll be able to stop thinking ab...  ...     1
1_80   Since it's not bee season, you can have my epi...  ...     0
1_90   Lois Lane is falling, accelerating at an initi...  ...     0
1_105  I'm just inferring this is a couch because the...  ...     1

[5 rows x 9 columns]
   able  actually  ago  ah  almost  ...  rose  ross  scott  sheldon  sophia
0     0         0    0   0       0  ...     0     0      0        1       0
1     1         0    0   0       0  ...     0     0      0        0       0
2     0         0    0   0       0  ...     0     0      0        0       0
3     0         0    0   0       0  ...     0     0      0        1       0
4     0         0    0   0       0  ...     0     0      0        1       0

[5 rows x 1177 columns]


## Vectorization

In [14]:
from sent2vec.vectorizer import Vectorizer

# Run the sent2vec vectorizer's `bert` method over the target utterances and
# keep what it stores in `.vectors` (presumably one vector per utterance, in
# row order — TODO confirm against the sent2vec documentation).
vectorizer = Vectorizer()
vectorizer.bert(df['utterance'])
vectors = vectorizer.vectors

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [19]:
# Wrap the utterance vectors in a frame and append them column-wise to the
# n-gram/speaker features; both frames use the default positional index.
v = pd.DataFrame(vectors)
features_df1 = pd.concat((features_df, v), axis='columns')

## Prediction

In [30]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Stratified k-fold cross-validation of an RBF-kernel SVM on `features_df1`
# (n-gram and speaker features plus the utterance vectors).
N_SPLITS = 5
SEED = 42  # fixes the shuffled fold assignment so the scores are reproducible

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y = df['label'].to_numpy()
X = features_df1.to_numpy()
results = []
for count, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = SVC(kernel='rbf', C=1.0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Weighted precision / recall / F1 (and support) for this fold.
    results.append(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
    if count == N_SPLITS:
        # Persist the classifier fitted on the last fold. The context manager
        # makes sure the file is flushed and closed.
        filename = 'Vectorize_utterance_model.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
    print(classification_report(y_test, y_pred))

# Average the first three entries (precision, recall, F1) over all folds.
# Done without unpacking into `y`, so the label array above is not overwritten.
avg = [sum(metric) / len(results) for metric in zip(*(fold[:3] for fold in results))]

print(f"Avg weighted precision: {avg[0]:.3f} :: Avg weighted recall: {avg[1]:.3f} :: Avg weighted F1: {avg[2]:.3f}")

              precision    recall  f1-score   support

           0       0.70      0.62      0.66        69
           1       0.66      0.74      0.70        69

    accuracy                           0.68       138
   macro avg       0.68      0.68      0.68       138
weighted avg       0.68      0.68      0.68       138

              precision    recall  f1-score   support

           0       0.73      0.67      0.70        69
           1       0.69      0.75      0.72        69

    accuracy                           0.71       138
   macro avg       0.71      0.71      0.71       138
weighted avg       0.71      0.71      0.71       138

              precision    recall  f1-score   support

           0       0.74      0.65      0.69        69
           1       0.69      0.77      0.73        69

    accuracy                           0.71       138
   macro avg       0.71      0.71      0.71       138
weighted avg       0.71      0.71      0.71       138

              preci

In [31]:
from sent2vec.vectorizer import Vectorizer

# Same procedure as for the utterances, applied to the joined context text:
# call the vectorizer's `bert` method and keep what it stores in `.vectors`
# (presumably one vector per example, in row order — TODO confirm).
vectorizer1 = Vectorizer()
vectorizer1.bert(df['joint_context'])
vectors1 = vectorizer1.vectors

In [32]:
# Place the utterance vectors and the context vectors side by side, so each
# row holds both representations of the same example.
final_vec = np.concatenate((vectors, vectors1), axis=1)

In [33]:
# Wrap the combined utterance+context vectors in a frame and append them
# column-wise to the n-gram/speaker features (both use the positional index).
final_v = pd.DataFrame(final_vec)
features_df2 = pd.concat((features_df, final_v), axis='columns')


In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Stratified k-fold cross-validation of an RBF-kernel SVM on `features_df2`
# (n-gram and speaker features plus the utterance and context vectors).
N_SPLITS = 5
SEED = 42  # fixes the shuffled fold assignment so the scores are reproducible

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y = df['label'].to_numpy()
X = features_df2.to_numpy()
results = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = SVC(kernel='rbf', C=1.0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # Weighted precision / recall / F1 (and support) for this fold.
    results.append(precision_recall_fscore_support(y_test, y_pred, average='weighted'))
    print(classification_report(y_test, y_pred))

# Average the first three entries (precision, recall, F1) over all folds.
# Done without unpacking into `y`, so the label array above is not overwritten.
avg = [sum(metric) / len(results) for metric in zip(*(fold[:3] for fold in results))]

print(f"Avg weighted precision: {avg[0]:.3f} :: Avg weighted recall: {avg[1]:.3f} :: Avg weighted F1: {avg[2]:.3f}")

              precision    recall  f1-score   support

           0       0.63      0.78      0.70        69
           1       0.71      0.54      0.61        69

    accuracy                           0.66       138
   macro avg       0.67      0.66      0.65       138
weighted avg       0.67      0.66      0.65       138

              precision    recall  f1-score   support

           0       0.61      0.83      0.70        69
           1       0.73      0.46      0.57        69

    accuracy                           0.64       138
   macro avg       0.67      0.64      0.63       138
weighted avg       0.67      0.64      0.63       138

              precision    recall  f1-score   support

           0       0.65      0.71      0.68        69
           1       0.68      0.62      0.65        69

    accuracy                           0.67       138
   macro avg       0.67      0.67      0.67       138
weighted avg       0.67      0.67      0.67       138

              preci