# Embedding VS Word2Vec

In [12]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from keras import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('stopwords', download_dir='./nltk')
nltk.download('wordnet', download_dir='./nltk')

[nltk_data] Downloading package stopwords to ./nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# load dataframe, delete article link column
pd.set_option('display.max_colwidth', None)
# there are v1 and v2
file_name = 'Sarcasm_Headlines_Dataset.json'
df = pd.read_json(file_name, lines=True)
del df['article_link']
df.head(5)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'black code' for minority shoppers,0
1,"the 'roseanne' revival catches up to our thorny political mood, for better and worse",0
2,mom starting to fear son's web series closest thing she will have to grandchild,1
3,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1
4,j.k. rowling wishes snape happy birthday in the most magical way,0


In [4]:
# split data into train and test set
train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(train_df.index)

# separate data from labels
x_train, y_train = train_df['headline'], train_df['is_sarcastic']
x_test, y_test = test_df['headline'], test_df['is_sarcastic']

# Preprocessing

In [5]:
# basic preprocessing
# appostophe words
appos = {
        "aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he had", "he'll": "he will", "he's": "he is",
        "I'd": "I had", "I'll": "I will", "I'm": "I am", "I've": "I have", "isn't": "is not", "let's": "let us", "mightn't": "might not",
        "mustn't": "must not", "shan't": "shall not", "she'd": "she had", "she'll": "she will", "she's": "she is", "shouldn't": "should not",
        "what's": "what is", "there's": "there is", "they'd": "they had", "they'll": "they will", "they've": "they have",
        "we'd": "we had", "we're": "we are", "we've": "we have", "weren't": "were not",
}
def transform_appos(headline):
  arr = []
  for row in headline.values:
      for word in row:
          if word in appos.keys():
              row = row.replace(word, appos[word])
  return headline

def to_lowercase(headline):
  headline = headline.apply(lambda x: x.lower())
  return headline

def remove_stop_words(headline):
  stop = stopwords.words('english')
  headline = headline.apply(lambda x: [item for item in x.split() if item not in stop])
  headline = headline.apply(lambda x: ' '.join(map(str, x)))
  return headline

def remove_numbers(headline):
   headline = headline.str.replace('\d+', '')
   return headline

def remove_punctuation(headline):
  headline = headline.str.replace('[^\w\s]','')
  return headline

def remove_whitespaces(headline):
  headline = headline.str.strip()
  return headline
  
def apply_stemming(headline):
   #stemmer = SnowballStemmer("english")
   #stemmer = LancasterStemmer()
   stemmer = PorterStemmer()
   headline = headline.apply(lambda x: [stemmer.stem(y) for y in x.split()])
   return headline

def apply_lemmatization(headline):
   lemmatizer = WordNetLemmatizer()
   headline = headline.apply(lambda x: ' '.join(map(str, x)))
   return headline

def replace_quote(headline):
  headline = headline.str.replace("’", "'")
  return headline

def delete_quotes(headline):
  headline = headline.str.replace("'", "")
  return headline

def preprocess(headlines):
  headlines = to_lowercase(headlines)
  headlines = remove_numbers(headlines)
  headlines = replace_quote(headlines)
  headlines = transform_appos(headlines)
  headlines = remove_punctuation(headlines)
#   df["headline"] = delete_quotes(df['headline'])
  #df['headline'] = remove_stop_words(df['headline'])
  df['headline'] = remove_whitespaces(df['headline'])
  #df['headline'] = apply_stemming(df['headline'])
  #df['headline'] = apply_lemmatization(df['headline'])

  return headlines

In [6]:
# apply preprocessing to train set
x_train = preprocess(x_train)
# apply preprocessing to test set
x_test = preprocess(x_test)



In [7]:
# Gensim preprocessing
import gensim
from gensim.utils import simple_preprocess

sentences = [simple_preprocess(sentence) for sentence in x_train]
print(x_train.head())

10646    breaking israelites in sinai suddenly achieve freedom from pharaoh  good times forecast
1971                          classmatescom employees dont have heart to tell ceo about facebook
19747                             woman unaware shes only person on acid at james taylor concert
14554                          donald trump has a new conspiracy theory this one involves google
19329                                            inside americas love affair with neurotic jewry
Name: headline, dtype: object


In [9]:
# preprocessing - tokenization
vocab_size = len(gensim.models.Word2Vec(sentences).wv.vocab)
max_length = 100
trunc_type='post'
padding_type='post'
oov_token = "<OOV>"


tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

# Need this block to get it to work with TensorFlow 2.x
train_padded = np.array(train_padded)
test_padded = np.array(test_padded)
y_train = np.array(y_train)

# Create models

In [23]:
def model_builder(use_gensim_embedding: bool):
    model = Sequential()

    # embedding layer
    embedding_dimension = 300
    if use_gensim_embedding is True:
        word2vec = gensim.models.Word2Vec(sentences=sentences, size=embedding_dimension, window=5)
        embedding_layer = word2vec.wv.get_keras_embedding(train_embeddings=True)
        model.add(embedding_layer)
    else:
        model.add(Embedding(vocab_size, output_dim=embedding_dimension, input_length=max_length))
    
    model.add(GlobalAveragePooling1D())
    model.add(Dense(84, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile('adam', 'binary_crossentropy', ['acc'])
    return model

In [13]:
# callbacks
stop_early = EarlyStopping(monitor='val_loss', patience=5)

save_best_weights_w2v = ModelCheckpoint('checkpoints/word2vec', 
                                              monitor='val_loss',
                                              save_best_only=True,
                                              save_weights_only=True)

save_best_weights_embedding = ModelCheckpoint('checkpoints/embedding', 
                                              monitor='val_loss',
                                              save_best_only=True,
                                              save_weights_only=True)

## Train model with simple embedding

In [20]:
model_with_simple_embedding = model_builder(use_gensim_embedding=False)
model_with_simple_embedding.fit(x=train_padded,
                                y=y_train,
                                batch_size=64,
                                epochs=20,
                                validation_split=0.2,
                                callbacks=[stop_early, save_best_weights_embedding])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x7fe6fbfc02d0>

## Train model with word2vec embedding

In [24]:
model_with_w2v = model_builder(use_gensim_embedding=True)
model_with_w2v.fit(x=train_padded,
                   y=y_train,
                   batch_size=64,
                   epochs=20,
                   validation_split=0.2,
                   callbacks=[stop_early, save_best_weights_w2v])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.callbacks.History at 0x7fe6fbd0ead0>

# Load best weights and compare models

In [25]:
model_with_simple_embedding = model_builder(use_gensim_embedding=False)
model_with_simple_embedding.load_weights('checkpoints/embedding')

model_with_w2v = model_builder(use_gensim_embedding=True)
model_with_w2v.load_weights('checkpoints/word2vec')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe6fbbf50d0>

In [27]:
# loss and accuracy
loss, acc = model_with_simple_embedding.evaluate(test_padded, y_test, batch_size=64, verbose=0)
print(f'*Simple Embedding* \nLoss is: {loss} \nAcc is: {acc} \n')

loss, acc = model_with_w2v.evaluate(test_padded, y_test, batch_size=64, verbose=0)
print(f'*Word2Vec* \nLoss is: {loss} \nAcc is: {acc} \n')



*Simple Embedding* 
Loss is: 0.3550705015659332 
Acc is: 0.8519281148910522 

*Word2Vec* 
Loss is: 0.35363292694091797 
Acc is: 0.850243330001831 



In [28]:
# compare models with f1 score
y_pred_simple_embedding = (model_with_simple_embedding.predict(test_padded) > 0.5).astype("int32")
print('Simple Embedding F1')
print(classification_report(y_test, y_pred_simple_embedding))

y_pred_w2v = (model_with_w2v.predict(test_padded) > 0.5).astype("int32")
print('Word2Vec Embedding F1')
print(classification_report(y_test, y_pred_w2v))

Simple Embedding F1
              precision    recall  f1-score   support

           0       0.88      0.85      0.87      2982
           1       0.82      0.85      0.84      2360

    accuracy                           0.85      5342
   macro avg       0.85      0.85      0.85      5342
weighted avg       0.85      0.85      0.85      5342

Word2Vec Embedding F1
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      2982
           1       0.83      0.83      0.83      2360

    accuracy                           0.85      5342
   macro avg       0.85      0.85      0.85      5342
weighted avg       0.85      0.85      0.85      5342

