In [0]:
# Mount Google Drive at /content/gdrive; the data paths used in later cells
# all start with "gdrive/". google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import pickle
import os
import nltk
nltk.download('punkt')
import numpy as _np
import pandas as _pd
from math import log as _log
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as _plt
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [0]:
def _get_training_data():
  """Load the pickled training labels from Google Drive.

  Only the labels (y) are read; loading of the matching x pickle
  ("gdrive/My Drive/Data/pickles/reddit_training_x.pickle") is currently
  disabled, so a single object is returned rather than an (x, y) pair.

  Returns:
    Whatever object was pickled into reddit_training_y.pickle.

  Raises:
    OSError: if the pickle file cannot be opened (e.g. Drive is not mounted).
  """
  # SECURITY NOTE: pickle.load can execute arbitrary code; only use this on
  # files you produced yourself.
  # The context manager guarantees the file handle is closed even if
  # unpickling fails.
  with open("gdrive/My Drive/Data/pickles/reddit_training_y.pickle","rb") as pickle_in_y:
    y_train = pickle.load(pickle_in_y)
  return y_train

In [0]:
def _tokenize_sentence(samples):
  _tokens_list = []
  for sentence in samples:
    _tokens_list.append(nltk.word_tokenize(sentence))
  return _tokens_list  

In [0]:
#takes list of tokenized sentences
#returns a list of np.arrays. Each np.array is of size 1x300. The number of np.array equals the number of samples
def _get_embeddings(_tokens):
  print('loading model')
  _filename = 'gdrive/Team Drives/NLP_Text_Summarizer/Data/word2vec/GoogleNews-vectors-negative300.bin'
  _model = KeyedVectors.load_word2vec_format(_filename, binary=True)
  print('model loaded')
  
  _embeddings = []
  for _sentence in _tokens:
    #for every sentence we will initialize a vector of zeros
    _current_vector = _np.zeros(300)
    #It's possible that some words will not have a vector representation. If so we cannot include them in the average
    _excluded_words = 0
    for _word in _sentence:
      try:
        #Add the current word to the vector representing the sentence
        _current_vector = _np.add(_current_vector, _model[_word])
      except KeyError:
        #if the word is not in the model, skip to the next word
        _excluded_words += 1
    #After each word has been vectorized and added together -> take the average
    #_current_vector = _np.divide(_current_vector, max(len(_sentence) - _excluded_words, 1))---->we removed this to test without average
    #append the embedding for this sentence to the list
    _embeddings.append(_current_vector)
  #after all sentences have been vectorized, return the embeddings list
  return _embeddings

In [0]:
# The commented-out lines are the (disabled) embedding pipeline: load texts and
# labels, tokenize, embed, then save the result with _np.save.
#x_train_whole, y_train_whole = _get_training_data()
# x_train_whole = _tokenize_sentence(x_train_whole)
# x_train_whole = _get_embeddings(x_train_whole)
# _np.save('gdrive/My Drive/Data/sarcasm_embeddings/x_train_whole_embeddings_not_averaged', x_train_whole)
# Active path: only the labels are loaded in this cell.
y_train_whole = _get_training_data()

In [0]:
def separate_fold(x_train_whole, y_train_whole, train_idx, test_idx):
  """Slice the full data set into the train/test parts of one fold.

  `x_train_whole` is indexed directly with the index arrays, while
  `y_train_whole` is indexed positionally through `.iloc`.

  Returns:
    (x_train, y_train, x_test, y_test). The train parts are returned exactly
    as sliced; the test parts are rebuilt as numpy arrays from their
    `.tolist()` form.
  """
  fold_x_train = x_train_whole[train_idx]
  fold_x_test = x_train_whole[test_idx]
  fold_y_train = y_train_whole.iloc[train_idx]
  fold_y_test = y_train_whole.iloc[test_idx]
  return (
      fold_x_train,
      fold_y_train,
      _np.array(fold_x_test.tolist()),
      _np.array(fold_y_test.tolist()),
  )

In [0]:
def separate_classes(x_sentences, y_sentences):
  """Partition samples by label.

  Samples whose label equals 1 go to the first group; every other label goes
  to the second group. Samples are looked up in `x_sentences` by the position
  of their label in `y_sentences`.

  Returns:
    (label_1_samples, other_samples), each converted to a numpy array.
  """
  positives, negatives = [], []
  for position, label in enumerate(y_sentences):
    bucket = positives if label == 1 else negatives
    bucket.append(x_sentences[position])
  return _np.array(positives), _np.array(negatives)

In [0]:
def _train_naive_bayes(_sarc, _not_sarc):
  num_sarc = len(_sarc)
  num_not_sarc = len(_not_sarc)
  num_samples =  num_sarc + num_not_sarc
  prior_sarc = _log(num_sarc/num_samples,10)
  prior_not_sarc = _log(num_not_sarc/num_samples,10)
  
  num_features = _np.size(_sarc, 1)
  
  _average_sarc = _sarc.sum(axis=0)/num_sarc
  _variance_sarc = ((_np.power(_sarc - _average_sarc,2)).sum(axis=0))/num_sarc
  _sarc_tup = (_average_sarc, _variance_sarc, prior_sarc)
  
  _average_not_sarc = _not_sarc.sum(axis=0)/num_not_sarc
  _variance_not_sarc = ((_np.power(_not_sarc - _average_not_sarc,2)).sum(axis=0))/num_not_sarc
  _not_sarc_tup = (_average_not_sarc, _variance_not_sarc, prior_not_sarc)
  
  return _sarc_tup, _not_sarc_tup

In [0]:
def _test_naive_bayes(_sarc_tup, _not_sarc_tup, x_test, y_test):
  y_pred_sarc = _np.log10(_np.divide(_np.exp(_np.divide(_np.power(_np.subtract(x_test,_sarc_tup[0]),2),-2*_sarc_tup[1])),_np.sqrt(2*_np.pi*_sarc_tup[1]))).sum(axis=1) + _sarc_tup[2]
  print('y_pred_sarc - divide by zro')
  y_pred_not_sarc = _np.log10(_np.divide(_np.exp(_np.divide(_np.power(_np.subtract(x_test,_not_sarc_tup[0]),2),-2*_not_sarc_tup[1])),_np.sqrt(2*_np.pi*_sarc_tup[1]))).sum(axis=1) + _not_sarc_tup[2]
  
  print(" y pred sarc at loc 44",y_pred_sarc[44])
  print(" y pred not sarc at loc 44",y_pred_not_sarc[44])
  
  acc_list = []
  #for alpha in range(-150,-75):
  for alpha in range(0,1):
  
    how_many_sarc = 0
    how_many_not_sarc = 0

    y_predict = []
    num_right = 0
    for idx,actual_class in enumerate(y_test):
        #if y_pred_sarc[idx] + alpha > y_pred_not_sarc[idx]:
        #if y_pred_sarc[idx] < alpha:
        if y_pred_sarc[idx] > y_pred_not_sarc[idx]:
            how_many_sarc += 1
            y_predict.append(1)
            if actual_class == 1:
                num_right+=1
        else:
            how_many_not_sarc += 1
            y_predict.append(0)
            if actual_class == 0:
                num_right+=1
    
    print("if p(sarc) < ",alpha, end=" acc = ")
    #print(how_many_sarc,'sarcastic predictions','\n',how_many_not_sarc,'not sarcastic predictions')
    acc = num_right/len(y_test)
    print(round(acc*100,3))
    acc_list.append((alpha,acc))
  return acc_list

In [0]:
# Load previously saved sentence embeddings from Drive.
# The commented-out line loads the "_not_averaged" file instead.
#x_train_whole = _np.load('gdrive/My Drive/Data/sarcasm_embeddings/x_train_whole_embeddings_not_averaged.npy')
x_train_whole = _np.load('gdrive/My Drive/Data/sarcasm_embeddings/x_train_whole_embeddings.npy')

In [0]:
# 5-fold stratified cross-validation of an MLP classifier on the embeddings.
# random_state pins the shuffle so fold membership is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# One (fold number, accuracy) pair per fold. This is initialised once, outside
# the loop, so the results of every fold are kept; the pair layout is what the
# later `_plt.scatter(*zip(*acc_list))` cell unpacks into x and y coordinates.
acc_list = []

fold_num = 0
for train_idx, test_idx in skf.split(x_train_whole, y_train_whole):
  print('Fold', str(fold_num+1))
  x_train, y_train, x_test, y_test = separate_fold(x_train_whole, y_train_whole, train_idx, test_idx)

  # Alternative 1 (disabled): hand-rolled Gaussian naive Bayes.
  #   sarc_mat, not_sarc_mat = separate_classes(x_train, y_train)
  #   sarc_tup, not_sarc_tup = _train_naive_bayes(sarc_mat, not_sarc_mat)
  #   accs = _test_naive_bayes(sarc_tup, not_sarc_tup, x_test, y_test)
  #   acc_list.append(accs)
  # Alternative 2 (disabled): scikit-learn baseline.
  #   clf = GaussianNB()

  clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
  clf.fit(x_train, y_train)
  y_test_predict = clf.predict(x_test)

  # Fraction of test samples whose predicted label matches the true label.
  num_right = int(_np.count_nonzero(y_test_predict == y_test))
  acc = num_right/len(y_test)
  acc_list.append((fold_num + 1, acc))
  print(acc)

  fold_num+=1


In [0]:
# Display how many entries acc_list currently holds.
len(acc_list)

In [0]:
# zip(*acc_list) transposes the list, so each entry of acc_list must itself be
# an iterable (e.g. an (x, y) pair); the transposed sequences are passed to
# scatter as positional arguments.
_plt.scatter(*zip(*acc_list))
_plt.show()