In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive/', force_remount=True)
# %cd gdrive/MyDrive/BT4222 Project Group/Codes/Data/Final Data - Test train

Mounted at /content/gdrive/
/content/gdrive/MyDrive/BT4222 Project Group/Codes/Data/Final Data - Test train


In [None]:
!pip install testfixtures
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# imports

import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
from tqdm import tqdm, trange
from sklearn import utils
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import torch
from transformers import AutoTokenizer, AutoModel
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# common functions

def tokenize(preprocessed_text):
  """Break an already-preprocessed post into its whitespace-separated tokens."""
  tokens = preprocessed_text.split()
  return tokens

def avg_word2vec(model, model_vocabs, tokenized_posts, size):
  """Embed every post as the mean of the vectors of its in-vocabulary tokens.

  Args:
    model: either an object exposing token vectors through a ``wv`` attribute
      (e.g. a trained Word2Vec model) or an object that can be indexed by token
      directly (e.g. KeyedVectors loaded from a pretrained file).
    model_vocabs: container of the tokens known to ``model``; tokens that are
      not in it are skipped.
    tokenized_posts: iterable of posts, each post being an iterable of tokens.
    size: dimensionality of the word vectors; used for the all-zero fallback.

  Returns:
    2-D array with one row per post. A post without any known token is
    represented by a zero vector of length ``size``. An empty input yields an
    array of shape ``(0, size)``.
  """
  # Full models keep their vectors under `.wv`; bare vector stores are indexed directly.
  vectors = getattr(model, 'wv', model)
  list_of_avg_wv = []
  for post in tokenized_posts:
    wvs = [vectors[token] for token in post if token in model_vocabs]
    if wvs:
      # wvs is a plain Python list, so it has no .mean(); average with numpy instead.
      list_of_avg_wv.append(np.mean(wvs, axis=0))
    else:
      list_of_avg_wv.append(np.zeros(size, dtype=float))
  if not list_of_avg_wv:
    # keep the result 2-D even when there are no posts
    return np.empty((0, size), dtype=float)
  return np.array(list_of_avg_wv)

def tagged_document(post, tags=None):
  """Wrap a tokenized post in a gensim TaggedDocument.

  Args:
    post: list of tokens that make up the document.
    tags: optional iterable of tags for the document; defaults to no tags.

  Returns:
    TaggedDocument with ``words=post`` and the given tags as a list.
  """
  # TaggedDocument has two fields (words, tags); building it with words alone fails,
  # so an empty tag list is supplied when the caller gives none.
  return TaggedDocument(words=post, tags=[] if tags is None else list(tags))

def avg_doc2vec(model, tagged_docs):
  """Infer one document vector per tagged document.

  Despite the name, nothing is averaged here: each document is passed to the
  model's ``infer_vector`` and the resulting vectors are stacked.

  Args:
    model: object exposing ``infer_vector(words, steps=...)``.
    tagged_docs: object whose ``.values`` yields items with a ``words``
      attribute (in this notebook, a pandas Series of TaggedDocument).

  Returns:
    Array holding the inferred vectors in input order; empty when there are
    no documents.
  """
  # Only the words are needed for inference, so tags are not touched; this also
  # lets untagged documents and an empty collection through without raising.
  regressors = [model.infer_vector(doc.words, steps=20) for doc in tagged_docs.values]
  return np.array(regressors)

def bert_dist_embed(posts, tokenizer, model, batch_size=100):
  """Embed posts with a transformer model, one vector per post.

  Each post is represented by the last-layer hidden state at sequence
  position 0 of its tokenized input.

  Args:
    posts: sliceable sequence of raw (untokenized) post strings.
    tokenizer: callable tokenizer returning a mapping of PyTorch tensors when
      called with ``return_tensors="pt"``.
    model: torch module whose output exposes ``last_hidden_state``.
    batch_size: number of posts sent through the model at once.

  Returns:
    Float array with one row per post. With no posts, an array of shape
    ``(0, 768)`` is returned, as before.
  """
  # Run on whatever device the model already lives on; the model itself is not moved.
  device = next(model.parameters()).device
  batches = []
  for batch_no in trange(0, len(posts), batch_size):
    tokenized = tokenizer(list(posts[batch_no:batch_no+batch_size]),
                          padding = True,
                          truncation = True,
                          return_tensors = "pt")
    # inputs must sit on the same device as the model weights
    tokenized = {name: tensor.to(device) for name, tensor in tokenized.items()}
    with torch.no_grad():
      hidden = model(**tokenized)

    batches.append(hidden.last_hidden_state[:,0,:].cpu().detach().numpy())
  if not batches:
    return np.empty(shape=(0, 768))
  # Join once at the end instead of re-copying the growing array on every batch;
  # the cast keeps the float64 dtype the previous np.append-based version produced.
  return np.concatenate(batches, axis=0).astype(float)

In [None]:
# tokenized posts for training and testing

# Read the 'processed_str' column of each feature CSV and split every post into tokens,
# giving one token list per post.
# NOTE(review): these relative paths depend on the notebook's working directory, and the
# drive-mount / %cd cell at the top is commented out — confirm the cwd before running.
train_posts = pd.read_csv('../Data/X_train.csv')['processed_str'].apply(tokenize).to_numpy()
test_posts = pd.read_csv('../Data/X_test.csv')['processed_str'].apply(tokenize).to_numpy()
# Class labels come from the 'class' column of the label CSVs.
# presumably row-aligned with the posts above — verify against how the split was produced.
train_labels = pd.read_csv('../Data/y_train.csv')['class'].to_numpy()
test_labels = pd.read_csv('../Data/y_test.csv')['class'].to_numpy()

In [None]:
# Move the working directory two levels up; relative paths in later cells resolve from there.
%cd ../..

/content/gdrive/MyDrive/BT4222 Project Group/Codes


### 1. Trained Word2Vec
*   taking the mean of the word vectors of all in-vocabulary words in a post (posts with no known word get a zero vector)
*   variations: Skip-Gram, CBOW

In [None]:
# implement skip and cbow model
# Both models are fitted on the tokenized training posts only.
# sg=1 selects skip-gram and sg=0 selects CBOW; min_count=1 keeps even tokens that occur once.
# NOTE(review): no seed is passed, so the learned vectors will presumably differ between runs — confirm whether that matters downstream.
skip_gram_model = Word2Vec(train_posts, sg = 1, min_count=1)
cbow_model = Word2Vec(train_posts, sg = 0, min_count=1)

# Printing a model shows its vocabulary size, vector size and learning rate.
print(skip_gram_model)
print(cbow_model)

Word2Vec(vocab=53100, size=100, alpha=0.025)
Word2Vec(vocab=53100, size=100, alpha=0.025)


In [None]:
# word embedding using skip gram model
# The vocabulary is turned into a set so the per-token membership test in avg_word2vec is cheap.
# 100 is the vector size the models report when printed.
# NOTE(review): `index2word` is the gensim 3.x attribute name — confirm the installed gensim version.
skipgram_wordpool = set(skip_gram_model.wv.index2word)
X_train_sg = avg_word2vec(skip_gram_model, skipgram_wordpool, train_posts, 100)
X_test_sg = avg_word2vec(skip_gram_model, skipgram_wordpool, test_posts, 100)


# word embedding using cbow model
# Same procedure as above, using the CBOW model and its own vocabulary.
cbow_wordpool = set(cbow_model.wv.index2word)
X_train_cbow = avg_word2vec(cbow_model, cbow_wordpool, train_posts, 100)
X_test_cbow = avg_word2vec(cbow_model, cbow_wordpool, test_posts, 100)

In [None]:
# Sanity check: each matrix should have one row per post and 100 columns.
print(X_train_sg.shape, X_test_sg.shape, X_train_cbow.shape, X_test_cbow.shape)

(110248, 100) (27562, 100) (110248, 100) (27562, 100)


In [None]:
# export to csv
# One file per model and split; index=False keeps the DataFrame row index out of the file.
# The paths are relative to the current working directory.
pd.DataFrame(X_train_sg).to_csv('../Word Embedding/emb_sg_train.csv', index=False)
pd.DataFrame(X_test_sg).to_csv('../Word Embedding/emb_sg_test.csv', index=False)
pd.DataFrame(X_train_cbow).to_csv('../Word Embedding/emb_cbow_train.csv', index=False)
pd.DataFrame(X_test_cbow).to_csv('../Word Embedding/emb_cbow_test.csv', index=False)

### 2. Pretrained Google's Word2Vec

In [None]:
# implement google's word2vec
# The pretrained vectors are loaded from a binary word2vec-format file; nothing is trained here.
filename = '../Pretrained Embedding Models/GoogleNews-vectors-negative300.bin'
google_model = KeyedVectors.load_word2vec_format(filename, binary=True) # each word will be represented as a vector of 300 numbers

# Confirm the dimensionality of the loaded vectors.
print(google_model.vector_size)

300


In [None]:
# word embedding using google's word2vec model
# The vocabulary set is built once and reused for both splits; 300 matches google_model.vector_size.
# NOTE(review): `index2word` is the gensim 3.x attribute name — confirm the installed gensim version.
google_wordpool = set(google_model.index2word)
X_train_ggl = avg_word2vec(google_model, google_wordpool, train_posts, 300)
X_test_ggl = avg_word2vec(google_model, google_wordpool, test_posts, 300)

  import sys


In [None]:
# Sanity check: each matrix should have one row per post and 300 columns.
print(X_train_ggl.shape, X_test_ggl.shape)

(110248, 300) (27562, 300)


In [None]:
# export to csv
# One file per split; index=False keeps the DataFrame row index out of the file.
pd.DataFrame(X_train_ggl).to_csv('../Word Embedding/emb_ggl_train.csv', index=False)
# Path fixed: it previously started with '.../' (three dots), which is not the parent
# directory, so the test file did not land next to the train file.
pd.DataFrame(X_test_ggl).to_csv('../Word Embedding/emb_ggl_test.csv', index=False)

### 3. Stanford's GloVe

In [None]:
# implement glove
# The GloVe text file is first converted to word2vec text format (written to
# word2vec_output_file) and then loaded from that converted file.
glove_input_file_6B = '../Pretrained Embedding Models/glove.6B.100d.txt'
word2vec_output_file = '../Pretrained Embedding Models/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file_6B, word2vec_output_file)
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Confirm the dimensionality of the loaded vectors.
print(glove_model.vector_size)

100


In [None]:
# word embedding using glove
# The vocabulary set is built once and reused for both splits; 100 matches glove_model.vector_size.
# NOTE(review): `index2word` is the gensim 3.x attribute name — confirm the installed gensim version.
glove_wordpool = set(glove_model.index2word)
X_train_glove = avg_word2vec(glove_model, glove_wordpool, train_posts, 100)
X_test_glove = avg_word2vec(glove_model, glove_wordpool, test_posts, 100)

  import sys


In [None]:
# Sanity check: each matrix should have one row per post and 100 columns.
print(X_train_glove.shape, X_test_glove.shape)

(110248, 100) (27562, 100)


In [None]:
# export to csv
# One file per split; index=False keeps the DataFrame row index out of the file.
pd.DataFrame(X_train_glove).to_csv('../Word Embedding/emb_glove_train.csv', index=False)
pd.DataFrame(X_test_glove).to_csv('../Word Embedding/emb_glove_test.csv', index=False)

### 4. Trained Doc2Vec
*   Doc2Vec is an NLP tool for representing documents as vectors and is a generalization of the Word2Vec method



In [None]:
# represent each post as a TaggedDocument with 2 fields: words=tokenized post and tags=[class label]
# Posts and labels are paired row by row in a DataFrame first, then each row is converted.
# Because the tag is the class label, all posts of the same class share one tag.
train = pd.DataFrame({'label': train_labels, 'post': list(train_posts)}, columns=['label', 'post'])
test = pd.DataFrame({'label': test_labels, 'post': list(test_posts)}, columns=['label', 'post'])
train_tagged = train.apply(lambda r: TaggedDocument(words=r['post'], tags=[r['label']]), axis=1)
test_tagged = test.apply(lambda r: TaggedDocument(words=r['post'], tags=[r['label']]), axis=1)

# visualize a TaggedDocument
train_tagged[0]


TaggedDocument(words=['donna', 'dress', 'cut', 'waldo', 'next', 'halloween', 'maybe', 'time', 'hell', 'able', 'find', 'gilt'], tags=[0])

In [None]:
# use multiple cores
cores = multiprocessing.cpu_count()

# Number of passes over the training corpus for both Doc2Vec models.
DOC2VEC_EPOCHS = 30

# Materialise and shuffle the tagged training documents once; build_vocab() and train()
# both iterate over this list.
train_corpus = utils.shuffle(list(train_tagged.values))

# implement Distributed Bag of Words (DBOW) (similar concept to skip-gram)
# A single train() call lets gensim decay the learning rate from `alpha` to `min_alpha`
# across all epochs. The earlier manual loop subtracted 0.002 per epoch from the default
# alpha of 0.025, which made the learning rate negative from the 14th epoch onward.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab(train_corpus)
model_dbow.train(train_corpus, total_examples=model_dbow.corpus_count, epochs=DOC2VEC_EPOCHS)

# implement Distributed Memory (DM) (similar concept to CBOW)
# alpha still starts at 0.065; min_alpha=0.005 is the value the former 30 x 0.002 manual
# decay ended on, so the schedule covers the same range.
model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.005)
model_dm.build_vocab(train_corpus)
model_dm.train(train_corpus, total_examples=model_dm.corpus_count, epochs=DOC2VEC_EPOCHS)

100%|██████████| 110248/110248 [00:00<00:00, 3230973.02it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2914071.62it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2619788.49it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2806878.76it/s]
100%|██████████| 110248/110248 [00:00<00:00, 3049799.35it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2983448.46it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2895931.33it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2901382.43it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2386971.30it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2987361.12it/s]
100%|██████████| 110248/110248 [00:00<00:00, 3049095.50it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2909432.90it/s]
100%|██████████| 110248/110248 [00:00<00:00, 3414839.25it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2632840.23it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2755233.70it/s]
100%|██████████| 110248/110248 [00:00<00:00, 2922174.30it/s]
100%|██████████| 110248/

In [None]:
# word embedding using dbow
# Vectors are inferred for both splits with the model trained on the training documents.
X_train_dbow = avg_doc2vec(model_dbow, train_tagged)
X_test_dbow = avg_doc2vec(model_dbow, test_tagged)

# word embedding using dm
X_train_dm = avg_doc2vec(model_dm, train_tagged)
X_test_dm = avg_doc2vec(model_dm, test_tagged)

# word embedding by combining a paragraph vector from DBOW and DM to improve performance
# Training-only state is dropped from both models first, while doctag vectors and the
# ability to infer new vectors are kept (see the keyword arguments).
# NOTE(review): ConcatenatedDoc2Vec is imported from gensim's test package — confirm it is
# available in the installed gensim version.
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dbow_dm = ConcatenatedDoc2Vec([model_dbow, model_dm])
X_train_dbow_dm = avg_doc2vec(model_dbow_dm, train_tagged)
X_test_dbow_dm = avg_doc2vec(model_dbow_dm, test_tagged)

In [None]:
# Sanity check: DBOW and DM matrices should have 300 columns, the combined one 600.
print(X_train_dbow.shape, X_test_dbow.shape, X_train_dm.shape, X_test_dm.shape, X_train_dbow_dm.shape, X_test_dbow_dm.shape)

(110248, 300) (27562, 300) (110248, 300) (27562, 300) (110248, 600) (27562, 600)


In [None]:
# export to csv
# One file per model variant (DBOW, DM, DBOW+DM) and split; index=False keeps the
# DataFrame row index out of the file.
pd.DataFrame(X_train_dbow).to_csv('../Word Embedding/emb_dbow_train.csv', index=False)
pd.DataFrame(X_test_dbow).to_csv('../Word Embedding/emb_dbow_test.csv', index=False)
pd.DataFrame(X_train_dm).to_csv('../Word Embedding/emb_dm_train.csv', index=False)
pd.DataFrame(X_test_dm).to_csv('../Word Embedding/emb_dm_test.csv', index=False)
pd.DataFrame(X_train_dbow_dm).to_csv('../Word Embedding/emb_dbow_dm_train.csv', index=False)
pd.DataFrame(X_test_dbow_dm).to_csv('../Word Embedding/emb_dbow_dm_test.csv', index=False)

### 5. Embedding with pre-trained BERT (DistilBERT)

In [None]:
# The raw 'processed_str' strings are loaded without tokenizing them.
# NOTE(review): the same relative paths were read before the `%cd ../..` cell, so they now
# resolve against a different working directory — confirm the files exist there.
train_untokenized_posts = pd.read_csv('../Data/X_train.csv')['processed_str'].to_numpy()
test_untokenized_posts = pd.read_csv('../Data/X_test.csv')['processed_str'].to_numpy()

In [None]:
# specify GPU device
# NOTE(review): `device` is created here but not used anywhere in this notebook; the model
# below is never moved to it, so inference presumably runs on the CPU — confirm intent.
device = torch.device("cuda")

# implement bert distilled model
# Tokenizer and model weights both come from the "distilbert-base-uncased" checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert_dist_model = AutoModel.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# word embedding using bert distilled model
# The raw strings are passed in because bert_dist_embed applies the tokenizer itself.
X_train_bert = bert_dist_embed(train_untokenized_posts, bert_tokenizer, bert_dist_model)
X_test_bert = bert_dist_embed(test_untokenized_posts, bert_tokenizer, bert_dist_model)

In [None]:
# Sanity check: each matrix should have one row per post.
print(X_train_bert.shape, X_test_bert.shape)

In [None]:
# export to csv
# One file per split; index=False keeps the DataFrame row index out of the file.
pd.DataFrame(X_train_bert).to_csv('../Word Embedding/emb_bert_train.csv', index=False)
pd.DataFrame(X_test_bert).to_csv('../Word Embedding/emb_bert_test.csv', index=False)

### 6. TF-IDF with Bigram

In [None]:
# The raw 'processed_str' strings are read again here, so this section can run without
# the BERT section; the same two assignments already appear in that section.
train_untokenized_posts = pd.read_csv('../Data/X_train.csv')['processed_str'].to_numpy()
test_untokenized_posts = pd.read_csv('../Data/X_test.csv')['processed_str'].to_numpy()

In [None]:
# ngram_range=(2, 2) restricts the features to bigrams only (no unigrams).
vectorizer = TfidfVectorizer(ngram_range = (2, 2))
# The vectorizer is fitted on the training posts only; the test posts are transformed
# with that fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(train_untokenized_posts)
X_test_tfidf = vectorizer.transform(test_untokenized_posts)

In [None]:
# Sanity check of the matrix shapes.
# NOTE(review): both matrices come from the same fitted vectorizer, so their column counts
# are expected to match — re-run this cell if the displayed output shows otherwise.
print(X_train_tfidf.shape, X_test_tfidf.shape)

(110248, 947656) (27562, 337260)


In [None]:
# export to csv
# NOTE(review): the TF-IDF matrices come straight from the vectorizer and are presumably
# scipy sparse matrices; wrapping one in pd.DataFrame may not yield a plain numeric table —
# confirm the written files are usable, or store them in a sparse format instead.
pd.DataFrame(X_train_tfidf).to_csv('../Word Embedding/emb_tfidf_train.csv', index=False)
# Fixed: this line previously wrote X_train_tfidf a second time and used the mistyped
# path '.,/Word Embedding/...'; it now writes the test matrix next to the train file.
pd.DataFrame(X_test_tfidf).to_csv('../Word Embedding/emb_tfidf_test.csv', index=False)