#Initial Downloads and imports

In [0]:
# initial imports
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.parser import parse

import gensim
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.simplefilter(action='ignore', category=FutureWarning)
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

import sys
import csv

csv.field_size_limit(sys.maxsize)

import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# load data
path = '/content/drive/My Drive/Colab Notebooks/csvData/title_content.csv'
ungrouped_text = pd.read_csv(path, encoding='ISO-8859-1', engine='python', error_bad_lines=False)

ungrouped_text.drop('Unnamed: 0', axis=1, inplace=True)

#Word Embedding and Text Vectorisation

##Word2Vec

In [0]:
title_strings = [row for row in ungrouped_text['title']]
content_strings = [row for row in ungrouped_text['content']]

###Word2Vec 1: Pretrained Model

In [0]:
#Preloaded model
w2v_model = KeyedVectors.load_word2vec_format('/content/drive/My Drive/Colab Notebooks/Copy of GoogleNews-vectors-negative300.bin.gz', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


###Word2Vec2: Fine tuned model

In [0]:
title_tokens = [word_tokenize(word) for word in title_strings]
content_tokens = [word_tokenize(word) for word in content_strings]
sentences = title_tokens + content_tokens

In [0]:
# https://radimrehurek.com/gensim/models/word2vec.html
# Word2Vec model trained on own dataset
w2v_model2 = Word2Vec(min_count=1)
w2v_model2.build_vocab(sentences) # prepare the model vocabulary

In [0]:
w2v_model2.save("drive/My Drive/Colab Notebooks/word2vec2.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
w2v_model2.train(sentences, total_examples=w2v_model2.corpus_count, epochs=w2v_model2.iter)  # train word vectors
print(w2v_model2)
words = list(w2v_model2.wv.vocab)
print(words[5:8])
print(w2v_model2['trump'])

In [0]:
# load pretrained model
w2v_model2 = KeyedVectors.load('/content/drive/My Drive/Colab Notebooks/Word2VecData/Models/word2vec2.model', mmap='r')
words = list(w2v_model2.wv.vocab)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


###Word2Vec3: Advanced Bigram Mode, Skim Gram

In [0]:
title_tokens3 = [row.split() for row in ungrouped_text['title']]
combined_tokens = [row.split() for row in ungrouped_text['content']]

for lst in title_tokens3:
  combined_tokens.append(lst)

# content_phrases = Phrases(content_tokens2, min_count=30, progress_per=10000)
phrases = Phrases(combined_tokens, min_count=30, progress_per=10000)

In [0]:
bigram = Phraser(phrases)
#include for titles too?
combined_bigrams = bigram[combined_tokens]

In [0]:
w2v_model3 = Word2Vec(min_count=20,
                     sg=1, hs=1,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
#sample=0, 1e-5

In [0]:
w2v_model3.build_vocab(combined_bigrams, progress_per=10000)

In [0]:
w2v_model3.train(combined_bigrams, total_examples=w2v_model3.corpus_count, epochs=w2v_model3.iter)  # train word vectors

In [0]:
words = list(w2v_model3.wv.vocab)

In [0]:
w2v_model3.save("drive/My Drive/Colab Notebooks/word2vec3.model")

In [0]:
# load pretrained model
w2v_model3 = KeyedVectors.load('/content/drive/My Drive/Colab Notebooks/Word2VecData/Models/word2vec3.model', mmap='r')
words = list(w2v_model3.wv.vocab)

##Vectorisation

In [0]:
def get_mean_vector(model, words):
  tokens = nltk.word_tokenize(words)
  accepted_tokens = [word for word in tokens if word in model.wv.vocab] #only accept words that are in word2vec vocabular
  vectors = [model[word] for word in accepted_tokens]# list word embeddings for each word in a given document
  if not vectors:
    mean_vector = np.zeros(100) # create a zero vector if the list of embeddings is empty
  else:
    mean_vector = np.mean(vectors, axis = 0) # average all embeddings in the list as a single vector
  return np.array(mean_vector)

  
def vectorize(doc_lst, model):
  vectors = []
  for string in doc_lst:
    vectors.append(get_mean_vector(model, string))
  return np.array(vectors)

In [0]:
def get_word_vectors(model, words):
  tokens = nltk.word_tokenize(words)
  accepted_tokens = [word for word in tokens if word in model.wv.vocab] #only accept words that are in word2vec vocabular
  vectors = [model[word] for word in accepted_tokens]# list word embeddings for each word in a given document
  if not vectors:
    zero_vector = np.zeros(300) # create a zero vector if the list of embeddings is empty
  else:
    vectors = np.array(vectors) # average all embeddings in the list as a single vector
  return vectors

  
def word_vectorize(doc_lst, model):
  vectors = []
  for string in doc_lst:
    vectors.append(get_word_vectors(model, string))
  return np.array(vectors)

In [0]:
# content_strings1 = content_strings[0:10000]
# content_strings2 = content_strings[10000:20000]
# content_strings3 = content_strings[20000:30000]
# content_strings4 = content_strings[30000:40000]
# content_strings5 = content_strings[40000:50000]
# content_strings6 = content_strings[50000:60000]
# content_strings7 = content_strings[60000:70000]
# content_strings8 = content_strings[70000:80000]
# content_strings9 = content_strings[80000:]

In [0]:
#nltk brown

## mean vector for every title and content for each of the 3 wrod2vec models.
## Not saved-> found mean vector of all titles/content for each day and saved in df csv
## saved-> founr mean vector for each title and content and concatenate per date

# title_vectors = vectorize(title_strings, w2v_model)
# content_vectors = vectorize(content_strings, w2v_model)

#   DO:
# title_vectors2 = vectorize(title_strings, w2v_model2)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_titles_doc2", title_vectors2)
# content_vectors2 = vectorize(content_strings, w2v_model2)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_doc2", content_vectors2)

title_vectors3 = vectorize(title_strings, w2v_model3)
np.save("drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_titles_doc3", title_vectors3)
content_vectors3 = vectorize(content_strings, w2v_model3)
np.save("drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_doc3", content_vectors3)

## word vectors for all titles from word2vec 2 model
# title_vectors4 = word_vectorize(title_strings, w2v_model2)
## word vectors for all titles from word2vec 3 model
# title_vectors5 = word_vectorize(title_strings, w2v_model3)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_titles_words", title_vectors4)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_titles_words3", title_vectors5)

## word vectors for all content from word2vec 2 model
# content_vectors_batch1 = word_vectorize(content_strings1, w2v_mode23)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch1)
# content_vectors_batch2 = word_vectorize(content_strings2, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch2)
# content_vectors_batch3 = word_vectorize(content_strings3, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch3)
# content_vectors_batch4 = word_vectorize(content_strings4, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch4)
# content_vectors_batch5 = word_vectorize(content_strings5, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch5)
# content_vectors_batch6 = word_vectorize(content_strings6, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch6)
# content_vectors_batch7 = word_vectorize(content_strings7, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch7)
# content_vectors_batch8 = word_vectorize(content_strings8, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch8)
# content_vectors_batch9 = word_vectorize(content_strings9, w2v_model2)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words", content_vectors_batch9)

## word vectors for all content from word2vec 3 model
# content_vectors_batch1 = word_vectorize(content_strings1, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH1", content_vectors_batch1)
# content_vectors_batch2 = word_vectorize(content_strings2, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH2", content_vectors_batch2)
# content_vectors_batch3 = word_vectorize(content_strings3, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH3", content_vectors_batch3)
# content_vectors_batch4 = word_vectorize(content_strings4, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH4", content_vectors_batch4)
# content_vectors_batch5 = word_vectorize(content_strings5, w2v_model3)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words3_BATCH5", content_vectors_batch5)
# content_vectors_batch6 = word_vectorize(content_strings6, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH6", content_vectors_batch6)
# content_vectors_batch7 = word_vectorize(content_strings7, w2v_model3)
# np.save("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH7", content_vectors_batch7)
# content_vectors_batch8 = word_vectorize(content_strings8, w2v_model3)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words3_BATCH8", content_vectors_batch8)
# content_vectors_batch9 = word_vectorize(content_strings9, w2v_model3)
# np.save("drive/My Drive/Colab Notebooks/wordvec2_content_words3_BATCH9", content_vectors_batch9)

  after removing the cwd from sys.path.


In [0]:
content_vectors3.shape

(88689,)

## Loading and Saving

In [0]:
# b1 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH1.npy", allow_pickle=True)
# b2 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH2.npy", allow_pickle=True)
# b3 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH3.npy", allow_pickle=True)
# b4 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH4.npy", allow_pickle=True)
# b5 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH5.npy", allow_pickle=True)
# b6 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH6.npy", allow_pickle=True)
# b7 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH7.npy", allow_pickle=True)
# b8 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH8.npy", allow_pickle=True)
# b9 = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_words3_BATCH9.npy", allow_pickle=True)

titles = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_titles_doc3.npy", allow_pickle=True)
content = np.load("/content/drive/My Drive/Colab Notebooks/Word2VecData/wordvec2_content_doc3.npy", allow_pickle=True)

In [0]:
# all_batches1 = np.concatenate([b1, b2, b3])
# all_batches2 = np.concatenate([b4, b5, b6])
# all_batches3 = np.concatenate([b7, b8, b9])
# all_batches1 = np.concatenate([all_batches1, all_batches2, all_batches3])

In [0]:
# np.save('/content/drive/My Drive/Colab Notebooks/Word2VecData/words_Batches1-3', all_batches1)
# np.save('/content/drive/My Drive/Colab Notebooks/Word2VecData/words_Batches4-6', all_batches2)
# np.save('/content/drive/My Drive/Colab Notebooks/Word2VecData/words_Batches7-9', all_batches3)

In [0]:
ungrouped_text['title_vectors'] = pd.Series(titles)
ungrouped_text['content_vectors'] = pd.Series(content.tolist())

In [0]:
type(ungrouped_text['title_vectors'][0][0])

numpy.float32

In [0]:
result_df = ungrouped_text.groupby('date')['title_vectors'].apply(list).to_frame('Title')
result_df2 = ungrouped_text.groupby('date')['content_vectors'].apply(list).to_frame('Content')

In [0]:
type(result_df['Title'][0][0][0])

numpy.float32

In [0]:
result_df['Content'] = pd.Series(result_df2['Content'])
result_df = result_df.reset_index().rename(columns={"date" : "Date"})

In [0]:
type(result_df['Title'][0][0][0])

numpy.float32

In [0]:
result_df.to_csv('Vectorised_Docs_Word2Vec3.csv')
!cp Vectorised_Docs_Word2Vec3.csv drive/My\ Drive/Colab\ Notebooks/csvData