In [1]:
import pandas as pd
from sentence_splitter import split_text_into_sentences

def sentences_split(text: str) -> list:
    sentences = split_text_into_sentences(text, language="en")
    return sentences

# dataset downloader from https://www.kaggle.com/c/learn-ai-bbc/data
df = pd.read_csv("data/BBC News Train.csv")

df = df.head(200)

In [3]:
texts = df['Text'].dropna().to_list()
sentences = [sent.split(" ") for text in texts for sent in sentences_split(text)]
print (len(sentences))
print (sentences[1])

201
['german', 'business', 'confidence', 'slides', 'german', 'business', 'confidence', 'fell', 'in', 'february', 'knocking', 'hopes', 'of', 'a', 'speedy', 'recovery', 'in', 'europe', 's', 'largest', 'economy.', 'munich-based', 'research', 'institute', 'ifo', 'said', 'that', 'its', 'confidence', 'index', 'fell', 'to', '95.5', 'in', 'february', 'from', '97.5', 'in', 'january', 'its', 'first', 'decline', 'in', 'three', 'months.', 'the', 'study', 'found', 'that', 'the', 'outlook', 'in', 'both', 'the', 'manufacturing', 'and', 'retail', 'sectors', 'had', 'worsened.', 'observers', 'had', 'been', 'hoping', 'that', 'a', 'more', 'confident', 'business', 'sector', 'would', 'signal', 'that', 'economic', 'activity', 'was', 'picking', 'up.', 'we', 're', 'surprised', 'that', 'the', 'ifo', 'index', 'has', 'taken', 'such', 'a', 'knock', 'said', 'dz', 'bank', 'economist', 'bernd', 'weidensteiner.', 'the', 'main', 'reason', 'is', 'probably', 'that', 'the', 'domestic', 'economy', 'is', 'still', 'weak', 'p

In [4]:
def generate_text(input, seq_length, model):
    seq = input.copy()
    output = input.copy()
    for i in range(seq_length):
        words = model.predict_output_word(input, topn=len(output)+1)
        
        # excluding repetitions
        word = [word[0] for word in words if word[0] not in output][0]

        output.append(word)
        seq.append(word)

        #keeping the last 4 words in the input
        seq.remove(seq[0])

    return " ".join(output)

In [5]:
from gensim.models import Word2Vec

# Size of the word vectors
vector_size = 50

# Minimum count for words to be included in the vocabulary
min_count = 1

# Number of threads to use for training the model
workers = 3

# Context window size
window = 5

# Number of iterations over the corpus
epochs = 100

# Use skip-gram model with negative sampling and softmax activation
sg = 1
ns = 5
alpha = 0.03

basic_w2v = Word2Vec(sentences,vector_size=vector_size, window=window, min_count=min_count, workers=workers, epochs=epochs, sg=sg, negative=ns, alpha=alpha)


In [6]:
text_input = ["the", "new", "home", "secretary"]
seq_length = 10

generate_text(text_input, seq_length, basic_w2v)

'the new home secretary office powers. id broom called policing. association charles sweep mps'

In [9]:
import numpy as np 

# https://datascience.stackexchange.com/questions/97568/fine-tuning-pre-trained-word2vec-model-with-gensim-4-0

finetunded_w2v = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=sg, negative=ns, alpha=alpha)

finetunded_w2v.build_vocab(sentences)

finetunded_w2v.wv.vectors_lockf = np.ones(len(finetunded_w2v.wv))

# download glove.6B.50d.word2vec.txt from https://nlp.stanford.edu/projects/glove/
# convert glove to word2vec format following: https://radimrehurek.com/gensim/scripts/glove2word2vec.html
finetunded_w2v.wv.intersect_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)

finetunded_w2v.train(sentences, total_examples=len(sentences),  epochs=epochs)

(5935504, 7467500)

In [10]:
generate_text(text_input, seq_length, finetunded_w2v)

'the new home secretary zealand. broom office general minister powers. chairman executive stadium. election'