In [1]:
import pandas as pd
from sentence_splitter import split_text_into_sentences

def sentences_split(text: str) -> list:
    sentences = split_text_into_sentences(text, language="en")
    return sentences

df = pd.read_csv("../generate_toponym_dataset/1880-1900-LwM-HMD-subsample.csv")

df = df.head(200)

In [2]:
texts = df['text'].dropna().to_list()
sentences = [sent.split(" ") for text in texts for sent in sentences_split(text)]
print (len(sentences))
print (sentences[1])

4830
['CHAPTER', 'A', 'few', 'years', 'before', 'the', 'date', 'on', 'which', 'this', '"narrative', 'opens', 'there', 'had', 'come', 'to', 'Burslem', 'from', 'nother', 'part', 'of', 'the', 'Midlands', 'a', 'young', 'medical', 'gentleman,', 'fresh', 'from', 'his', 'professional', 'studies,', 'and', 'from', 'the', 'cultured', 'teaching', 'of', 'Rugby', 'public', "'school."]


In [3]:
def generate_text(input, seq_length, model):
    seq = input.copy()
    output = input.copy()
    for i in range(seq_length):
        words = model.predict_output_word(input, topn=len(output)+1)
        
        # excluding repetitions
        word = [word[0] for word in words if word[0] not in output][0]

        output.append(word)
        seq.append(word)

        #keeping the last 4 words in the input
        seq.remove(seq[0])

    return output

In [4]:
from gensim.models import Word2Vec

# Size of the word vectors
vector_size = 50

# Minimum count for words to be included in the vocabulary
min_count = 1

# Number of threads to use for training the model
workers = 3

# Context window size
window = 5

# Number of iterations over the corpus
epochs = 100

# Use skip-gram model with negative sampling and softmax activation
sg = 1
ns = 5
alpha = 0.03

basic_w2v = Word2Vec(sentences,vector_size=vector_size, window=window, min_count=min_count, workers=workers, epochs=epochs, sg=sg, negative=ns, alpha=alpha)


In [5]:
text_input = ['The', 'capital', 'of', 'England']
seq_length = 10

generate_text(text_input, seq_length, basic_w2v)

['The',
 'capital',
 'of',
 'England',
 'being',
 'spoke',
 'man',
 'spend',
 'point',
 'has',
 'position',
 'which',
 'had',
 'are']

In [6]:
import numpy as np 

# https://datascience.stackexchange.com/questions/97568/fine-tuning-pre-trained-word2vec-model-with-gensim-4-0

finetunded_w2v = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=sg, negative=ns, alpha=alpha)

finetunded_w2v.build_vocab(sentences)

finetunded_w2v.wv.vectors_lockf = np.ones(len(finetunded_w2v.wv))

finetunded_w2v.wv.intersect_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)

finetunded_w2v.train(sentences, total_examples=len(sentences),  epochs=epochs)

(8797578, 10829900)

In [7]:
generate_text(text_input, seq_length, finetunded_w2v)

['The',
 'capital',
 'of',
 'England',
 'starch',
 '+',
 'duties',
 'south',
 'Cory',
 'reservoirs,',
 'members.',
 'tons;',
 'station',
 'supervision']