In [119]:
import numpy as np

# Paths of the pre-processed corpus text files that make up the training data.
corpus_files = ['/content/pre_processed_corpus/corpus_1.txt',
                '/content/pre_processed_corpus/corpus_2.txt',
                '/content/pre_processed_corpus/corpus_3.txt',
                '/content/pre_processed_corpus/corpus_4.txt']

# Tokenise file by file: concatenating the raw texts first would fuse the last
# word of one file with the first word of the next whenever a file does not
# end in whitespace.
corpus = []
for path in corpus_files:
    with open(path, 'r', encoding='utf-8') as handle:
        # str.split() without arguments splits on any run of whitespace,
        # newlines included, so no separate newline replacement is needed.
        corpus.extend(handle.read().split())

# Sort the vocabulary so every word gets the same index on every run; the
# iteration order of a set of strings is not stable across interpreter sessions.
vocab = sorted(set(corpus))
vocab_size = len(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


def cross_entropy_loss(output, target):
    """Return the softmax cross-entropy of raw scores against one target index.

    Args:
        output: 1-D array-like of unnormalised scores (logits).
        target: Index of the correct class within ``output``.

    Returns:
        ``-log(softmax(output)[target])`` as a NumPy float.
    """
    scores = np.asarray(output, dtype=float)
    # Subtracting the maximum leaves the softmax unchanged but keeps exp()
    # from overflowing; working in log space also avoids log(0) on underflow.
    shifted = scores - np.max(scores)
    return np.log(np.sum(np.exp(shifted))) - shifted[target]



In [120]:
class SkipGram:
    """Skip-gram word-embedding model with a full softmax output layer.

    ``w_in`` holds one input embedding per vocabulary row and ``w_out`` one
    output embedding per vocabulary column. For an input word the model scores
    every vocabulary word as ``w_in[word] @ w_out`` and training minimises the
    softmax cross-entropy of those scores against the observed context word.

    Args:
        vocab_size: Number of distinct words.
        embedding_dim: Length of each embedding vector.
        learning_rate: Step size for the gradient updates.
        word2idx: Optional word -> index mapping. When omitted, the
            module-level ``word2idx`` is looked up at call time.
        idx2word: Optional index -> word mapping. When omitted it is derived
            from ``word2idx`` if that was given, otherwise the module-level
            ``idx2word`` is looked up at call time.
    """

    def __init__(self, vocab_size, embedding_dim, learning_rate=0.01,
                 word2idx=None, idx2word=None):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        if word2idx is not None and idx2word is None:
            idx2word = {idx: word for word, idx in word2idx.items()}
        self._word2idx = word2idx
        self._idx2word = idx2word
        self.w_in = np.random.rand(vocab_size, embedding_dim)
        self.w_out = np.random.rand(embedding_dim, vocab_size)

    def _index_of(self, word):
        """Return the vocabulary index of ``word`` (KeyError if unknown)."""
        mapping = self._word2idx if self._word2idx is not None else word2idx
        return mapping[word]

    def _word_at(self, idx):
        """Return the word stored at vocabulary index ``idx``."""
        mapping = self._idx2word if self._idx2word is not None else idx2word
        return mapping[idx]

    @staticmethod
    def _softmax(scores):
        """Softmax that subtracts the maximum score so exp() cannot overflow."""
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    @staticmethod
    def _token_sequences(corpus):
        """Normalise ``corpus`` into a list of token lists.

        Accepts a single string, a list of sentence strings, a flat list of
        single-word tokens (treated as one continuous sequence), or a list of
        already tokenised sentences.
        """
        if isinstance(corpus, str):
            return [corpus.split()]
        items = list(corpus)
        if not items:
            return []
        if all(isinstance(item, str) for item in items):
            if any(len(item.split()) > 1 for item in items):
                return [item.split() for item in items]
            # No element holds more than one token: the input is a flat token
            # stream, so neighbouring elements are each other's context.
            return [[token for item in items for token in item.split()]]
        return [list(sequence) for sequence in items]

    def forward(self, input_word):
        """Return the unnormalised score of every vocabulary word as context.

        Raises:
            KeyError: If ``input_word`` is not in the vocabulary.
        """
        return np.dot(self.w_in[self._index_of(input_word)], self.w_out)

    def backward(self, input_word, context_word):
        """Take one gradient-descent step on a single (input, context) pair.

        The loss is the softmax cross-entropy of ``forward(input_word)``
        against ``context_word``. Its gradient with respect to the scores is
        ``softmax(scores) - one_hot(context)``, which is propagated to both
        weight matrices. Each call costs O(vocab_size * embedding_dim).

        Raises:
            KeyError: If either word is not in the vocabulary.
        """
        input_idx = self._index_of(input_word)
        context_idx = self._index_of(context_word)
        hidden = self.w_in[input_idx]

        grad_scores = self._softmax(np.dot(hidden, self.w_out))
        grad_scores[context_idx] -= 1.0

        # Compute both gradients from the current weights before applying
        # either update, so neither update sees the other's result.
        grad_hidden = np.dot(self.w_out, grad_scores)
        grad_w_out = np.outer(hidden, grad_scores)

        self.w_out -= self.learning_rate * grad_w_out
        self.w_in[input_idx] -= self.learning_rate * grad_hidden

    def train(self, corpus, window_size, epochs=1):
        """Train on every (word, context word) pair within ``window_size``.

        Args:
            corpus: A string, a list of sentence strings, a flat list of
                tokens, or a list of token lists.
            window_size: Number of words taken on each side of the centre word.
            epochs: Number of passes over the corpus.

        Raises:
            KeyError: If the corpus contains a word outside the vocabulary.
        """
        sequences = self._token_sequences(corpus)
        for _ in range(epochs):
            for words in sequences:
                for i, word in enumerate(words):
                    left = words[max(0, i - window_size):i]
                    right = words[i + 1:i + window_size + 1]
                    for context_word in left + right:
                        self.backward(word, context_word)

    def generate_text(self, input_word, num_words):
        """Sample ``num_words`` words from the context distribution of a word.

        All words are drawn independently from ``softmax(forward(input_word))``
        and joined with single spaces.

        Raises:
            KeyError: If ``input_word`` is not in the vocabulary.
        """
        output_probs = self._softmax(self.forward(input_word))
        sampled = np.random.choice(self.vocab_size, size=num_words, p=output_probs)
        return ' '.join(self._word_at(idx) for idx in sampled)


In [121]:
# Seed NumPy's global generator so the random weight initialisation and the
# later word sampling give the same results after a kernel restart.
np.random.seed(42)

EMBEDDING_DIM = 100  # length of each word vector
WINDOW_SIZE = 5      # context words taken on each side of the centre word

model = SkipGram(vocab_size, EMBEDDING_DIM)
model.train(corpus, window_size=WINDOW_SIZE)

In [130]:
def test_model(input_word, num_words):
    """Print ``num_words`` words generated by the global ``model`` for ``input_word``."""
    print(f'Generated text: {model.generate_text(input_word, num_words)}')


# Sample 4 words for a query word. The word must be a key of the vocabulary
# mapping: generate_text indexes it directly, so an unseen word raises KeyError.
test_model('मजदूरों', 4)

Generated text: दिखाते सत्कार खदानों जगहजगह
