<a href="https://colab.research.google.com/github/akashrazza/WordGenerator/blob/main/assiment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim
import tensorflow as tf

print('gensim version: \t%s' % gensim.__version__)
print('TensorFlow version: \t%s' % tf.__version__)

gensim version: 	4.3.2
TensorFlow version: 	2.14.0


## Config

In [2]:
import logging

# For displaying gensim logs
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

# Directory with raw txt-files
TEXT_DIR  = '/'

# Directory for saving checkpoint and metadata
MODEL_DIR = '/'

# Word2vec
EMBEDDING_SIZE = 300

## Preprocessing

In [5]:
import os, re, string


def clean_doc(doc):
    """
    Cleaning a document by several methods
    """
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r"[0-9]+", "", doc)
    # Split in tokens
    tokens = doc.split()
    # Remove punctuation
    tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in tokens]
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

# f=open("w2v.txt","r")
# print(f.read())
def read_files(path):
    """
    Read in text files
    """
    documents = list()
    tokenize  = lambda x: gensim.utils.simple_preprocess(x)

    # Read in all files in directory
#     if os.path.isdir(path):
#         for filename in os.listdir(path):
#             with open("w2v.txt", encoding='utf-8') as f:
    f=open("w2v.txt","r")
    doc = f.read()
    doc = clean_doc(doc)
    documents.append(tokenize(doc))
    return documents

docs = read_files(TEXT_DIR)
print('Number of documents: %i' % len(docs))

Number of documents: 1


## Training model

In [6]:
model = gensim.models.Word2Vec(docs, min_count=0)

## Saving model

In [7]:
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
model.save(os.path.join(MODEL_DIR,'word2vec'))

## Creating checkpoint and metadata

In [8]:
from tensorboard.plugins import projector

weights     = model.wv.vectors
index_words = model.wv.index_to_key

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w') as f:
    f.writelines("\n".join(index_words))

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embeddings'
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(MODEL_DIR, config)

tensor_embeddings = tf.Variable(model.wv.vectors, name='embeddings')

checkpoint = tf.compat.v1.train.Saver([tensor_embeddings])
checkpoint_path = checkpoint.save(sess=None, global_step=None, save_path=os.path.join(MODEL_DIR, "model.ckpt"))



Shape of weights: (263, 100)
Vocabulary size: 263
Embedding size: 100


## Example

In [9]:
model.wv.most_similar(positive=['bioinformatics'], topn=10)

[('this', 0.2520006000995636),
 ('some', 0.24385978281497955),
 ('lle', 0.21587562561035156),
 ('it', 0.21149316430091858),
 ('firth', 0.21078528463840485),
 ('biovec', 0.20363563299179077),
 ('number', 0.19869697093963623),
 ('gradually', 0.1941036432981491),
 ('reducing', 0.19298714399337769),
 ('representations', 0.18559524416923523)]