# word2vec Tutorial

In [None]:
# Third-party libraries used throughout the notebook, one import per line.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

# Switch matplotlib to its "ggplot" style sheet.
plt.style.use("ggplot")

In [None]:
# Load the "imdb" dataset through the `datasets` library.
from datasets import load_dataset

imdb = load_dataset(path="imdb")

# Display the first record of the "test" split as this cell's output.
imdb["test"][0]

In [None]:
# The dataset ships with separate "train" and "test" splits; bind both.
train, test = imdb["train"], imdb["test"]

In [None]:
# Convert each review into a list of word tokens.
#
# Word2Vec expects every "sentence" to be a list of tokens. A raw string is
# itself an iterable of characters, so passing untokenized review text would
# train character embeddings instead of word embeddings.
from gensim.utils import simple_preprocess


def tokenize_review(text):
    """Return the lowercase word tokens of a single review string."""
    # NOTE(review): the IMDb text appears to embed HTML line breaks
    # ("<br />"); replacing them keeps "br" out of the vocabulary -- confirm
    # against the data. The replacement is a no-op when the tag is absent.
    return simple_preprocess(text.replace("<br />", " "))


# Read the "text" column directly instead of indexing the dataset row by row.
train_reviews = [tokenize_review(text) for text in train["text"]]
test_reviews = [tokenize_review(text) for text in test["text"]]

In [None]:
# Fit a word2vec model on the training reviews.
# min_count=1 keeps even tokens that occur only once in the corpus.
model = Word2Vec(sentences=train_reviews, min_count=1)

In [None]:
# Round-trip the model through disk: write it out, then read it back in.
model_path = "word2vec.model"
model.save(model_path)
model = Word2Vec.load(model_path)

In [None]:
# Sanity check: show the words whose vectors are most similar to "good".
word_vectors = model.wv
word_vectors.most_similar("good")

### Visualize the embeddings

In [None]:
# Get the vocabulary to create a t-SNE plot.
# gensim 4 removed `KeyedVectors.vocab` and `Word2Vec.__getitem__`: the
# vocabulary is exposed as `wv.index_to_key` and vectors are read from `wv`.
vocab = list(model.wv.index_to_key)
X = model.wv[vocab]

# reduce the dimensionality of the word vectors to two components
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

# create a dataframe with one row per word: the index is the word and the
# "x"/"y" columns hold its two t-SNE coordinates
df = pd.DataFrame(X_tsne, index=vocab, columns=["x", "y"])



In [None]:
# Scatter the 2-D word coordinates on a single 10x10 inch axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df["x"], df["y"])

# Label every point with the word it represents.
for word, x_coord, y_coord in zip(df.index, df["x"], df["y"]):
    ax.annotate(word, (x_coord, y_coord))

# Write the figure to disk before showing it.
plt.savefig("word2vec.png")
plt.show()


In [None]:
# Same scatter via seaborn; each word (the dataframe index) is used as the
# hue value, and the legend is turned off.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=df,
    x="x",
    y="y",
    hue=df.index,
    legend=False,
    ax=ax,
)

