Determine the number of clusters (k)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Prepare the data
# `train_data` is not defined in this cell; it is presumably a list of text
# documents created earlier in the notebook -- TODO confirm.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(train_data)

# Candidate cluster counts, shared by both selection methods below.
# The silhouette score is only defined for at least 2 clusters.
k_values = range(2, 11)

# Fit K-means once per candidate k and record both criteria from the same fit,
# so the two curves are directly comparable.
inertias = []           # elbow method: within-cluster sum of squared distances
silhouette_scores = []  # silhouette method: higher means better separated clusters
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, labels))

# Determine the optimal value of k using elbow method:
# look for the k after which inertia stops dropping sharply.
plt.plot(k_values, inertias)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Determine the optimal value of k using silhouette method:
# pick the k with the highest average silhouette score.
plt.plot(k_values, silhouette_scores)
plt.title('Silhouette Method')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()


t-SNE plot of word vectors, coloured by K-means cluster

In [None]:
from sklearn.cluster import KMeans
def tsne_plot(model, k=6):
    """Project a model's word vectors to 2-D with t-SNE and plot them by K-means cluster.

    Every word in ``model.wv.key_to_index`` is embedded with t-SNE, the 2-D
    points are grouped with K-means, each cluster is drawn in its own colour,
    and every point is annotated with its word.

    Parameters
    ----------
    model
        Object exposing ``wv.key_to_index`` and ``wv.get_vector(word)``
        (presumably a gensim 4 Word2Vec-style model -- TODO confirm).
    k : int, default 6
        Requested number of K-means clusters; capped at the vocabulary size.

    Raises
    ------
    ValueError
        If the vocabulary contains fewer than two words.
    """
    words = list(model.wv.key_to_index)  # no longer .vocab
    if len(words) < 2:
        raise ValueError("tsne_plot needs at least two words in the model vocabulary")
    tokens = np.array([model.wv.get_vector(word) for word in words])

    # t-SNE requires perplexity < n_samples; 40 is kept for normal-sized vocabularies.
    tsne_kwargs = dict(
        perplexity=min(40, len(words) - 1),
        n_components=2,
        init='pca',
        random_state=23,
    )
    try:
        # Newer scikit-learn names the iteration budget `max_iter`.
        tsne_model = TSNE(max_iter=2500, **tsne_kwargs)
    except TypeError:
        # Older scikit-learn only knows `n_iter`.
        tsne_model = TSNE(n_iter=2500, **tsne_kwargs)
    new_values = tsne_model.fit_transform(tokens)

    # Cluster ids live in their own variable so they cannot clobber the word list.
    n_clusters = min(k, len(words))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_ids = kmeans.fit_predict(new_values)
    cmap = plt.get_cmap('Set1')  # choose a colormap
    colors = [cmap(i) for i in np.linspace(0, 1, n_clusters)]

    plt.figure(figsize=(16, 16))
    # One scatter call per cluster: each point is drawn exactly once, in its cluster colour.
    for i in range(n_clusters):
        members = cluster_ids == i
        plt.scatter(new_values[members, 0], new_values[members, 1],
                    color=colors[i], label='Cluster {}'.format(i))
    # Annotate every point with the word it represents.
    for word, (x, y) in zip(words, new_values):
        plt.annotate(word,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.legend()

GloVe

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Pre-trained GloVe vectors: https://nlp.stanford.edu/projects/glove/
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# Convert the GloVe text format to word2vec text format so gensim can load it.
glove2word2vec(glove_input_file, word2vec_output_file)

model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Plain dict lookup of word -> vector.
# gensim 4 removed `KeyedVectors.vocab`; `key_to_index` is the replacement
# (the same attribute the t-SNE helper in this notebook already uses).
word_to_vec_map = {word: model[word] for word in model.key_to_index}

import numpy as np

# `corpus` must already exist when this cell runs; each item is iterated as a
# sequence of words (presumably a list of tokens per sentence -- TODO confirm).
m = len(corpus) # number of sentences in the corpus
n = model.vector_size # dimension of the GloVe vectors

# One row per sentence: the sum of the GloVe vectors of its in-vocabulary
# words divided by the sentence length.
X = np.zeros((m, n))
for i, words in enumerate(corpus):
    if len(words) == 0:
        # An empty sentence would give 0/0 = NaN, which breaks t-SNE later;
        # leave its row as the zero vector instead.
        continue
    for w in words:
        if w in word_to_vec_map:
            X[i] += word_to_vec_map[w]
    X[i] /= len(words)

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the sentence vectors to 2-D. A fixed seed makes the layout
# reproducible across runs (t-SNE is stochastic).
tsne = TSNE(n_components=2, random_state=23)
X_tsne = tsne.fit_transform(X)

plt.scatter(X_tsne[:,0], X_tsne[:,1])
plt.title('t-SNE of averaged GloVe sentence vectors')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

ELMo

In [None]:
from allennlp.modules.elmo import Elmo, batch_to_ids
import os

import torch

# `build_corpus` and `data` are defined elsewhere in the notebook; the result is
# passed to `batch_to_ids`, so it is presumably a list of tokenised sentences
# -- TODO confirm.
corpus = build_corpus(data)

# Set the paths to the ELMo model files
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
# Weights can be downloaded from:
# https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5
# The location can be overridden with the ELMO_WEIGHT_FILE environment variable
# so the notebook is not tied to one machine; the default is the original path.
weight_file = os.environ.get(
    "ELMO_WEIGHT_FILE",
    "/Users/admin/Downloads/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
)

# Create an Elmo object producing a single output representation, no dropout
elmo = Elmo(options_file, weight_file, 1, dropout=0)

# Convert the sentences to character IDs
character_ids = batch_to_ids(corpus)

# Get the ELMo embeddings for the sentences.
# Inference only: skip autograd bookkeeping to save memory.
with torch.no_grad():
    elmo_embeddings = elmo(character_ids)

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Get the word embeddings for a list of words.
# Each word is embedded as its own one-token sentence, so row i of the result
# is the ELMo vector of words[i]. (Indexing the corpus batch by i would select
# whole sentences, not these words.)
words = ['word1', 'word2', 'word3']
word_character_ids = batch_to_ids([[word] for word in words])
word_vectors = elmo(word_character_ids)['elmo_representations'][0][:, 0, :]
word_embeddings = word_vectors.detach().numpy()

# Create a TSNE object.
# Perplexity must be smaller than the number of samples; the seed is fixed so
# the layout is reproducible.
tsne = TSNE(n_components=2, perplexity=min(30, len(words) - 1), random_state=23)

# Fit and transform the word embeddings using t-SNE
word_tsne = tsne.fit_transform(word_embeddings)

# Plot the t-SNE results
plt.scatter(word_tsne[:, 0], word_tsne[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(word_tsne[i, 0], word_tsne[i, 1]))
plt.show()

BERT

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Convert the sentences to BERT input format.
# Sentences have different lengths, so they are padded into one rectangular
# batch (a ragged list cannot be turned into a tensor) and truncated to the
# model's maximum length. `corpus` is defined elsewhere; both raw strings and
# pre-tokenised word lists are handled.
is_pretokenized = len(corpus) > 0 and not isinstance(corpus[0], str)
encoded = tokenizer(
    list(corpus),
    is_split_into_words=is_pretokenized,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']

# Get the BERT embeddings for the sentences.
# Passing the full encoding includes the attention mask, so padding tokens do
# not influence the real tokens.
with torch.no_grad():
    bert_embeddings = model(**encoded)
    
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Get the word embeddings for a list of words.
# Each word is encoded on its own and represented by the hidden state at the
# first ([CLS]) position, so row i belongs to words[i]. (Indexing the corpus
# batch by i would give vectors of the first few sentences instead.)
words = ['word1', 'word2', 'word3']
word_inputs = tokenizer(words, padding=True, return_tensors='pt')
with torch.no_grad():
    word_outputs = model(**word_inputs)
word_embeddings = word_outputs[0][:, 0, :].numpy()

# Create a TSNE object.
# Perplexity must be smaller than the number of samples; the seed is fixed so
# the layout is reproducible.
tsne = TSNE(n_components=2, perplexity=min(30, len(words) - 1), random_state=23)

# Fit and transform the word embeddings using t-SNE
word_tsne = tsne.fit_transform(word_embeddings)

# Plot the t-SNE results
plt.scatter(word_tsne[:, 0], word_tsne[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(word_tsne[i, 0], word_tsne[i, 1]))
plt.show()