In [1]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


In [2]:
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

In [5]:
sentence_embeddings.shape

(3, 384)

In [7]:
from sentence_transformers import util

util.dot_score(sentence_embeddings[0], sentence_embeddings[2])

tensor([[0.1181]])

In [8]:
from sklearn.cluster import KMeans

In [9]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]
corpus_embeddings = model.encode(corpus)

In [10]:
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  2
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  3
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  4
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  5
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']



In [13]:
import argument_embed_vis.example_debates as example_debates

e_corpus = example_debates.EXAMPLE_DEBATE_1.split(".")
corpus_embeddings = model.encode(e_corpus)


In [14]:
corpus_embeddings.shape

(16, 384)

In [20]:
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_


In [22]:

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(e_corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
["Page Jimmy: All the environment benefits can be achieved just by a vegetarian diet, you don't have to go fully vegan\n", 'Plant Robert: Only vegetarian would be good for earth is just a myth', ' You either go fully vegan or you are just kidding yourself', 'Plant Robert: Vegan way of eating and living is the only sustainable (for our planet) lifestyle ']

Cluster  2
['Kirk Hammet: Agree, think of CO2 emissions saved because of less caddle', 'Kirk Hammet: also, less caddle means less incentives for deforestation\n']

Cluster  3
['LucasB: This would have severe positive environmental advantages', 'LucasB: This would be much more ethical ', 'LucasB: Agree, in terms of cost, is an extremely -therefore not sustainable- expensive lifestyle']

Cluster  4
['Kirk Hammet: Veganism is a choice, it should not be forced - especially to children\n', 'Kirk Hammet: If we raise our children with the same values like our generation did, then we are just fuelling the same viscous circle that 

In [36]:
import pandas as pd

In [63]:
for num_clusters in [5]:#,3,4,5,6,8,10,12,16]:
    # num_clusters = 5
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    clustered_sentences = [[] for i in range(num_clusters)]

    df = pd.DataFrame(columns=['cluster_id','sentence','sentence_embedding'])

    df["cluster_id"] = cluster_assignment
    df["sentence"] = e_corpus
    df["sentence_embedding"] = corpus_embeddings
    # for sentence_id, cluster_id in enumerate(cluster_assignment):
        # clustered_sentences[cluster_id].append(e_corpus[sentence_id])

    print(cluster_assignment)
    # for i, cluster in enumerate(clustered_sentences):
    #     print("Cluster ", i+1)
    #     print(cluster)
    #     print("")

[3 2 2 2 1 1 2 2 1 3 2 1 4 2 3 0]


In [42]:
from yellowbrick.text import TSNEVisualizer
from sklearn.manifold import TSNE

In [48]:
num_clusters = 3
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]

# tsne = TSNEVisualizer()
# tsne.fit(corpus_embeddings, cluster_assignment)
tsne_dims = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(corpus_embeddings)

df = pd.DataFrame(columns=['cluster_id','sentence','sentence_embedding','tsne_x','tsne_y'])

df["cluster_id"] = cluster_assignment
df["sentence"] = e_corpus
# df["sentence_embedding"] = corpus_embeddings
df["tsne_x"] = tsne_dims[:,0]
df["tsne_y"] = tsne_dims[:,1]

In [76]:
all_viz_data ={}
for num_clusters in [2,3,4,5,6,7,8]:
    print(num_clusters)
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    clustered_sentences = [[] for i in range(num_clusters)]

    # tsne = TSNEVisualizer()
    # tsne.fit(corpus_embeddings, cluster_assignment)
    tsne_dims = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(corpus_embeddings)

    df = pd.DataFrame(columns=['cluster_id','sentence',
    # 'sentence_embedding',
    'tsne_x','tsne_y'])

    df["cluster_id"] = cluster_assignment
    df["sentence"] = e_corpus
    # df["sentence_embedding"] = corpus_embeddings[:,:]
    df["tsne_x"] = tsne_dims[:,0]
    df["tsne_y"] = tsne_dims[:,1]

    all_viz_data[num_clusters] = df

2
3
4
5
6
7
8


In [77]:
all_viz_data[2]

Unnamed: 0,cluster_id,sentence,tsne_x,tsne_y
0,0,LucasB: This would have severe positive enviro...,48.414768,-128.079865
1,1,Page Jimmy: All the environment benefits can b...,-33.802704,-48.748253
2,1,Plant Robert: Only vegetarian would be good fo...,-161.982635,72.538116
3,1,You either go fully vegan or you are just kid...,-124.689445,-86.382256
4,0,"Kirk Hammet: Agree, think of CO2 emissions sav...",-115.819435,152.812057
5,0,"Kirk Hammet: also, less caddle means less ince...",113.26664,128.961166
6,1,LucasB: Enforcing what to eat to people is aga...,48.959633,-36.59071
7,1,"Kirk Hammet: Veganism is a choice, it should n...",28.158901,107.445175
8,0,Kirk Hammet: If we raise our children with the...,-47.268261,-136.882111
9,0,LucasB: This would be much more ethical,-59.908504,84.362259


In [78]:
all_viz_data[3]

Unnamed: 0,cluster_id,sentence,tsne_x,tsne_y
0,0,LucasB: This would have severe positive enviro...,5.281987,35.591003
1,2,Page Jimmy: All the environment benefits can b...,36.936951,42.17561
2,2,Plant Robert: Only vegetarian would be good fo...,-3.363128,-51.439823
3,2,You either go fully vegan or you are just kid...,27.12063,-28.565918
4,1,"Kirk Hammet: Agree, think of CO2 emissions sav...",-8.150981,-18.694504
5,1,"Kirk Hammet: also, less caddle means less ince...",14.506747,4.977847
6,0,LucasB: Enforcing what to eat to people is aga...,-16.512831,11.884882
7,1,"Kirk Hammet: Veganism is a choice, it should n...",-50.128983,63.798298
8,1,Kirk Hammet: If we raise our children with the...,-25.656141,43.282726
9,0,LucasB: This would be much more ethical,-39.144489,-11.99685


In [81]:
import pickle
print(pickle.format_version)

4.0


In [80]:
pickle.dump(all_viz_data, open("example_debate_1_processed.pkl",'wb'))

In [82]:
print(pd.__version__)

1.4.1
