In [29]:
from gensim.models import KeyedVectors
import csv
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [28]:
# Load the local GloVe text file (50d, 100d, 200d or 300d variants exist).
# GloVe files carry no "<vocab> <dim>" header line, hence no_header=True.
glove_file = 'glove/glove.6B.50d.txt'
wv_base = KeyedVectors.load_word2vec_format(
    glove_file,
    binary=False,
    no_header=True,
)

In [30]:
# Collect every cell of the keyword CSV into one flat list
with open('keywords_new_train.csv', newline='') as csvfile:
    keywords = [cell for row in csv.reader(csvfile) for cell in row]

In [31]:
# Exclude duplicates while keeping first-seen order.
# list(set(...)) would order the strings by their (per-process randomized)
# hash, so the row order fed to KMeans -- and therefore the resulting
# clusters -- could differ between kernel restarts.
keywords = list(dict.fromkeys(keywords))

In [32]:
def calculate_embeddings(keyword, wv=None):
    """Return the mean embedding of the words in ``keyword``.

    The keyword is split on whitespace and each word is looked up in the
    embedding model. Words that are not in the model are excluded from the
    average: they add nothing to the sum and do not count in the divisor.

    Args:
        keyword: A single keyword or a multi-word phrase.
        wv: Optional embedding lookup supporting ``word in wv``, ``wv[word]``
            and a ``vector_size`` attribute. Defaults to the notebook-level
            ``wv_base`` model.

    Returns:
        A 1-D array of length ``wv.vector_size``. It is all zeros when none
        of the words is present in the model.
    """
    if wv is None:
        wv = wv_base
    # split() (no argument) ignores repeated/leading/trailing whitespace, so
    # no empty tokens end up in the lookup.
    known = [wv[word] for word in keyword.split() if word in wv]
    if not known:
        # Dimension comes from the model so 100d/200d/300d files work too.
        return np.zeros(wv.vector_size)
    return np.mean(known, axis=0)

In [33]:
# Feature matrix for K-means: one averaged embedding per keyword
X = [calculate_embeddings(kw) for kw in keywords]

In [34]:
# Sanity check: every embedding must have the dimensionality of the loaded
# GloVe model. The expected size is read from the model so the check stays
# valid when a 100d/200d/300d file is loaded instead of the 50d one.
for embedding in X:
    assert len(embedding) == wv_base.vector_size

In [35]:
# Cluster the keyword embeddings into 5 groups (fixed seed)
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X)

In [36]:
# Second sanity check: the model must yield exactly one label per keyword
labels = list(kmeans.labels_)
assert len(keywords) == len(labels)

In [37]:
# Bucket the keywords by their cluster label
groups = {}
for label, keyword in zip(labels, keywords):
    groups.setdefault(label, []).append(keyword)

In [38]:
# Size of the largest cluster (0 when there are no clusters)
max_len = max((len(members) for members in groups.values()), default=0)

In [39]:
# Pad every cluster with empty strings up to max_len so that all columns
# have equal length
for members in groups.values():
    members.extend([''] * (max_len - len(members)))

In [40]:
# One column per cluster, columns ordered by label, written without the index
df = pd.DataFrame(dict(groups))
df = df.reindex(columns=sorted(df.columns))
df.to_csv('out_new_train.csv', index=False)