In [None]:
import pickle
import random
from types import SimpleNamespace

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

# Read the training rows and shuffle them with a fixed seed, then renumber
# the index so positions run 0..n-1 in the shuffled order.
train_data = (
    pd.read_csv('data/train_data.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

model = SentenceTransformer('all-mpnet-base-v2')

# Distinct values of the "module" column; each one is treated as a document.
doc_mod = train_data["module"].unique()

# Encode in slices of 128 entries so tqdm can report progress per slice.
batched_mod = [doc_mod[start:start + 128] for start in range(0, len(doc_mod), 128)]
embeddings_mod = np.array(
    [vector for batch in tqdm(batched_mod) for vector in model.encode(batch)]
)

# Pair every document position with its embedding vector.
df = pd.DataFrame({
    'docid': list(range(len(doc_mod))),
    'vector': list(embeddings_mod),
})
old_id = df['docid'].tolist()
# Matrix with one row per document, rebuilt from the per-row vectors.
X = np.stack(df['vector'].tolist())
# Clustering hyper-parameters. The rest of this cell reads them with attribute
# access (args.k, args.c, args.seed, args.v_dim, args.bert_size), which a plain
# dict does not support (AttributeError), so they live in a namespace object.
#   v_dim     - width of the scratch matrix filled from X before clustering
#   bert_size - only used in the name of the pickled mapping file
#   seed      - random_state of the full KMeans estimator
#   k         - number of clusters requested at every split
#   c         - groups with at most this many documents are not split further
args = SimpleNamespace(v_dim=768, bert_size=768, seed=7, k=6, c=6)

# One list of code digits per document position; seeded by the top-level
# clustering and extended in place by classify_recursion.
new_id_list = []

# Full k-means, used for groups with fewer than 1000 documents.
kmeans = KMeans(n_clusters=args.k, max_iter=300, n_init=100, init='k-means++', random_state=args.seed, tol=1e-7)
# Mini-batch variant, used for the initial split and for groups of 1000+ documents.
# NOTE(review): random_state is hard-coded to 3 here rather than args.seed - confirm this is intended.
mini_kmeans = MiniBatchKMeans(n_clusters=args.k, max_iter=300, n_init=100, init='k-means++', random_state=3,
                              batch_size=1000, reassignment_ratio=0.01, max_no_improvement=20, tol=1e-7)

def classify_recursion(x_data_pos):
    """Recursively extend the code of every document in one cluster.

    Appends one digit per recursion level to ``new_id_list[pos]`` for each
    position in ``x_data_pos``. Works purely by side effect on the
    module-level ``new_id_list``.

    Args:
        x_data_pos: 1-D numpy array of row positions into ``X`` (and into
            ``new_id_list``) that currently share the same code prefix.

    Returns:
        None.

    Behaviour:
        * A group of exactly one document is left untouched.
        * A group of at most ``args.c`` documents is a leaf: each document
          gets its index within the group as the final digit.
        * A larger group is clustered into ``args.k`` clusters
          (``mini_kmeans`` for 1000+ documents, ``kmeans`` otherwise); each
          document gets its cluster label appended and every cluster is
          processed recursively.
        * If the clusterer puts every document into a single cluster, the
          group is treated as a leaf instead of recursing on the identical
          group again.
    """
    n_docs = x_data_pos.shape[0]

    # Leaf: small enough to be told apart by position within the group.
    if n_docs <= args.c:
        if n_docs == 1:
            return
        for idx, pos in enumerate(x_data_pos):
            new_id_list[pos].append(idx)
        return

    # Gather the group's vectors into a float64 scratch matrix of width
    # args.v_dim (the assignment fails loudly if X has a different width).
    temp_data = np.zeros((n_docs, args.v_dim))
    temp_data[:, :] = X[x_data_pos]

    clusterer = mini_kmeans if n_docs >= 1e3 else kmeans
    pred = clusterer.fit_predict(temp_data)

    # Guard against non-termination: when every point lands in one cluster
    # the recursive call would receive exactly this group again and repeat
    # until RecursionError. Fall back to enumerating the members; note that
    # the resulting final digit can exceed args.c - 1 in this case.
    if np.unique(pred).size < 2:
        for idx, pos in enumerate(x_data_pos):
            new_id_list[pos].append(idx)
        return

    for i in range(args.k):
        # Positions assigned to cluster i, in their original order.
        members = x_data_pos[pred == i]
        for pos in members:
            new_id_list[pos].append(i)
        classify_recursion(members)

    return

# Top-level split of the whole corpus; the label becomes the first code digit.
pred = mini_kmeans.fit_predict(X)
new_id_list.extend([label] for label in pred)

# Refine every top-level cluster; classify_recursion appends further digits
# to new_id_list in place.
for cluster_id in range(args.k):
    member_positions = [pos for pos, label in enumerate(pred) if label == cluster_id]
    classify_recursion(np.array(member_positions))

# Original document id -> list of code digits.
mapping = dict(zip(old_id, new_id_list))

with open(f'IDMapping_NQ_bert_{args.bert_size}_k{args.k}_c{args.c}_seed_{args.seed}.pkl', 'wb') as f:
    pickle.dump(mapping, f)

# Read back the mapping written above (a file produced by this notebook).
with open(f'IDMapping_NQ_bert_{args.bert_size}_k{args.k}_c{args.c}_seed_{args.seed}.pkl', "rb") as f:
    docid = pickle.load(f)

# Module name -> its position in doc_mod, which is also its key in docid.
mod_id = dict(zip(doc_mod, range(len(doc_mod))))

# Module name -> code digits as a numpy array.
mod_code = {}
for name, position in mod_id.items():
    mod_code[name] = np.array(docid[position])
print(mod_code)

{'bleach ammonia': array([1, 1, 0, 5, 0]),
 'garden & flora': array([0, 5, 0]),
 'stationery & printed material & services': array([1, 2, 1, 0]),
 'homecare merchandise': array([1, 2, 3, 1]),
 'skin conditioning moisturising': array([1, 4, 5, 1, 0]),
 'wine still light table styles': array([3, 2, 0]),
 'sugar candy': array([3, 0, 0, 0]),
 'snacks chips crisps reconstituted extruded': array([5, 1, 1, 0]),
 'skin cleansing & toning': array([1, 4, 5, 4]),
 'meat products fresh': array([5, 4, 0]),
 'dog food dry': array([5, 5, 0]),
 'meat cuts joints whole fresh fw': array([5, 4, 3]),
 'eggs egg products fresh': array([5, 0, 2, 0]),
 'chocolate single variety': array([5, 2, 2, 0]),
 'cough cold & other respiratory remedies & accessories': array([1, 0, 3, 0]),
 'fruit orange fresh fw': array([5, 3, 0, 0]),
 'cheese fresh fw': array([5, 0, 0, 0]),
 'vegetables salad vegetables remaining varieties ambient': array([0, 5, 1]),
 'milk substitutes non flavoured ambient': array([2, 5, 0, 0]),
 'cl

In [None]:
# Flatten each module's code digits into a comma-separated string and export
# one row per module.
mod_code_df = pd.DataFrame(
    [(name, ','.join(str(digit) for digit in code)) for name, code in mod_code.items()],
    columns=['module_name', 'code'],
)
mod_code_df.to_csv('mod_code.csv', index=False)
