## 1. Load KoBert Model

In [None]:
!pip install ipywidgets  # for vscode
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

In [None]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the KoBERT PyTorch model together with its vocabulary; downloaded files
# are cached under the relative directory ".cache".
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [None]:
# Read the test TSV keeping only field 0 of every row; the first line is
# discarded (num_discard_samples=1) — presumably the header row, TODO confirm.
dataset_test = nlp.data.TSVDataset('/Assets/test_set_50.tsv', field_indices=[0], num_discard_samples=1)

In [None]:
# Fetch the KoBERT tokenizer and wrap it, together with the model vocabulary,
# in gluonnlp's BERTSPTokenizer. lower=False: input text is not lower-cased.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
class BERTDataset(Dataset):
    """Dataset of BERT-ready inputs built from one text field of ``dataset``.

    Each row's ``sent_idx`` field is run through
    ``nlp.data.BERTSentenceTransform`` once, at construction time. Indexing
    returns the stored transform result unchanged; no labels are kept.
    """

    def __init__(self, dataset, sent_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # The transform expects a list of sentences, hence the one-element list.
        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

    def __getitem__(self, i):
        """Return the transformed sample stored at position ``i``."""
        return self.sentences[i]

    def __len__(self):
        """Return the number of transformed samples."""
        return len(self.sentences)

In [None]:
## Setting parameters
# max_len       -> max_seq_length of the BERT sentence transform
# batch_size    -> batch size of the test DataLoader
# learning_rate -> lr passed to AdamW
# NOTE(review): warmup_ratio, num_epochs, max_grad_norm and log_interval are
# not referenced by any cell visible in this notebook — presumably leftovers
# from a training notebook; confirm before relying on them.
max_len = 256
batch_size = 16
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
# Transform field 0 of every test row into BERT inputs (pad=True, pair=False)
# and serve them in batches of `batch_size`; no shuffle argument is passed.
# NOTE(review): the truncated warning in the saved output may be caused by
# num_workers=5 exceeding the CPUs available on the host — confirm.
data_test = BERTDataset(dataset_test, 0, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [None]:
class BERTClassifier(nn.Module):
    """Wrap a BERT encoder and return its pooled sentence embedding.

    A linear ``classifier`` head is constructed, but ``forward`` does not
    apply it: the (optionally dropped-out) pooled output is returned as-is.

    Args:
        bert: encoder module, called with ``input_ids``, ``token_type_ids``
            and ``attention_mask``; its second output is taken as the pooled
            vector.
        hidden_size: input width of the linear head.
        num_classes: output width of the linear head.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value disables dropout.
        params: unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1.0 on the valid prefix of each row.

        Args:
            token_ids: 2-D tensor (rows x sequence positions).
            valid_length: one length per row (tensor or sequence of ints).

        Returns:
            Float tensor shaped like ``token_ids`` on the same device, with
            1.0 on the first ``valid_length[i]`` positions of row ``i`` and
            0.0 elsewhere.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        # Broadcasting compares every position with its row's length at once,
        # replacing the per-row Python loop.
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the pooled encoder output for a batch (dropout applied if configured)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Take the second output by index rather than tuple-unpacking:
        # unpacking a dict-like model output iterates over its keys and would
        # bind `pooler` to a string. Indexing works for plain tuples as well.
        pooler = outputs[1]
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return out

In [None]:
# Wrap the loaded KoBERT encoder with default arguments (dr_rate=None, so no
# dropout) and move the module to `device`.
model = BERTClassifier(bertmodel).to(device)

In [None]:
# Split the model parameters into two optimizer groups: parameters whose name
# contains 'bias' or 'LayerNorm.weight' get no weight decay, all others use 0.01.
no_decay = ['bias', 'LayerNorm.weight']


def _is_no_decay(param_name):
    """Return True when ``param_name`` contains one of the ``no_decay`` markers."""
    return any(marker in param_name for marker in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters() if not _is_no_decay(name)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in model.named_parameters() if _is_no_decay(name)],
        'weight_decay': 0.0,
    },
]

In [None]:
# AdamW over the grouped parameters, plus a cross-entropy loss.
# NOTE(review): neither `optimizer` nor `loss_fn` is used by any cell visible
# in this notebook (it only runs inference) — confirm whether they are needed.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [None]:
import numpy as np
import os

# NOTE(review): a later cell loads '/Assets/numpy/test_set_(50, 768).npy',
# which is a different directory from this one — confirm how the file gets there.
save_path = '/Assets/kobert_output/test_set'

os.makedirs(save_path, exist_ok=True)

# Per-batch embeddings are collected here and concatenated once after the loop
# (np.append inside the loop would re-copy the whole array every iteration).
embedding_batches = []

model.eval()
# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        out = model(token_ids, valid_length, segment_ids)

        # float32 is required because X later feeds the k-means step.
        embedding_batches.append(out.cpu().numpy().astype(np.float32, copy=False))

        print('{} : {}'.format(batch_id, out.shape))

if embedding_batches:
    X = np.concatenate(embedding_batches, axis=0)
else:
    # Empty dataloader: keep the original (0, 768) float32 placeholder.
    X = np.empty((0, 768), dtype=np.float32)

# The array shape is embedded in the file name, e.g. 'test_set_(50, 768).npy'.
np.save(os.path.join(save_path, 'test_set_{}.npy'.format(X.shape)), X)

  cpuset_checked))


  0%|          | 0/4 [00:00<?, ?it/s]

0 : torch.Size([16, 768])
1 : torch.Size([16, 768])
2 : torch.Size([16, 768])
3 : torch.Size([2, 768])


## 2. K-Means Clustering (Generate csv file)

In [None]:
!pip install faiss-cpu
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 7.6 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 170 kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import faiss
import pickle
import os

def exec_knn(data, save_path, k=10, niter=100, gpu=None):
    """Cluster embedding vectors with faiss k-means and pickle the result.

    Despite its name, this runs k-means clustering, not k-nearest-neighbours.

    Two files are written into ``save_path``:
    ``0616_labels_k=<k>_iter=<niter>.pickle`` (cluster id per input row) and
    ``0616_cluster_centers_k=<k>_iter=<niter>.pickle`` (the centroids).

    Args:
        data: 2-D array of shape (n_samples, dim). It is converted to a
            C-contiguous float32 array before being handed to faiss.
        save_path: output directory; created if it does not exist.
        k: number of clusters.
        niter: number of k-means iterations.
        gpu: forwarded to ``faiss.Kmeans``. ``None`` (default) enables the GPU
            only when the installed faiss build reports at least one GPU.

    Raises:
        ValueError: if ``data`` is not a non-empty 2-D array.
    """
    data = np.ascontiguousarray(data, dtype=np.float32)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError(
            'data must be a non-empty 2-D array, got shape {}'.format(data.shape))
    dim = data.shape[1]

    if gpu is None:
        # CPU-only faiss builds do not expose get_num_gpus at all.
        gpu = getattr(faiss, 'get_num_gpus', lambda: 0)() > 0

    # K-means clustering
    kmeans = faiss.Kmeans(d=dim, k=k, niter=niter, verbose=True, gpu=gpu)
    kmeans.train(data)

    cluster_centers_ = kmeans.centroids
    # Label of each sample = index of its nearest centroid.
    labels_ = kmeans.index.search(x=data, k=1)[1].reshape(-1)

    # save
    os.makedirs(save_path, exist_ok=True)

    print('Saving Labels ...')
    with open(os.path.join(save_path, '0616_labels_k={}_iter={}.pickle'.format(k, niter)), 'wb') as f:
        pickle.dump(labels_, f, pickle.HIGHEST_PROTOCOL)

    print('Saving Cluster Centers ...')
    with open(os.path.join(save_path, '0616_cluster_centers_k={}_iter={}.pickle'.format(k, niter)), 'wb') as f:
        pickle.dump(cluster_centers_, f, pickle.HIGHEST_PROTOCOL)

In [None]:
import numpy as np
np_path = '/Assets/numpy/0616_final_(365, 768).npy'
data = np.load(np_path)

# Write the label/centroid pickles to '/Assets/pickle', which is where the
# later cells load them from, instead of next to the .npy file.
exec_knn(data=data, save_path='/Assets/pickle')

Saving Labels ...
Saving Cluster Centers ...


In [None]:
import pandas as pd
from collections import defaultdict
import csv

def get_rest_name(label_path, f_path):
    """Write one CSV of restaurant names per cluster.

    For every distinct cluster label a file ``cluster_no=<label>.csv`` is
    written into the directory that contains ``label_path``. Each file has a
    single data column, named after the label, listing the ``rstrt_name``
    values of the rows assigned to that cluster.

    Args:
        label_path: pickle file holding one cluster label per data row, as an
            object with a ``tolist()`` method (e.g. a numpy array).
        f_path: tab-separated file, read with its first column as the index,
            containing a ``rstrt_name`` column. Row order must match the labels.

    Raises:
        ValueError: if the number of labels differs from the number of rows
            (pairing them would otherwise silently drop the surplus).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(label_path, 'rb') as f:
        labels = pickle.load(f).tolist()

    names = pd.read_csv(f_path, sep='\t', index_col=0)['rstrt_name'].tolist()

    if len(labels) != len(names):
        raise ValueError(
            'label count ({}) does not match row count ({})'.format(len(labels), len(names)))

    names_by_cluster = defaultdict(list)
    for cluster_id, name in zip(labels, names):
        names_by_cluster[cluster_id].append(name)

    # The output directory does not depend on the cluster, so compute it once.
    save_path = os.path.dirname(label_path)
    for cluster_id, cluster_names in names_by_cluster.items():
        cluster_df = pd.DataFrame({cluster_id: cluster_names})
        cluster_df.to_csv(os.path.join(save_path, 'cluster_no={}.csv'.format(cluster_id)))

  

# final
# Inputs: the pickled cluster labels (file name indicates k=10, 100 iterations)
# and the TSV that provides the `rstrt_name` column. The per-cluster CSV files
# are written next to the label pickle.
label_path = '/Assets/pickle/0616_labels_k=10_iter=100.pickle'
f_path = '/Assets/0616_final.tsv'

get_rest_name(label_path=label_path, f_path=f_path)

## 3. 여기서부터 test set 반복해서 사용

In [None]:

# Number of restaurants recommended for each test row.
TOP_N = 5

# 1. Load the embedding vectors of the test set and of the clustered (train) set.
test_set = np.load('/Assets/numpy/test_set_(50, 768).npy')
train_set = np.load('/Assets/numpy/0616_final_(365, 768).npy')

# 2. Load the k-means centroids and the cluster label of every train row.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/Assets/pickle/0616_cluster_centers_k=10_iter=100.pickle', 'rb') as f:
    k_centers = pickle.load(f)

with open('/Assets/pickle/0616_labels_k=10_iter=100.pickle', 'rb') as f:
    label = pickle.load(f)

label_list = label.tolist()
label_arr = np.asarray(label_list)
if len(label_arr) != len(train_set):
    raise ValueError(
        'label count ({}) does not match train embedding count ({})'.format(
            len(label_arr), len(train_set)))

# Assign every test vector to its nearest centroid (Euclidean distance);
# argmin keeps the first centroid on ties.
centers_arr = np.asarray(k_centers)
test_k_ids = [
    int(np.linalg.norm(centers_arr - test_vec, axis=1).argmin())
    for test_vec in test_set
]

# 3. Within the assigned cluster, return the TOP_N closest restaurants.
train_df = pd.read_csv('/Assets/0616_final.tsv', sep='\t', index_col = 0)
rstrt_nm = train_df['rstrt_name'].tolist()
if len(rstrt_nm) != len(train_set):
    raise ValueError(
        'restaurant name count ({}) does not match train embedding count ({})'.format(
            len(rstrt_nm), len(train_set)))

test_df = pd.read_csv('/Assets/test_set_50.tsv', sep='\t', index_col = 0)

# Train row indices grouped by cluster, computed once instead of rescanning
# the whole label list for every test row.
members_by_cluster = {
    cluster_id: np.flatnonzero(label_arr == cluster_id)
    for cluster_id in set(test_k_ids)
}

result_list = []
for test_vec, target_k in zip(test_set, test_k_ids):
    sample_ids = members_by_cluster[target_k]
    dists = np.linalg.norm(train_set[sample_ids] - test_vec, axis=1)
    # Stable sort: equal distances keep their original row order.
    nearest_ids = sample_ids[np.argsort(dists, kind='stable')[:TOP_N]]
    result_list.append([rstrt_nm[idx] for idx in nearest_ids])

test_df['Recommend list'] = result_list
test_df.to_csv('/Assets/k=10_recommend.tsv', sep="\t")
