In [1]:
import numpy as np
import pandas as pd
import torch
import re
from model import *
torch.set_printoptions(precision=8)

In [2]:
# seed 고정하기
import random
import os
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
seed_everything()

### Dataset 구성

In [3]:
import copy
path = '../Transformer'

def make_processed_data(path):
    '''
    기업명과 사업자번호를 key로 사용하려고 했으나, 하나의 기업명에 대해 여러개의 사업자번호가 맵핑되어있는 경우가 발생
    따라서 하나의 기업을 의미하면, 하나의 key로 맵핑될 수 있도록 처리
    '''
    data_path = path + '/data'
    df_biz = pd.read_csv(data_path + '/patent_companies.tsv', sep = '\t', parse_dates=['출원일자'])
    number2embedding = dict(zip(np.load(data_path + '/number.npy', allow_pickle = True), np.load(data_path + '/embedding.npy')))
    df_biz['embedding'] = df_biz['출원번호'].map(number2embedding)
    df_biz['사업자번호'] = df_biz['사업자번호'].apply(lambda x: re.sub('-','',x))
    biz2common = pd.read_table('./biz2common.tsv',dtype={'사업자번호':'object','index':'object'})
    biznum2idx = dict(zip(biz2common['사업자번호'],biz2common['index']))
    df_biz['key'] = df_biz['사업자번호'].map(biznum2idx)
    df_biz['key'] = df_biz['key'].astype(object)
    
    df_itr = pd.read_csv('./mna_checking_patent.csv', parse_dates=['contract_date'])
    df_itr =  df_itr[df_itr['contract_date'].notna()]
    return df_biz, df_itr

df, df_itr = make_processed_data(path)

### interaction이 발생한 회사들 기준으로 train, test corp split

In [4]:
import pandas as pd

# target 기업 중 interaction 이전에 특허가 없는 경우 -> index=468
df_itr = df_itr.drop(index=468)
drop_key = ['12526', '96244', '96244', '111569', '94218', '69689', '66197', '96244']
drop_key = set(map(lambda x:int(x),drop_key))
for key in drop_key:
    idx = df_itr[(df_itr['corp_key'].isin(drop_key)) | (df_itr['partner_key'].isin(drop_key))].index
    df_itr = df_itr.drop(index=idx,axis=0)

In [5]:
def compare_corp(df_patent, df_itr):
    '''
    의뢰기업, 대상기업의 최근 출원일자 기준으로 구분
    '''
    key_dates = df_patent.groupby('key')['출원일자'].max()

    df_itr_copy = df_itr.copy()
    df_itr_copy['corp_key'] = df_itr_copy['corp_key'].astype(str)
    df_itr_copy['partner_key'] = df_itr_copy['partner_key'].astype(str)
    df_itr_copy['client_key'] = df_itr_copy['corp_key']
    df_itr_copy['target_key'] = df_itr_copy['partner_key']

    key_dates_df = key_dates.to_frame('max_date').reset_index()
    df_itr_copy = df_itr_copy.merge(key_dates_df, left_on='corp_key', right_on='key')
    df_itr_copy = df_itr_copy.merge(key_dates_df.rename(columns={'max_date': 'partner_max_date'}), left_on='partner_key', right_on='key')

    client_mask = df_itr_copy['max_date'] >= df_itr_copy['partner_max_date']
    df_itr_copy.loc[client_mask, 'client_key'] = df_itr_copy.loc[client_mask, 'corp_key'].astype(str)
    df_itr_copy.loc[~client_mask, 'client_key'] = df_itr_copy.loc[~client_mask, 'partner_key'].astype(str)
    df_itr_copy.loc[client_mask, 'target_key'] = df_itr_copy.loc[client_mask, 'partner_key'].astype(str)
    df_itr_copy.loc[~client_mask, 'target_key'] = df_itr_copy.loc[~client_mask, 'corp_key'].astype(str)
    
#     client_mask = df_itr_copy['max_date'] >= df_itr_copy['partner_max_date']
#     df_itr_copy['client_key'] = df_itr_copy['corp_key'].astype(str)
#     df_itr_copy['target_key'] = df_itr_copy['partner_key'].astype(str)
#     df_itr_copy.loc[~client_mask, ['client_key', 'target_key']] = df_itr_copy.loc[~client_mask, ['partner_key', 'corp_key']].astype(str)
    return df_itr_copy


def filter_by_contract_date(df_patent, df_itr, key):
    '''
    interaction 발생일자 이전의 데이터 추출
    '''
    filtered_df_patent = pd.DataFrame()
    not_exist_patent = []
    for index, row in df_itr.iterrows():
        contract_date = row['contract_date']
        if key == 'client_key':
            corp_key = row['client_key']
            target_key = row['target_key']
        else:
            corp_key = row['target_key']
            target_key = row['client_key']
        filtered_data = df_patent[(df_patent['출원일자'] <= contract_date) & (df_patent['key'] == corp_key)].sort_values(by='출원일자', ascending=False)
        if filtered_data.empty:
            not_exist_patent.append((corp_key, target_key))
        else:
            filtered_data['identifier'] = str(index)
            filtered_df_patent = pd.concat([filtered_df_patent, filtered_data])
    return filtered_df_patent, not_exist_patent

def create_by_contract_date(df_patent, df_itr, key_pairs, not_exist_patent):
    result = []

    target_keys = key_pairs['target_key']
    df_patent_filtered = df_patent[df_patent['key'].isin(target_keys)]
    df_patent_sorted = df_patent_filtered.sort_values(by='출원일자', ascending=False)

    for _, key_pair in key_pairs.iterrows():
        client_key = key_pair['client_key']
        target_key = key_pair['target_key']
        if ((client_key, target_key) in not_exist_patent) or ((target_key, client_key) in not_exist_patent):
            continue
        filtered_data = df_patent_sorted[df_patent_sorted['key'] == target_key]
        if not filtered_data.empty:
            last_data = filtered_data.iloc[0]
            result.append(last_data['embedding'])

    return result


def interaction_patent(df_patent, df_itr):
    '''
    client_key : 의뢰기업 key
    target_key : 대상기업 key
    '''
    df_itr = df_itr[df_itr['corp_patent'] & df_itr['partner_patent']]
    df_itr = compare_corp(df_patent, df_itr)
    client_key = df_itr['client_key']
    target_key = df_itr['target_key']
    concat_key = pd.concat([client_key,target_key])
    key_pairs = pd.concat([client_key, target_key], axis=1)
    
    df_train = df_patent[~df_patent['key'].isin(concat_key)]
    
    temp = df_patent[df_patent['key'].isin(client_key)]
    df_test, not_exist_patent = filter_by_contract_date(temp, df_itr, 'client_key')
    temp_patent, _ = filter_by_contract_date(df_patent, df_itr, 'target_key')
    patent_array = create_by_contract_date(temp_patent, df_itr, key_pairs, not_exist_patent)
    
    return df_train, df_test, patent_array
    
df_train, df_test, patent_array = interaction_patent(df, df_itr)

### develop 단계에서만 필요 -> file: dataset / model / train / annoy / inference / measure   .py로 작업 

In [6]:
import copy
def preprocess_dataset(df_org,mode):
    '''
    df_above: 기업에서 낸 특허가 2개 이상인 경우
    df_under : 기업에서 낸 특허가 2개 이하인 경우, 기업에서 낸 마지막 특허를 바로 ANN 알고리즘으로 찾기 
    '''
    df = copy.deepcopy(df_org)
    # df = df[df['출원일자'].notna()]
    col = 'key' if mode == 'train' else 'identifier'
    df_above,df_under = ck_above_min(df,col)
    df_under = df_under.drop_duplicates(subset=col,keep='last')
    count_table, count_dict,key2idx, idx2key = make_idx_map(df_above,col)
    df_above = pd.merge(df_above,count_table[[col,'index']],on=col)
    df_above = sort_by_filing_date(df_above,col)
    return df_above,df_under,count_table,count_dict,key2idx,idx2key

# sequential성을 따지기 위해서 회사가 보유한 특허가 3개 이상은 되어야 함. -> 그래야 sequential한 모델링 가능 
def ck_above_min(df,col):
    temp = df[col].value_counts().reset_index()
    temp['above_min'] = temp['count'] >= 2
    df_count = pd.merge(df,temp[[col,'above_min']],on=col,how='inner')
    df_above = df_count[df_count['above_min']==True].drop('above_min',axis=1)
    df_under = df_count[df_count['above_min']==False].drop('above_min',axis=1)
    return df_above,df_under

def make_idx_map(df,col):
    count_table = df[col].value_counts().reset_index().reset_index()
    count_dict = dict(zip(count_table['index'],count_table['count']))
    key2idx = dict(zip(count_table[col],count_table['index']))
    idx2key = dict(zip(count_table['index'],count_table[col]))
    return count_table, count_dict,key2idx, idx2key
    
def sort_by_filing_date(df,col):
    return df.groupby(col).apply(lambda x:x.sort_values(by='출원일자'))


In [7]:
df_biz,df_under,count_table, count_dict, key2idx,idx2key = preprocess_dataset(df_train,'train')
df_biz_test,df_under_test,count_table_test,count_dict_test,key2idx_test,idx2key_test = preprocess_dataset(df_test,'test')

### Sequential하게 구성하기

In [8]:
# setting
def make_args(count_dict):
    import argparse

    # default args
    args = argparse.Namespace(
        batch_size=16,
        d_embed=768,
        max_len = 1024,
        initializer_range = 0.02,
        num_epochs = 5,
        num_heads = 4,
        num_layers = 2,
        dropout_rate = 0.2,
        lr = 0.005,
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        num_items = len(count_dict)
        annoy_path = './model_pt/patent_ann.annoy',
        n_trees = 20,
        n = 100
    )
    return args
def make_seq_main(args,df_biz,count_dict):
    def make_corp_seq(args,df):
        embedding_dict = {} 
        for idx,cnt in count_dict.items():
            seq = cnt
            part_sequence = []
            if seq >= args.max_len:
                corp_seq = df[df['index']== idx]['embedding'][-(args.max_len + 2) : ]
            else:
                corp_seq = df[df['index']== idx]['embedding']
            for i in range(len(corp_seq)):
                part_sequence.append(corp_seq[i])
            embedding_matrix = np.array(part_sequence).reshape(-1,args.d_embed)
            embedding_dict[idx] = embedding_matrix
        return embedding_dict
    corp_seq = make_corp_seq(args,df_biz)
    return corp_seq
corp_seq = make_seq_main(make_args(count_dict),df_biz,count_dict)

In [9]:
args = make_args(count_dict)

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataset = Dataset(args, corp_seq, data_type="train")
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset, sampler=train_sampler, batch_size=args.batch_size
)

eval_dataset = Dataset(args, corp_seq, data_type="valid")
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
    eval_dataset, sampler=eval_sampler, batch_size=args.batch_size
)


In [12]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

### 불러와서 사용

In [52]:
import torch,math
class TransformerLoss(torch.nn.Module):
    def __init__(self, args):
        super(TransformerLoss, self).__init__()
        self.hidden_size = args.d_embed
        self.max_seq_length = args.max_len

    def forward(self, output, target, mask):
        
        output_emb = output.view(-1, self.hidden_size)
        target_emb = target.view(-1, self.hidden_size)

        output_norm = torch.norm(output_emb, dim=-1)
        target_norm = torch.norm(target_emb, dim=-1)
        
        output_norm = output_norm.clamp(min=1e-10)
        target_norm = target_norm.clamp(min=1e-10)
        
        logits = torch.sum(target_emb * output_emb, -1) / (output_norm * target_norm)
        angular_distance = 1 - torch.arccos(logits) / torch.tensor(math.pi)
        istarget = (mask > 0).view(mask.size(0) * self.max_seq_length).float()
        # print(((logits) + 1e-24) * istarget)
        loss = torch.sum(-torch.log((logits) + 1e-24) * istarget) / torch.sum(istarget)
        return loss


In [54]:
class Trainer:
    def __init__(
        self,
        model,
        train_dataloader,
        eval_dataloader,
        args,
    ):

        self.args = args
        self.device = args.device
        self.lr = args.lr
        self.num_epochs = args.num_epochs
        self.model = model

        # Setting the train and test data loader
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        
        self.criterion = TransformerLoss(self.args)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)


    def train(self, epoch):
        self.iteration(epoch, self.train_dataloader)

    def valid(self, epoch):
        return self.iteration(epoch, self.eval_dataloader, mode="valid")
    
    def save(self, file_name):
        torch.save(self.model.cpu().state_dict(), file_name)
        self.model.to(self.device)

    def load(self, file_name):
        self.model.load_state_dict(torch.load(file_name))
        
    def iteration(self, epoch, dataloader, mode="train"):

        # Setting the tqdm progress bar
        
        tbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{self.num_epochs}")
        if mode == "train":
            self.model.train()
            total_loss = 0.0
            for corp_id, mask, input_seq, target_pos, _ in tbar: ##
                mask = mask.to(self.device)
                input_seq, target_pos = input_seq.to(self.device), target_pos.to(self.device)
                output = self.model(input_seq, mask=mask)
                self.optimizer.zero_grad()
                loss = self.criterion(output, target_pos, mask) ##
                total_loss += loss.item()
                loss.backward()
                self.optimizer.step()
                
            avg_loss = total_loss / len(dataloader)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Train Loss: {avg_loss:.4f}")
            self.save(f'./model_pt/model_state_dict_epoch{epoch}.pt') 

        else:
            self.model.eval()
            total_loss = 0.0
            
            for corp_id, mask, input_seq, target_pos, _ in tbar: ##
                mask = mask.to(self.device)
                input_seq, target_pos = input_seq.to(self.device), target_pos.to(self.device)
                output = self.model(input_seq, mask=mask)
                loss = self.criterion(output, target_pos, mask) ##
                total_loss += loss.item()
                
            avg_loss = total_loss / len(dataloader)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Valid Loss: {avg_loss:.4f}")
        
        

In [None]:
from model import *
# setting
args = make_args(count_dict)
model = setting_model(args)

trainer = Trainer(model,
        train_dataloader,
        eval_dataloader,
        args)

for epoch in range(args.num_epochs):
    trainer.train(epoch)
    trainer.valid(epoch)

### test dataset으로 넘어온 의뢰기업에 대해서
1. make_corp_seq해서 sequential data로 만들기
2. Dataset에 type = 'submission' 형태로 넣어서 만들기 -> answer로 넘어올 것 
3. 모델에 inference해서 나온 벡터 준비 완료
### test dataset으로 넘어온 대상 기업에 대해서
1. 마지막 embedding vector만 가져오기
### cosine 유사도 구하는 부분
1. 모델에 inference해서 나온 벡터와 마지막 embedding vector간의 유사도 구하기
2. measure 완료

In [56]:
import math
import numpy as np

class Measure:
    '''
    Model에 대한 Measure를 측정하는 부분
    '''
    def __init__(self,args,model,df_biz_test,target):
        self.args = args
        self.corp_seq = make_seq_main(self.args,df_biz_test,count_dict_test)
        self.model = model
        self.target = target
        self.test_dataset = Dataset(self.args, self.corp_seq, data_type="submission")
        self.test_sampler = SequentialSampler(self.test_dataset)
        self.test_dataloader = DataLoader(
        self.test_dataset, sampler=self.test_sampler, batch_size=len(self.corp_seq)
        )
    
    def calculate_similarity(self,output,target):
        cosine_sim = np.sum(output * target, -1)/(np.linalg.norm(output,axis=-1) * np.linalg.norm(target,axis=-1))
        angular_distance = (1 - np.arccos(cosine_sim) / math.pi)
        return angular_distance
    
    def inference(self):
        device = torch.device("cuda")
        with torch.no_grad():
            self.model.eval()
            tbar = tqdm(self.test_dataloader)
            for corp_id, mask, input_seq, target_pos, _ in tbar: ##
                mask = mask.to(device)
                input_seq, target_pos = input_seq.to(device), target_pos.to(device)
                output = self.model(input_seq, mask=mask)
                output = output[:, -1, :].cpu().data.numpy()
                total_sim = np.sum(self.calculate_similarity(output,self.target),-1) / len(self.corp_seq)

        return total_sim

In [57]:
patent_array = np.array(patent_array)
def infer(epoch):
    model = setting_model(args)
    model.load_state_dict(torch.load(f'./model_pt/model_state_dict_epoch{epoch}.pt'))
    measure = Measure(make_args(count_dict_test),model,df_biz_test,patent_array)
    return measure.inference()

In [60]:
for i in range(9):
    print(infer(i))

100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


0.716661807586721


100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


0.7178344216668051


100%|██████████| 1/1 [00:01<00:00,  1.53s/it]


0.7148797913817913


100%|██████████| 1/1 [00:01<00:00,  1.36s/it]


0.708952355954603


100%|██████████| 1/1 [00:01<00:00,  1.32s/it]


0.7147698385364968


100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


0.7131158278434557


100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


0.7134164706342006


100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


0.7117098583040509


100%|██████████| 1/1 [00:01<00:00,  1.45s/it]

0.7142746921095051





### ANNOY

annoy의 경우에는 index와 vector 정보만으로 동작하므로 각 vector에 대한 부가정보는 따로 저장을 해두어야합니다.

In [125]:
build_annoy(df_biz,args) 

In [None]:
pt_path = '/opt/ml/transformer_pytorch_custom/model_pt/model_state_dict_epoch0.pt'
key = '100494'
transformer = setting_model(make_args(count_dict))
transformer.load_state_dict(torch.load(pt_path))
annoy_index = AnnoyIndex(args.d_embed, 'angular')
annoy_index.load(args.annoy_path)
inference = Inference(args,corp_seq,key,transformer,annoy_index,count_dict,key2idx)
# score_dict = inference.scoring(df_biz)
inference_return = inference.find_candidates(df_biz)
df_inference = inference.filtering(inference_return)