### AI Science-Technology Similarity Research
- Made by: Sukhee Lee (Ph.D. Candidate) & Keungoui Kim (Ph.D.)
- Goal: Measuring Science-Technology Similarity
- Data set: Web of Science (WoS)

In [1]:
import torch

# Report the CUDA setup. The device-specific queries are only issued when CUDA
# is usable, because torch.cuda.current_device() / get_device_name() raise on
# CPU-only machines and would abort the first cell of the notebook.
cuda_available = torch.cuda.is_available()
print("Is CUDA available:", cuda_available)
print("CUDA device count:", torch.cuda.device_count())
if cuda_available:
    current_device = torch.cuda.current_device()
    print("Current device:", current_device)
    # Query the name of the device that is actually selected, not a fixed index.
    print("Device name:", torch.cuda.get_device_name(current_device))
else:
    print("Current device: none (running on CPU)")

Is CUDA available: True
CUDA device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4080 SUPER


#### Data Import & Preparation

In [2]:
# Base directory of the project's data files; other cells build file paths by
# concatenating this string with a file name (dir + '<file>.csv').
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the session, and the absolute local path ties the notebook to one machine.
dir = "H:/GD_awekimm/[YU]/[Research]/02_이석희/01_AI_scitech_similarity/"

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
import re

from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

import torch
from transformers import LongformerTokenizer, LongformerModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

import spacy
import torch
from transformers import BertTokenizer, BertModel

# Reset pandas' 'display.max_colwidth' option back to its default value.
pd.reset_option('display.max_colwidth')

In [7]:
# Load the patent-publication pair table from the project data directory.
pat_pub_similar_all = pd.read_csv(dir+'pat_pub_similar_all.csv')

In [8]:
# Size summary of the loaded table: distinct patents, distinct articles, and
# the total number of patent-publication rows.
print({
    'patent_count': pat_pub_similar_all.patent_id.nunique(),
    'article_count': pat_pub_similar_all.pubid.nunique(),
    'full length': pat_pub_similar_all.shape[0],
})

{'patent_count': 14293, 'article_count': 45296, 'full length': 76960}


In [12]:
# Preview the first 10 rows of the pair table.
pat_pub_similar_all.head(10)

Unnamed: 0,patent_id,pat_abs,patyear,pubid,pub_abs,pubyear,period,bert_cased_similar
0,7747044,A Bayesian belief networkbased architecture fo...,2010.0,2170002.0,Integration of various fingerprint matching al...,1999.0,1.0,0.778748
1,7277306,A CAM unit has a memory array for storing stor...,2007.0,54718459.0,Placement of errorcorrectingcode ECC systems o...,1991.0,1.0,0.859097
2,8065625,A GUI evaluation system includes an expression...,2011.0,10234.0,Designing user interfaces with consistent visu...,1997.0,1.0,0.718306
3,7395253,A Lagrangian support vector machine solves pro...,2008.0,852158.0,The tutorial starts with an overview of the co...,1998.0,1.0,0.833586
4,7395253,A Lagrangian support vector machine solves pro...,2008.0,1446523.0,A plane separating two point sets in ndimensio...,1999.0,1.0,0.761514
5,7395253,A Lagrangian support vector machine solves pro...,2008.0,1881981.0,Successive overrelaxation SOR for symmetric li...,1999.0,1.0,0.865539
6,7395253,A Lagrangian support vector machine solves pro...,2008.0,3948153.0,Smoothing methods extensively used for solving...,2001.0,1.0,0.783934
7,7395253,A Lagrangian support vector machine solves pro...,2008.0,4212244.0,Classification of human tumors according to th...,2001.0,1.0,0.84152
8,7395253,A Lagrangian support vector machine solves pro...,2008.0,55094576.0,We discuss progress in the development of auto...,1992.0,1.0,0.859736
9,7395253,A Lagrangian support vector machine solves pro...,2008.0,58224689.0,We consider the unconstrained minimization of ...,1995.0,1.0,0.855521


#### Measuring Similarity with Cross Encoder

In [None]:
from sentence_transformers import CrossEncoder

# Cross-Encoder Model
model = CrossEncoder('cross-encoder/stsb-roberta-base') # cross-encoder/ms-marco-MiniLM-L-12-v2

# Build every (patent abstract, publication abstract) pair up front. Zipping the
# two columns pairs them by row position, so this also works when the frame's
# index is not the default 0..n-1 range (label lookups via ['col'][i] would not).
abstract_pairs = list(zip(pat_pub_similar_all['pat_abs'], pat_pub_similar_all['pub_abs']))

# Score all pairs with one batched predict() call instead of one call per row.
# The model scores each pair independently, so results match the per-row loop
# up to floating-point noise while avoiding tens of thousands of tiny forward
# passes. Scores are returned in input order.
similarity_scores = model.predict(abstract_pairs, batch_size=32, show_progress_bar=True)

# Keep the original variable name and element type (one NumPy scalar per row).
similarity_score_list = list(similarity_scores)

pat_pub_similar_all['similarity_score'] = similarity_score_list
pat_pub_similar_all.to_csv(dir+'pat_pub_similar_all_250210.csv', index=False)

#### (OLD) Measuring Similarity with BERT Sentence Embeddings and Cosine Similarity

In [None]:
# BERT (cased) tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

# Load the spaCy English pipeline; its sentence boundaries are used below to
# split abstracts into sentences.
nlp = spacy.load("en_core_web_sm")

# Split a paragraph into sentences
def split_into_sentences(paragraph):
    """Return the text of every sentence found in ``paragraph``.

    The paragraph is parsed with the module-level spaCy pipeline ``nlp`` and
    the ``.text`` of each span in ``doc.sents`` is collected in iteration order.
    """
    sentence_texts = []
    for span in nlp(paragraph).sents:
        sentence_texts.append(span.text)
    return sentence_texts

# Build the list of single-sentence lists
def format_sentences_as_list(paragraph):
    """Split ``paragraph`` into sentences and wrap each one in its own list.

    Returns a list such as ``[['First sentence.'], ['Second sentence.']]``, so
    that every element can be handed to the tokenizer as a one-item batch.
    """
    wrapped = []
    for sentence in split_into_sentences(paragraph):
        wrapped.append([sentence])  # each sentence becomes ['sentence.']
    return wrapped

# Compute sentence embeddings
def get_embedding(texts, tokenizer, model):
    """Embed ``texts`` by mean-pooling the model's last hidden state.

    Args:
        texts: A string or list of strings to embed.
        tokenizer: Callable tokenizer returning a mapping that contains at
            least ``attention_mask`` (called with padding and truncation to
            512 tokens).
        model: Callable model whose output exposes ``last_hidden_state``.

    Returns:
        A tensor with one embedding row per input text.

    The average is taken over real tokens only. Positions added by padding are
    excluded via the attention mask, so batching texts of different lengths
    does not skew the embeddings. For a single text (no padding) this equals
    the plain mean over the token axis. An earlier version used the CLS token
    embedding ([:, 0, :]) instead of the mean.
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
        token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * token_mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings

# Compute the similarity between two paragraphs
def compute_paragraph_similarity(pat_abs, pub_abs, tokenizer, model):
    """Return the mean cosine similarity over all sentence pairs of two paragraphs.

    Args:
        pat_abs: Patent abstract text.
        pub_abs: Publication abstract text.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Model forwarded to ``get_embedding``.

    Returns:
        The average of the cosine similarities between every patent sentence
        and every publication sentence, or NaN when either paragraph yields no
        sentences (previously this case raised ZeroDivisionError).
    """
    # Split each paragraph into single-sentence lists
    pat_sentences = format_sentences_as_list(pat_abs)
    pub_sentences = format_sentences_as_list(pub_abs)

    # Nothing to compare: report "undefined" instead of dividing by zero.
    if not pat_sentences or not pub_sentences:
        return float("nan")

    # Embed sentence by sentence, then stack into (n_sentences, hidden) matrices
    pat_matrix = torch.cat([get_embedding(sent, tokenizer, model) for sent in pat_sentences], dim=0).numpy()
    pub_matrix = torch.cat([get_embedding(sent, tokenizer, model) for sent in pub_sentences], dim=0).numpy()

    # One call yields the full patent-by-publication similarity matrix; its
    # mean equals the average over all sentence pairs.
    return cosine_similarity(pat_matrix, pub_matrix).mean()


# Compute the similarity for every row of the DataFrame
def calculate_similarities(dataframe, tokenizer, model):
    """Score each row's patent/publication abstract pair.

    Reads the "pat_abs" and "pub_abs" columns of ``dataframe`` row by row and
    returns a list with one ``compute_paragraph_similarity`` result per row,
    in row order.
    """
    return [
        compute_paragraph_similarity(patent_text, publication_text, tokenizer, model)
        for patent_text, publication_text in zip(dataframe["pat_abs"], dataframe["pub_abs"])
    ]

# Main execution section
if __name__ == "__main__":
    # Load the BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = BertModel.from_pretrained("bert-base-cased")
    # Compute the similarity for every row; the result is written to the
    # "bert_cased_similar" column, replacing any values already stored there.
    pat_pub_similar_all["bert_cased_similar"] = calculate_similarities(pat_pub_similar_all, tokenizer, model)
