In [1]:
from konlpy.tag import Mecab
stemmer = Mecab()

In [7]:
import re



def contains_korean(text):
    # 정규 표현식을 사용하여 문자열에 한국어가 포함되어 있는지 확인합니다.
    return bool(re.search(r'[가-힣]', text))

def filter_korean_strings(string_list):
    # 한국어가 포함된 문자열만 필터링합니다.
    return [string for string in string_list if contains_korean(string)]

def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stopwords = file.read().splitlines()
    return stopwords

def remove_strings_with_stopwords(string_list):
    filtered_list = []
    for string in string_list:
        if not any(stopword in string for stopword in stopwords):
            filtered_list.append(string)
    return filtered_list

In [16]:
from rank_bm25 import BM25Okapi
import json
from tqdm import tqdm 

# #load stopwords
# stopwords = load_stopwords('stopwords.txt')

file_path='data/law_data/new_법령이름.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    
corpus=[]
for i in data:
    corpus.append(i['법내용'])
tokenized_corpus = [remove_strings_with_stopwords(filter_korean_strings(stemmer.morphs(doc))) for doc in corpus]
# tokenized_corpus = [doc.split(' ') for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

NameError: name 'remove_strings_with_stopwords' is not defined

In [17]:
file_path='data/law_data/final_질문&법령.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data1 = json.load(file)

In [18]:
law_dic={}
for i in data:
    law_dic[i['법이름']]=i['법내용']

In [11]:
answer_dic={}
for i in data:
    answer_dic[i['법내용']]=i['법이름']

In [104]:
#predict f
def predict(question,k):
    answer_list=[]
    # tokenized_query=question.split(' ')
    tokenized_query=stemmer.morphs(question)
    tokenized_query=filter_korean_strings(tokenized_query)
    tokenized_query=remove_strings_with_stopwords(tokenized_query)
    predict_list=bm25.get_top_n(tokenized_query, corpus, n=k)
    for pred in predict_list:
        answer_list.append(answer_dic[pred])
    return answer_list

In [13]:
#recall k 
def recall(answers, preds):
    t=0
    for ans in answers:
        if ans in preds:
            t+=1
    
    return t/len(answers)

In [14]:
#test 
def test(k):
    total=0
    num=0
    for d in tqdm(data1):
        if (len(d['법령']>6)):
            continue
        num+=1
        answers=d['법령']
        question=d['질문']
        preds=predict(question,k)
        total+=recall(answers,preds)
    return total/num

In [108]:
bm25_result=[]
for d in tqdm(data1):
    question=d['질문']
    preds=predict(question,100)
    bm25_result.append(preds)

100%|██████████| 5802/5802 [00:53<00:00, 108.06it/s]


In [16]:
import pickle
import faiss

# pkl 파일에서 임베딩 로드
with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

# Faiss 인덱스 생성
dimension = embeddings.shape[1]# 임베딩의 차원
index = faiss.IndexFlatL2(dimension)  # L2 거리 사용

# 임베딩을 Faiss 인덱스에 추가
index.add(embeddings)

#predict 
def predict_e(question,k,index):
    distances, indices = index.search(question, k)
    return indices[0]

def get_preds_e(predict_number, data):
    preds=[]
    for i in predict_number:
        preds.append(data[i]['법이름'])
    return preds

def test_e(index,k,data,data1):
    with open('q_embeddings.pkl', 'rb') as f:
        q_list = pickle.load(f)
    total=0
    for q_i, q in enumerate(tqdm(q_list)):
        predict_number=predict_e(q.reshape(1,-1),k,index)
        preds=get_preds_e(predict_number, data)
        total+=recall(data1[q_i]['법령'],preds)
    return total/len(q_list)

In [17]:
ann_result=[]
with open('q_embeddings.pkl', 'rb') as f:
    q_list = pickle.load(f)
for q_i, q in enumerate(tqdm(q_list)):
    predict_number=predict_e(q.reshape(1,-1),100,index)
    preds=get_preds_e(predict_number, data)
    ann_result.append(preds)

100%|██████████| 5802/5802 [00:03<00:00, 1703.39it/s]


In [113]:
mixed_result=[]
for i in range(len(ann_result)):
    mixed_element= bm25_result[i][:50]
    for j in range(0,100):
        if ann_result[i][j] not in mixed_element:
            mixed_element.append(ann_result[i][j])
        if (len(mixed_element)==100):
            break
    mixed_result.append(mixed_element)

In [97]:
mixed_result=[]
for i in range(len(ann_result)):
    mixed_element=[]
    for j in range(100):
        if bm25_result[i][j] not in mixed_element:
            mixed_element.append(bm25_result[i][j])
        if (len(mixed_element)==100):
            break
        if ann_result[i][j] not in mixed_element:
            mixed_element.append(ann_result[i][j])
        if (len(mixed_element)==100):
            break        
    mixed_result.append(mixed_element)

In [114]:
#split(' ')
list1=[1, 10, 20, 50,100]
for i in list1:
    total=0
    for j in dev_index: 
        total+=recall(data1[j]['법령'],mixed_result[j][:i])
    print(total/len(dev_index))

0.19239816408491103
0.38793951315465947
0.4628964838947627
0.5391156462585033
0.6656954348004261


In [20]:
from model.CrossEncoder import CrossEncoder
cross_encoder = CrossEncoder('checkpoints/june/roberta-base')

In [21]:
result =cross_encoder.rank(data1[0]['질문'],data1[0]['ann'])

In [22]:
result

[{'corpus_id': 10, 'score': 0.9555122},
 {'corpus_id': 21, 'score': 0.9540267},
 {'corpus_id': 15, 'score': 0.95269436},
 {'corpus_id': 12, 'score': 0.95072687},
 {'corpus_id': 20, 'score': 0.94958},
 {'corpus_id': 19, 'score': 0.9467833},
 {'corpus_id': 6, 'score': 0.94309556},
 {'corpus_id': 7, 'score': 0.93889785},
 {'corpus_id': 16, 'score': 0.9381886},
 {'corpus_id': 4, 'score': 0.9355388},
 {'corpus_id': 23, 'score': 0.9339826},
 {'corpus_id': 31, 'score': 0.9336204},
 {'corpus_id': 13, 'score': 0.932263},
 {'corpus_id': 8, 'score': 0.9317442},
 {'corpus_id': 0, 'score': 0.9313066},
 {'corpus_id': 3, 'score': 0.93096787},
 {'corpus_id': 11, 'score': 0.929164},
 {'corpus_id': 9, 'score': 0.929021},
 {'corpus_id': 5, 'score': 0.9287259},
 {'corpus_id': 2, 'score': 0.9284755},
 {'corpus_id': 1, 'score': 0.92781055},
 {'corpus_id': 26, 'score': 0.92704934},
 {'corpus_id': 70, 'score': 0.9251868},
 {'corpus_id': 41, 'score': 0.9235862},
 {'corpus_id': 17, 'score': 0.9213349},
 {'corpu

: 

In [8]:
import random
import json

file_path='data/law_data/final_질문&법령.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data1 = json.load(file)
    

def split_list(data, chunk_size=10):
    """리스트를 chunk_size 단위로 나누어 이중 리스트로 반환합니다."""
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

for d in data1:
    random_integers = random.sample(range(1464), 150)
    r_candidate=[]
    for r_i in random_integers:
        r_candidate.append(data[r_i]['법이름'])
    answer_laws=d['법령']
    for a in answer_laws:
        if a in r_candidate:
            r_candidate.remove(a)
    
    d['random']= r_candidate[:100]
    
    


In [14]:
with open('/home/tako/june/2024_demo_old/data/law_data/final_질문&법령.json', 'w', encoding='utf-8') as file:
    json.dump(data1, file, ensure_ascii=False, indent=4)