In [1]:
import os
import chardet
from src.utils.text_chunking import *

In [2]:
def detect_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and passed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the chardet result.
    """
    with open(file_path, "rb") as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
    
def read_txt(file_path):
    """Return the full text of a file, decoded with its detected encoding.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a single string.
    """
    detected = detect_encoding(file_path)
    with open(file_path, 'r', encoding=detected) as handle:
        return handle.read()

def read_file(folder_path):
    """Load every ``.txt`` file located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A dict mapping each file name (not the full path) to its text
        content, inserted in sorted file-name order.
    """
    corpus = {}
    # os.listdir returns names in arbitrary order; sorting makes the
    # resulting dict order reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A sub-directory whose name ends in ".txt" cannot be read as text.
        if not os.path.isfile(file_path):
            continue
        corpus[file_name] = read_txt(file_path)

    return corpus

In [None]:
# Read every .txt file under "processed_data" into a {file name: text} dict.
corpus = read_file(folder_path ="processed_data")
# Last expression of the cell: displays the whole corpus dict as cell output.
corpus

In [4]:
# Pull two individual documents out of the corpus by file name.
# A missing file raises KeyError here (no `in corpus` guard on these lookups).
word_definition = corpus["data.txt"]
FAQ = corpus['FAQ.txt']

In [5]:
# Script files to merge into one text; names missing from the corpus are skipped.
record = ["T00005025.script.txt", "T00005026+27.script.txt", "T00005026.script.txt", "T00005027.script.txt", "T00005028.script.txt"]
# Concatenate the available scripts, in list order, separated by a single space.
record_text = " ".join(corpus[name] for name in record if name in corpus)

In [6]:
# Create a Chunking instance for the data.txt text and split that text on the
# listed separators with chunk_size=50 and chunk_overlap=10.
# NOTE(review): the result is assigned back to `word_definition`, replacing the raw
# text loaded from the corpus, so re-running this cell alone feeds the previous
# output back in as input.
chunking = Chunking(text=word_definition)
word_definition = chunking.text_chunking(word_definition, separators=['名詞解釋', '。'], chunk_size=50, chunk_overlap=10)

In [7]:
# Chunk the FAQ text (chunk_size=100, chunk_overlap=20), reusing the existing
# `chunking` object rather than creating a new one for this text.
faq = chunking.text_chunking(FAQ, separators=["KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD"], chunk_size=100, chunk_overlap=20)

In [8]:
import re
from collections import defaultdict

In [None]:
# Policy documents to merge into one text before chunking.
# The original list named "normal3.txt" twice and never "normal2.txt"; this
# assumes the duplicate was a typo for "normal2.txt". The `in corpus` filter
# below keeps the cell working if that file is absent.
insurance = ["invest1.txt","invest2.txt","invest3.txt","normal1.txt","normal2.txt","normal3.txt"]
insurance_text = {key: corpus[key] for key in insurance if key in corpus}
insurance_text = " ".join(insurance_text.values())

# Rebinds `chunking` to an instance created for the merged insurance text.
chunking = Chunking(text=insurance_text)

# The backslash in the third separator is written as "\\": "\K" is not a valid
# escape sequence and newer Python versions warn about it. The runtime string
# (a literal backslash between the two words) is unchanged.
insurance_chunk = chunking.text_chunking(insurance_text,
                                         separators=["KEYWORD", "KEYWORD", "KEYWORD\\KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "？"],
                                         chunk_size=50,
                                         chunk_overlap=10)
# Display the chunking result as the cell output.
insurance_chunk


In [10]:
# Chunk the merged script text on full stops and commas (chunk_size=100, chunk_overlap=20).
# NOTE(review): this uses whichever `chunking` instance is currently bound, which was
# constructed with a different text. Whether text_chunking reads the constructor text
# is not visible here; confirm.
record_chunk = chunking.text_chunking(record_text, separators=["。", "，"], chunk_size=100, chunk_overlap=20)

In [None]:
from src.utils.embeddings import *

In [5]:
# Create the embedding helper used by the cells below.
# WordEmbedding arrives through a star import (presumably src.utils.embeddings; confirm).
we = WordEmbedding()

In [15]:
# Embed the chunked word-definition text.
word_definition_embeddings = we.embedding(text=word_definition)

In [6]:
import pickle

In [17]:
# Persist the word-definition embeddings to a pickle file in the working directory.
with open("word_definition_embeddings", "wb") as out_file:
    pickle.dump(word_definition_embeddings, out_file)

In [19]:
# Embed the chunked FAQ text.
faq_embeddings = we.embedding(text=faq)

In [20]:
# Persist the FAQ embeddings to a pickle file in the working directory.
with open("faq_embeddings", "wb") as out_file:
    pickle.dump(faq_embeddings, out_file)

In [49]:
# Embed the chunked insurance text.
insurance_chunk_embeddings = we.embedding(text=insurance_chunk)

In [50]:
# Persist the insurance-chunk embeddings to a pickle file in the working directory.
with open("insurance_chunk_embeddings", "wb") as out_file:
    pickle.dump(insurance_chunk_embeddings, out_file)

In [23]:
# Embed the chunked script text.
record_chunk_embeddings = we.embedding(text=record_chunk)

In [24]:
# Persist the script-chunk embeddings to a pickle file in the working directory.
with open("record_chunk_embeddings", "wb") as out_file:
    pickle.dump(record_chunk_embeddings, out_file)

將各條規則分開，分別做 embedding（每條規則各存成一個檔案）

In [12]:
# Group the insurance chunks by rule keyword: each keyword maps to the list of
# chunks that contain it as a substring. A chunk can appear under several keywords.
# The backslash in the third keyword is written as "\\": "\K" is not a valid escape
# sequence and newer Python versions warn about it. The runtime string is unchanged.
rule_keywords = ["KEYWORD", "KEYWORD", "KEYWORD\\KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD", "KEYWORD"]
Rule = {
    keyword: [chunk for chunk in insurance_chunk if keyword in chunk]
    for keyword in rule_keywords
}

In [13]:
# Confirmation phrases to strip from rule chunks before embedding.
# The original list contained '請問您是否同意' twice; the duplicate is dropped
# because it adds nothing to the alternation pattern built from this list.
remove_words = ['請問正確嗎', '請問您是否同意', '請問您清楚嗎', '請問是否正確']

def word_filter(data, remove_words):
    """Remove every occurrence of the given phrases from each sentence.

    Args:
        data: Dict mapping a key to a list of sentence strings.
        remove_words: Phrases to delete. They are matched literally
            (regex metacharacters are escaped).

    Returns:
        A new dict with the same keys whose sentence lists have the phrases
        removed. The input dict and its lists are left untouched, so the
        caller keeps access to the unfiltered data.
    """
    pattern = re.compile('|'.join(map(re.escape, remove_words)))
    # Build a fresh dict instead of assigning into `data`, so the argument is
    # not modified as a side effect.
    return {
        key: [pattern.sub('', sentence) for sentence in sentences]
        for key, sentences in data.items()
    }

In [None]:
# Strip the phrases in `remove_words` from every chunk of every rule.
filtered_Rule = word_filter(Rule, remove_words=remove_words)
# Display the filtered rules as the cell output.
filtered_Rule

In [79]:
# Output directory for the per-rule embedding files; created if it does not exist.
Rule_embedding_folder = "Rule_embedding"
os.makedirs(Rule_embedding_folder, exist_ok=True)

# File name stem for each rule: the rule key with newline characters removed.
# NOTE(review): only "\n" is stripped; other characters that are special in
# paths (for example a backslash inside a rule key) pass through unchanged.
name_mapping = {rule: rule.replace("\n","") for rule in Rule}

# Embed each rule's chunks one at a time and write one pickle file per rule.
for rule_name, chunks in filtered_Rule.items():
    per_chunk_embeddings = [we.embedding(chunk) for chunk in chunks]
    target_path = os.path.join(Rule_embedding_folder, f"{name_mapping[rule_name]}.pkl")
    with open(target_path, 'wb') as out_file:
        pickle.dump(per_chunk_embeddings, out_file)