## 压缩倒排索引表

In [None]:
# -*- coding: utf-8 -*-
# Author: Zhenyu Bo
# Date: 2024-11-07
"""
将词典视作单一字符串，从倒排索引表中提取词典和倒排表项，节省词项存储空间。
"""

import csv
import json
import os

def process_inverted_index(input_csv, term_string_file, term_table_file, posting_list_file):
    """
    Build the "dictionary as a string" files from an inverted-index CSV.

    The input CSV must have a ``word`` column and an ``id_list`` column holding
    a JSON-encoded list of document IDs. Terms are sorted with ``list.sort()``
    and three files are written:

    - term string file: every term concatenated with no separator.
    - posting list file: one line per term (in sorted term order) containing
      the document IDs separated by single spaces.
    - term table file: CSV with the columns ``doc_freq``, ``posting_ptr``,
      ``term_ptr`` and ``term_length``; the term text itself is not stored.

    ``term_ptr``/``term_length`` are measured in characters of the term string,
    while ``posting_ptr`` is a byte offset into the posting list file.

    Parameters:
    - input_csv: path of the inverted-index CSV file.
    - term_string_file: path where the term string is written.
    - term_table_file: path where the term table is written.
    - posting_list_file: path where the posting lists are written.
    """
    # Read the inverted index. newline='' lets the csv module do its own
    # newline handling, as recommended by the csv documentation.
    terms = []
    posting_lists = {}
    with open(input_csv, 'r', encoding='utf-8', newline='') as f_csv:
        for row in csv.DictReader(f_csv):
            word = row['word']
            terms.append(word)
            posting_lists[word] = json.loads(row['id_list'])

    # Sorted order is what makes binary search over the term table possible.
    terms.sort()

    # Record the starting character position of each term in the term string.
    term_pointers = {}
    current_position = 0
    for term in terms:
        term_pointers[term] = current_position
        current_position += len(term)

    # Join once instead of growing a string with += inside the loop.
    with open(term_string_file, 'w', encoding='utf-8') as f_term_string:
        f_term_string.write(''.join(terms))

    # Write the posting lists and remember the byte offset of each line.
    posting_pointers = {}
    current_offset = 0
    with open(posting_list_file, 'w', encoding='utf-8', newline='\n') as f_postings:
        for term in terms:
            posting_line = ' '.join(map(str, posting_lists[term])) + '\n'
            f_postings.write(posting_line)
            posting_pointers[term] = current_offset
            # Offsets are in bytes because the file is later read in binary mode.
            current_offset += len(posting_line.encode('utf-8'))

    # Write the term table (the 'word' itself is deliberately left out).
    with open(term_table_file, 'w', encoding='utf-8', newline='') as f_term_table:
        writer = csv.writer(f_term_table)
        writer.writerow(['doc_freq', 'posting_ptr', 'term_ptr', 'term_length'])
        for term in terms:
            writer.writerow([
                len(posting_lists[term]),
                posting_pointers[term],
                term_pointers[term],
                len(term),
            ])

def load_term_table(term_table_file_path):
    """
    Read the term table CSV into a list of dictionaries.

    Each row becomes a dict with the integer fields ``doc_freq``,
    ``posting_ptr``, ``term_ptr`` and ``term_length``, in file order.

    Parameters:
    - term_table_file_path: path of the term table CSV file.
    Returns:
    - list of row dicts. If the file cannot be opened/read, or a value cannot
      be converted to int, a message is printed and the rows collected so far
      (possibly none) are returned.
    """
    int_fields = ('doc_freq', 'posting_ptr', 'term_ptr', 'term_length')
    entries = []
    try:
        with open(term_table_file_path, "r", encoding="UTF-8") as handle:
            for record in csv.DictReader(handle):
                # Appending row by row keeps earlier rows if a later one fails.
                entries.append({field: int(record[field]) for field in int_fields})
    except IOError as e:
        print(f"无法打开或读取词项表文件: {e}")
    except ValueError as e:
        print(f"解析词项表数据失败: {e}")
    return entries

def load_term_string(term_string_file_path):
    """
    Read the whole term string file as UTF-8 text.

    Parameters:
    - term_string_file_path: path of the term string file.
    Returns:
    - the file content as a string, or "" (after printing a message) when the
      file cannot be opened or read.
    """
    content = ""
    try:
        with open(term_string_file_path, "r", encoding="UTF-8") as source:
            content = source.read()
    except IOError as e:
        print(f"无法打开或读取词项字符串文件: {e}")
    return content

def binary_search_term(term_string, term_table, word):
    """
    Binary-search the term table for a term stored inside the term string.

    Each probed table entry is turned back into its term by slicing
    ``term_string`` with the entry's ``term_ptr`` and ``term_length``.

    Parameters:
    - term_string: the concatenated term string.
    - term_table: list of term table entries (dicts with ``term_ptr`` and
      ``term_length``).
    - word: the term to look for.
    Returns:
    - the index of the matching entry in ``term_table``, or -1 if not found.
    """
    lo, hi = 0, len(term_table) - 1
    while lo <= hi:
        middle = (lo + hi) // 2
        entry = term_table[middle]
        start = entry['term_ptr']
        candidate = term_string[start:start + entry['term_length']]
        if candidate < word:
            lo = middle + 1
        elif candidate > word:
            hi = middle - 1
        else:
            return middle
    return -1

def query_posting_list(word, term_string, term_table, posting_list_file_path):
    """
    Look up the posting list (document IDs) of a single term.

    The term is located with ``binary_search_term``; its ``posting_ptr`` is
    then used as a byte offset into the posting list file, and the line
    starting at that offset is parsed as whitespace-separated integers.

    Parameters:
    - word: the term to look up.
    - term_string: the concatenated term string.
    - term_table: list of term table entries.
    - posting_list_file_path: path of the posting list file.
    Returns:
    - list of document IDs; an empty list (after printing a message) when the
      term is not in the table or the file cannot be read or parsed.
    """
    index = binary_search_term(term_string, term_table, word)
    if index == -1:
        print(f"词项 '{word}' 不存在于词项表中。")
        return []

    posting_ptr = term_table[index]['posting_ptr']

    try:
        # Binary mode: posting_ptr is a byte offset, not a character offset.
        with open(posting_list_file_path, "rb") as f_postings:
            f_postings.seek(posting_ptr)
            # One buffered readline() replaces the former byte-by-byte loop;
            # it stops after the next b'\n' or at end of file.
            posting_bytes = f_postings.readline()
        # split() with no argument also discards the trailing newline.
        return [int(doc_id) for doc_id in posting_bytes.decode('utf-8').split()]
    except IOError as e:
        print(f"无法打开或读取倒排表文件: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: any decode/parse problem is reported
        # and turned into an empty result, as callers expect.
        print(f"解析倒排表时发生错误: {e}")
        return []


# Build the compressed dictionary files for the book data.
(
    book_input_csv,
    book_term_string_file,
    book_term_table_file,
    book_posting_list_file,
) = (
    '../data/book_inverted_index_table.csv',
    '../data/book_term_string.txt',
    '../data/book_term_table.csv',
    '../data/book_posting_list.txt',  # plain-text file
)

process_inverted_index(
    input_csv=book_input_csv,
    term_string_file=book_term_string_file,
    term_table_file=book_term_table_file,
    posting_list_file=book_posting_list_file,
)
print('书籍词典和倒排表已成功生成。')

# Build the compressed dictionary files for the movie data.
(
    movie_input_csv,
    movie_term_string_file,
    movie_term_table_file,
    movie_posting_list_file,
) = (
    '../data/movie_inverted_index_table.csv',
    '../data/movie_term_string.txt',
    '../data/movie_term_table.csv',
    '../data/movie_posting_list.txt',  # plain-text file
)

process_inverted_index(
    input_csv=movie_input_csv,
    term_string_file=movie_term_string_file,
    term_table_file=movie_term_table_file,
    posting_list_file=movie_posting_list_file,
)
print('电影词典和倒排表已成功生成。')


## 在压缩后的倒排表上查询一个词项的倒排列表

In [None]:
# Load the book term table and term string, then answer term queries
# interactively until the user types 'exit'.
book_term_table = load_term_table(book_term_table_file)
book_term_string = load_term_string(book_term_string_file)
book_posting_list_file_path = "../data/book_posting_list.txt"

while True:
    word = input("请输入要查询的词项（输入 'exit' 退出）：").strip()
    if word.lower() != 'exit':
        # Fetch the posting list of the requested term.
        book_doc_ids = query_posting_list(
            word,
            book_term_string,
            book_term_table,
            book_posting_list_file_path,
        )
        if not book_doc_ids:
            print(f"词项 '{word}' 没有对应的文档或查询失败。")
        else:
            print(f"词项 '{word}' 的文档ID列表：{book_doc_ids}")
        continue

    print("退出查询。")
    break

## 在压缩后的倒排表上进行布尔查询

In [None]:
# -*- coding: utf-8 -*-
# Author: Zhenyu Bo
# Date: 2024-11-14

"""
使用dictionary_as_a_string.py压缩的倒排索引表实现布尔查询。
"""

import pandas as pd
import ast
import re
from dictionary_as_a_string import load_term_string, load_term_table, query_posting_list

def read_all_ids(file_path):
    """
    Read a file containing one integer ID per line and return them as a set.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``ValueError``.

    Parameters:
    - file_path: path of the UTF-8 text file with one ID per line.
    Returns:
    - set of the integer IDs found in the file.
    Raises:
    - ValueError: if a non-blank line is not a valid integer.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # int() tolerates surrounding whitespace, so only blank lines need filtering.
        return {int(line) for line in f if line.strip()}

def tokenize(expression):
    """
    Split a boolean query expression into tokens.

    Tokens are the operators ``AND``, ``OR`` and ``NOT``, parentheses, and
    terms (runs of word characters). An operator name is only recognised as an
    operator when it is not immediately followed by an ASCII letter, digit or
    underscore, so terms such as ``ORANGE`` or ``NOTHING`` stay whole, while
    an operator written directly in front of a non-ASCII term (``NOT爱情``) is
    still split off.

    Parameters:
    - expression: the query string typed by the user.
    Returns:
    - list of token strings in the order they appear.
    """
    # The negative lookahead keeps AND/OR/NOT from being carved out of the
    # start of a longer ASCII word.
    return re.findall(r'(?:AND|OR|NOT)(?![A-Za-z0-9_])|\w+|\(|\)', expression)

def infix_to_postfix(tokens):
    """
    Convert an infix boolean expression to postfix (reverse Polish) order.

    Operator precedence is NOT > AND > OR. ``AND`` and ``OR`` are
    left-associative; ``NOT`` is a unary prefix operator and is treated as
    right-associative, so stacked negations such as ``NOT NOT a`` become
    ``a NOT NOT`` rather than emitting a ``NOT`` before its operand.

    Parentheses are assumed to be balanced: a ')' without a matching '('
    raises ``IndexError`` when the stack is popped.

    Parameters:
    - tokens: list of tokens (operators, parentheses and terms).
    Returns:
    - list of tokens in postfix order, without parentheses.
    """
    precedence = {'NOT': 3, 'AND': 2, 'OR': 1}
    right_associative = {'NOT'}
    output = []
    operator_stack = []
    for token in tokens:
        if token == '(':
            operator_stack.append(token)
        elif token == ')':
            while operator_stack and operator_stack[-1] != '(':
                output.append(operator_stack.pop())
            operator_stack.pop()  # discard the matching '('
        elif token in precedence:
            current = precedence[token]
            while operator_stack and operator_stack[-1] != '(':
                top = precedence[operator_stack[-1]]
                # Equal precedence only pops for left-associative operators;
                # popping for NOT would emit it before its operand.
                if top > current or (top == current and token not in right_associative):
                    output.append(operator_stack.pop())
                else:
                    break
            operator_stack.append(token)
        else:
            # Anything that is not an operator or parenthesis is a term.
            output.append(token)
    while operator_stack:
        output.append(operator_stack.pop())
    return output

def evaluate_postfix(postfix_tokens, term_string, term_table, posting_list_file_path, all_ids):
    """
    Evaluate a postfix boolean expression over posting lists.

    Terms are resolved to sets of document IDs via ``query_posting_list``;
    ``AND`` intersects, ``OR`` unions, and ``NOT`` takes the complement with
    respect to ``all_ids``.

    Parameters:
    - postfix_tokens: tokens in postfix order.
    - term_string: the concatenated term string.
    - term_table: list of term table entries.
    - posting_list_file_path: path of the posting list file.
    - all_ids: set of every known ID, used as the universe for ``NOT``.
    Returns:
    - set of matching IDs; an empty set when there is nothing to evaluate.
    """
    operands = []
    for token in postfix_tokens:
        if token == 'AND':
            rhs = operands.pop()
            lhs = operands.pop()
            operands.append(lhs & rhs)
        elif token == 'OR':
            rhs = operands.pop()
            lhs = operands.pop()
            operands.append(lhs | rhs)
        elif token == 'NOT':
            negated = operands.pop()
            operands.append(all_ids - negated)
        else:
            # A plain term: fetch its posting list and push it as a set.
            postings = query_posting_list(token, term_string, term_table, posting_list_file_path)
            operands.append(set(postings))

    if not operands:
        return set()
    return operands.pop()

def display_results(result_ids, words_df):
    """
    Print the ID and tags of every row whose ``id`` is in ``result_ids``.

    The ``words`` column is expected to hold the string form of a Python list
    of tags; it is parsed with ``ast.literal_eval`` before printing.

    Parameters:
    - result_ids: collection of IDs produced by a query.
    - words_df: DataFrame with ``id`` and ``words`` columns.
    """
    matched = words_df[words_df['id'].isin(result_ids)]
    if matched.empty:
        print("没有符合条件的结果。")
        return

    for item_id, raw_words in zip(matched['id'], matched['words']):
        tags = ast.literal_eval(raw_words)
        print(f"ID: {item_id}")
        print(f"标签: {', '.join(tags)}\n")

def main():
    """
    Run the interactive boolean-query loop over the book and movie indexes.

    Both data sets are loaded up front. Each round asks which data set to
    query, reads a boolean expression, evaluates it and prints the matching
    IDs with their tags, then asks whether to continue.

    A malformed expression (for example an unmatched ')' or an operator with
    a missing operand) is reported to the user instead of terminating the
    program with an ``IndexError``.
    """
    print("正在加载数据，请稍候...")

    # Keyed by the menu choice; entries are evaluated in source order, so the
    # book data is loaded before the movie data.
    datasets = {
        '1': {
            'term_string': load_term_string('../data/book_term_string.txt'),
            'term_table': load_term_table('../data/book_term_table.csv'),
            'all_ids': read_all_ids('../data/Book_id.txt'),
            'words_df': pd.read_csv('../data/book_words.csv', dtype={'id': int, 'words': str}),
            'posting_list_file_path': '../data/book_posting_list.txt',
        },
        '2': {
            'term_string': load_term_string('../data/movie_term_string.txt'),
            'term_table': load_term_table('../data/movie_term_table.csv'),
            'all_ids': read_all_ids('../data/Movie_id.txt'),
            'words_df': pd.read_csv('../data/movie_words.csv', dtype={'id': int, 'words': str}),
            'posting_list_file_path': '../data/movie_posting_list.txt',
        },
    }

    print("数据加载完成！")

    while True:
        # Surrounding whitespace in the menu choice is ignored.
        choice = input("请选择查询类型（1 - 书籍，2 - 电影）：\n").strip()
        dataset = datasets.get(choice)
        if dataset is None:
            print("输入错误，请输入 1 或 2。")
            continue  # ask again

        expression = input("请输入布尔查询表达式：\n")
        try:
            postfix_tokens = infix_to_postfix(tokenize(expression))
            result_ids = evaluate_postfix(
                postfix_tokens,
                dataset['term_string'],
                dataset['term_table'],
                dataset['posting_list_file_path'],
                dataset['all_ids'],
            )
        except IndexError:
            # Popping an empty stack during parsing/evaluation means the
            # expression was malformed.
            print("布尔表达式格式有误，请检查运算符和括号。")
        else:
            if result_ids:
                print("查询结果：\n")
                display_results(result_ids, dataset['words_df'])
            else:
                print("没有符合条件的结果。")

        cont = input("是否继续查询？(Y/N): ")
        if cont.strip().lower() != 'y':
            print("感谢您的使用！")
            break  # leave the loop and end the program

if __name__ == "__main__":
    main()
