<a href="https://colab.research.google.com/github/artizans/pythoncore/blob/main/search_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def main(search_engine):
  """Load the fixed corpus into ``search_engine`` and answer queries from stdin.

  The five files ``1.txt`` .. ``5.txt`` under ``/content/search_engine/source/``
  are passed to ``search_engine.add_corpus``. Afterwards each line read with
  ``input()`` is used as a query: the number of results is printed, followed by
  one result per line.

  Args:
    search_engine: object providing ``add_corpus(file_path)`` and
      ``search(query)``; ``search`` must return a sized iterable.

  Returns:
    None. The loop ends when the input stream is exhausted.
  """
  for file_name in ['1.txt','2.txt','3.txt','4.txt','5.txt']:
    file_path = '/content/search_engine/source/' + file_name
    search_engine.add_corpus(file_path)

  while True:
    try:
      query = input()
    except EOFError:
      # No more input (piped stdin ended, or Ctrl-D): leave the loop
      # instead of terminating with a traceback.
      break
    results = search_engine.search(query)
    print('found {} result(s):'.format(len(results)))
    for result in results:
      print(result)
  

In [None]:
class SearchEngineBase(object):
  """Base class for the search engines in this file.

  ``add_corpus`` reads a file and hands its contents to ``process_corpus``.
  Subclasses are expected to override ``process_corpus`` and ``search``; the
  versions here only raise ``NotImplementedError``.
  """

  def __init__(self) -> None:
    print('SearchEngineBase is init...')

  def add_corpus(self,file_path):
    """Read ``file_path`` and index its text under that path.

    A missing file is reported by printing the error and is otherwise ignored,
    so one absent corpus file does not stop the remaining ones from loading.

    Args:
      file_path: path of the text file to read; also used as the corpus id.
    """
    # Only the file read is guarded, so a FileNotFoundError raised inside a
    # subclass's process_corpus is not mistaken for a missing corpus file.
    try:
      with open(file_path,'r') as fin:
        text = fin.read()
    except FileNotFoundError as e:
      print(e)
      return
    self.process_corpus(file_path,text)

  def process_corpus(self,id,text):
    """Index ``text`` under ``id``. Must be overridden by subclasses.

    Raises:
      NotImplementedError: always, in this base implementation.
    """
    # NotImplementedError is a subclass of Exception, so callers that caught
    # the previous bare Exception continue to work.
    raise NotImplementedError('process_corpus is not implemented')

  def search(self,query):
    """Return the results matching ``query``. Must be overridden by subclasses.

    Raises:
      NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError('search is not implemented')
  
  

In [None]:
class SimpleEngine(SearchEngineBase):
  """Engine that stores each document's full text and matches by substring."""

  def __init__(self) -> None:
    super().__init__()
    # corpus id -> full text of that document
    self.__id_to_texts = {}

  def process_corpus(self, id, text):
    """Remember ``text`` under ``id``, replacing any earlier text for that id."""
    self.__id_to_texts[id] = text

  def search(self, query):
    """Return the ids of every stored text that contains ``query`` verbatim."""
    return [
      doc_id
      for doc_id, body in self.__id_to_texts.items()
      if query in body
    ]


齐夫定律：一个单词出现的频率与其在频率表里的排名成反比，呈幂律分布。

In [None]:
import re

class BOWEngine(SearchEngineBase):
  """Bag-of-words engine: each document is reduced to the set of its words.

  A document matches a query when every word of the query occurs in the
  document's word set; word order and repetition are ignored.
  """

  def __init__(self) -> None:
    super().__init__()
    # corpus id -> set of lower-cased words found in that document
    self.__id_to_words = {}

  def process_corpus(self, id, text):
    """Tokenize ``text`` and store the resulting word set under ``id``."""
    self.__id_to_words[id] = self.parse_text_to_words(text)

  def search(self, query):
    """Return the ids of all documents containing every word of ``query``."""
    wanted = self.parse_text_to_words(query)
    return [
      doc_id
      for doc_id, vocabulary in self.__id_to_words.items()
      if self.query_match(wanted, vocabulary)
    ]

  @staticmethod
  def query_match(query_words, words):
    """Return True when each item of ``query_words`` is contained in ``words``."""
    return all(term in words for term in query_words)

  @staticmethod
  def parse_text_to_words(text):
    """Split ``text`` into a set of lower-cased words.

    Every character that is not a word character (punctuation, newlines,
    other whitespace) is replaced by a space before the text is lower-cased
    and split on single spaces; empty pieces are discarded.
    """
    normalized = re.sub(r'[^\w]', ' ', text).lower()
    return {token for token in normalized.split(' ') if token}


In [None]:
# Choose the engine to run. BOWEngine matches documents that contain every
# word of the query; swap in SimpleEngine() for plain substring matching.
search_engine = BOWEngine()
main(search_engine)