### __Section 4: Entity extraction and classification of user questions__

In [7]:
import os

try:
    import pyahocorasick
except ImportError:
    # The PyPI package "pyahocorasick" installs its module as `ahocorasick`;
    # alias it so the rest of the file can keep using the original name.
    import ahocorasick as pyahocorasick

In [None]:
class QuestionClassifier:
    """Rule-based classifier for medical questions.

    The classifier loads dictionaries of domain words (diseases, departments,
    checks, drugs, foods, producers, symptoms) plus a list of negation words,
    finds the domain words that occur in a question, and combines the entity
    types found with lists of question keywords to decide which kinds of
    query the question asks for.
    """

    # 1. Import feature words and complete the dictionary of negative words and question words
    def __init__(self, cur_dir="/Users/yumi/Documents/JupyterWork/Projects/Q&A System/"):
        """Load the word dictionaries and build the lookup structures.

        Args:
            cur_dir: Directory that contains the ``dict/`` folder with the
                ``*.txt`` word lists (one word per line, UTF-8). Defaults to
                the path that was previously hard-coded.

        Raises:
            OSError: If one of the dictionary files cannot be opened.
        """
        # Paths of the feature-word dictionaries
        self.disease_path = os.path.join(cur_dir, 'dict/disease.txt')
        self.department_path = os.path.join(cur_dir, 'dict/department.txt')
        self.check_path = os.path.join(cur_dir, 'dict/check.txt')
        self.drug_path = os.path.join(cur_dir, 'dict/drug.txt')
        self.food_path = os.path.join(cur_dir, 'dict/food.txt')
        self.producer_path = os.path.join(cur_dir, 'dict/producer.txt')
        self.symptom_path = os.path.join(cur_dir, 'dict/symptom.txt')
        self.deny_path = os.path.join(cur_dir, 'dict/deny.txt')

        # Load feature words
        self.disease_wds = self._load_words(self.disease_path)
        self.department_wds = self._load_words(self.department_path)
        self.check_wds = self._load_words(self.check_path)
        self.drug_wds = self._load_words(self.drug_path)
        self.food_wds = self._load_words(self.food_path)
        self.producer_wds = self._load_words(self.producer_path)
        self.symptom_wds = self._load_words(self.symptom_path)
        self.region_words = set(
            self.department_wds + self.disease_wds + self.check_wds
            + self.drug_wds + self.food_wds + self.producer_wds
            + self.symptom_wds
        )
        self.deny_words = self._load_words(self.deny_path)

        # Build domain actree
        self.region_tree = self.build_actree(list(self.region_words))
        # Build the word -> entity types dictionary
        self.wdtype_dict = self.build_wdtype_dict()

        # Question words
        self._init_question_words()

    @staticmethod
    def _load_words(path):
        """Return the non-blank, stripped lines of the UTF-8 file at *path*."""
        # The context manager closes the file even if reading fails.
        with open(path, encoding='utf-8') as handle:
            return [line.strip() for line in handle if line.strip()]

    def _init_question_words(self):
        """Set the keyword lists that signal each kind of question."""
        self.symptom_qwds = ['symptom', 'sign', 'phenomenon', 'indication', 'manifestation']
        self.cause_qwds = ['cause','reason', 'why', 'how come', 'how to', 'how can', 'how would', 'how could', 'why', 'for what reason', 'how to be', 'how come to be', 'leads to', 'results in']
        self.acompany_qwds = ['complication', 'concurrent', 'occur together', 'occur concurrently', 'appear together', 'appear concurrently', 'occur simultaneously', 'appear simultaneously', 'occur alongside', 'alongside', 'co-occur']
        self.food_qwds = ['diet', 'drink', 'eat', 'food', 'meals', 'nutrition', 'drink', 'dish' ,'diet restriction', 'supplement', 'health supplement', 'recipe', 'cookbook', 'consume', 'foodstuff','supplement']
        self.drug_qwds = ['medicine', 'medication', 'take medicine', 'capsule', 'oral solution', 'inflammation tablet']
        self.prevent_qwds = ['prevent', 'prevent from', 'resist', 'ward off', 'avoid','dodge','evade','steer clear of','lest','escape','circumvent','avoid','dodge','skirt around',
                             'how not to', 'how can not', 'how to not','how not',
                             'how to avoid', 'how can avoid', 'how avoid', 'how to possibly avoid',
                             'how to possibly not', 'how can I possibly not', 'how possibly not']
        self.lasttime_qwds = ['period', 'how long', 'how much time', 'how many time', 'how many days', 'how many years', 'how many hours','cure period','treatment period']
        self.cureway_qwds = ['how to treat', 'how to cure',  'how to heal', 'how to medicate', 'treatment method', 'therapy', 'what to do']
        self.cureprob_qwds = ['how likely to cure', 'how likely to heal', 'hope of cure big', 'chance', 'percentage', 'proportion', 'possibility', 'can cure', 'can treat', 'can heal']
        self.easyget_qwds = ['susceptible population', 'easy to infect', 'prone population', 'what people', 'which people', 'infect', 'contract', 'get','more likely','easy']
        self.check_qwds = ['check', 'inspection item', 'detect', 'test out', 'try out']
        self.belong_qwds = ['belongs to what department', 'belongs to', 'what department', 'department']
        self.cure_qwds = ['treat what', 'cure what', 'heal what', 'how to cure', 'main cure what', 'what use', 'what effect', 'use', 'purpose',
                          'what good', 'what benefit', 'use for', 'use to do what', 'use for what', 'need', 'want']

    # 2. Build the main classification function
    def classify(self, question):
        """Main classification function.

        Args:
            question: The question text.

        Returns:
            ``{}`` when no domain word occurs in the question. Otherwise a
            dict with ``'args'`` (matched word -> list of entity types) and
            ``'question_types'`` (list of question-type labels, in rule
            order).
        """
        medical_dict = self.check_medical(question)
        if not medical_dict:
            return {}
        data = {'args': medical_dict}

        # Collect the types of entities involved in the question
        types = {label for labels in medical_dict.values() for label in labels}

        # Each rule: (question keywords, required entity type, label,
        # label used instead when a negation word is present or None).
        # The order of the rules fixes the order of the returned labels.
        rules = [
            # Symptoms
            (self.symptom_qwds, 'disease', 'disease_symptom', None),
            (self.symptom_qwds, 'symptom', 'symptom_disease', None),
            # Causes
            (self.cause_qwds, 'disease', 'disease_cause', None),
            # Complications
            (self.acompany_qwds, 'disease', 'disease_acompany', None),
            # Recommended food
            (self.food_qwds, 'disease', 'disease_do_food', 'disease_not_food'),
            # Known food looking for diseases
            (self.food_qwds + self.cure_qwds, 'food', 'food_do_disease', 'food_not_disease'),
            # Recommended drugs
            (self.drug_qwds, 'disease', 'disease_drug', None),
            # What disease does the drug treat
            (self.cure_qwds, 'drug', 'drug_disease', None),
            # Disease and its diagnostic tests
            (self.check_qwds, 'disease', 'disease_check', None),
            # Known test looking for related diseases
            (self.check_qwds + self.cure_qwds, 'check', 'check_disease', None),
            # Disease prevention
            (self.prevent_qwds, 'disease', 'disease_prevent', None),
            # Disease treatment duration
            (self.lasttime_qwds, 'disease', 'disease_lasttime', None),
            # Disease treatment method
            (self.cureway_qwds, 'disease', 'disease_cureway', None),
            # Disease cure probability
            (self.cureprob_qwds, 'disease', 'disease_cureprob', None),
            # Disease susceptible population
            (self.easyget_qwds, 'disease', 'disease_easyget', None),
        ]

        question_types = []
        for keywords, entity_type, label, denied_label in rules:
            if entity_type not in types or not self.check_words(keywords, question):
                continue
            if denied_label is not None and self.check_words(self.deny_words, question):
                label = denied_label
            question_types.append(label)

        # If no rule matched, fall back to the description of the disease,
        # or else to the diseases related to the symptom.
        if not question_types and 'disease' in types:
            question_types = ['disease_desc']
        if not question_types and 'symptom' in types:
            question_types = ['symptom_disease']

        # Combine multiple classification results into a dictionary
        data['question_types'] = question_types
        return data

    # 3. Build the dictionary mapping each word to its entity types
    def build_wdtype_dict(self):
        """Construct the mapping from each domain word to its entity types.

        Returns:
            A dict with one key per word in ``self.region_words``; each value
            lists the type labels of the word lists that contain the word.
        """
        # Sets make each membership test O(1); the label order below is the
        # order in which labels appear in every value list.
        typed_words = [
            ('disease', set(self.disease_wds)),
            ('department', set(self.department_wds)),
            ('check', set(self.check_wds)),
            ('drug', set(self.drug_wds)),
            ('food', set(self.food_wds)),
            ('symptom', set(self.symptom_wds)),
            ('producer', set(self.producer_wds)),
        ]
        wd_dict = {}
        for wd in self.region_words:
            wd_dict[wd] = [label for label, words in typed_words if wd in words]
        return wd_dict

    # 4. Filtering sentences and classification
    def build_actree(self, wordlist):
        """Build actree to speed up filtering.

        Args:
            wordlist: The words to add to the automaton.

        Returns:
            The automaton; each word is stored with the value
            ``(index, word)``.
        """
        actree = pyahocorasick.Automaton()
        for index, word in enumerate(wordlist):
            actree.add_word(word, (index, word))
        actree.make_automaton()
        return actree

    def check_medical(self, question):
        """Sentence filtering: find the domain words in *question*.

        A matched word that is a proper substring of another matched word is
        dropped, so only the longer match is kept.

        Returns:
            A dict mapping each kept word to its entry in
            ``self.wdtype_dict``.
        """
        # Each hit from the tree is (position, (index, word)).
        matched = [hit[1][1] for hit in self.region_tree.iter(question)]
        shadowed = {
            short
            for short in matched
            for other in matched
            if short != other and short in other
        }
        return {wd: self.wdtype_dict.get(wd) for wd in matched if wd not in shadowed}

    def check_words(self, wds, sent):
        """Classification based on feature words.

        Returns:
            True if any word of *wds* is a substring of *sent*, else False.
        """
        return any(wd in sent for wd in wds)
                          
                          
# 5. Run the function                         
if __name__ == '__main__':
    # Interactive loop: read a question, classify it, and print the result.
    classifier = QuestionClassifier()
    while True:
        user_question = input('input a question:')
        result = classifier.classify(user_question)
        print(result)