In [35]:
import re
import numpy as np
import pandas as pd

# 
def preprocess_sentence_code(sentence: str, action_words):
    '''
    Split a sentence in which several actions share one subject.

    The subject ("entity") is the text that precedes the first action word.
    The sentence is cut at every " AND" that is immediately followed by an
    action word, and the subject is prepended to every piece after the first.
    All matching is case-insensitive.

    Example:
        sentence     = "RWY 12 NOT AVBL DUE WIP EXC SNOW AND U/S DUE TO RAIN AND U/S"
        action_words = " NOT AVBL| U/S"
        result       = ["RWY 12 NOT AVBL DUE WIP EXC SNOW",
                        "RWY 12 U/S DUE TO RAIN",
                        "RWY 12 U/S"]

    Args:
        sentence: The raw sentence to split.
        action_words: Regex alternation of action words. It is interpolated
            into the patterns verbatim (it is NOT escaped), so each
            alternative has to carry its own leading space as in the
            example above.

    Returns:
        A list of sentences. It is ``[sentence]`` when no action word is
        found or when there is no " AND <action>" to split on.
    '''
    # Zero-width test "an action word starts here", shared by both patterns.
    action_ahead = r"(?={0})".format(action_words)

    # Find the subject: everything before the first action word.
    subject_match = re.search(r"(?P<entity>.*?)" + action_ahead, sentence, flags=re.I)
    if subject_match is None:
        return [sentence]
    entity = subject_match.group("entity")

    # Cut the sentence at the match positions directly. Going through a "|"
    # placeholder would also split on any "|" already present in the text,
    # and the split must use the same re.I flag as the subject search.
    pieces = []
    cursor = 0
    for cut in re.finditer(r" AND" + action_ahead, sentence, flags=re.I):
        pieces.append(sentence[cursor:cut.start()])
        cursor = cut.end()
    pieces.append(sentence[cursor:])

    # Put the subject back in front of every piece except the first,
    # which still contains it.
    return pieces[:1] + [entity + piece for piece in pieces[1:]]

# sentence = "RWY 12 NOT AVBL DUE WIP EXC SNOW AND U/S DUE TO RAIN AND U/S" 
# action_words = " NOT AVBL| U/S"
# preprocess_sentence_code(sentence, action_words)

# def read_words(path=RULES_TABLE)
def read_words(path, sheet_name='words_list'):
    '''
    Load the word lists from the rules workbook and build regex alternations.

    The 'ACTION' and 'LIMIT' columns of the sheet are read, non-string cells
    are dropped, and duplicates are removed while keeping the order in which
    the words appear in the sheet. Keeping a fixed order matters because the
    words become regex alternatives, and alternation order decides which
    alternative wins; iterating a set() of strings would give an order that
    can change from one interpreter run to the next.

    Args:
        path: Path of the Excel workbook, passed to pandas.read_excel.
        sheet_name: Sheet that holds the 'ACTION' and 'LIMIT' columns.

    Returns:
        A tuple (action, reason, limit, source) of regex alternation strings.
        Every alternative starts with a space. 'action' and 'limit' come
        from the sheet; 'reason' and 'source' are fixed.
    '''
    df_words = pd.read_excel(path, sheet_name=sheet_name)

    def unique_words(column):
        # Keep string cells only, then de-duplicate in first-seen order
        # (dict keys preserve insertion order).
        cells = df_words[column].values
        return list(dict.fromkeys(word for word in cells if isinstance(word, str)))

    verb_ls = unique_words('ACTION')
    limit_words_ls = unique_words('LIMIT')

    # The leading " " plus the "| " separator give every word its own leading space.
    action = " " + "| ".join(verb_ls)
    reason = " DUE"
    limit = " " + "| ".join(limit_words_ls)
    source = " REFER| REF"
    return action, reason, limit, source

# get_item_pattern_list(path=RULES_TABLE, action_words, reason_words, limit_words, source_words)
def get_item_pattern_list(action_words, reason_words, limit_words, source_words, path, sheet_name='base_rules', ):
    '''
    Read the rule templates from the workbook, grouped by sentence part.

    The sheet must have a 'FORMAT' column (one of "entity", "action",
    "reason", "limit", "source") and a 'RULES' column holding the template.
    For every other distinct FORMAT value "error" is printed and the rows
    are ignored.

    Action, reason, limit and source templates are completed with
    str.format, passing the matching word alternation as the single
    positional argument. Entity templates are returned as read.

    Args:
        action_words: Alternation inserted into the action templates.
        reason_words: Alternation inserted into the reason templates.
        limit_words: Alternation inserted into the limit templates.
        source_words: Alternation inserted into the source templates.
        path: Path of the Excel workbook, passed to pandas.read_excel.
        sheet_name: Sheet that holds the rule templates.

    Returns:
        A tuple of five lists, in the order
        (entity, action, reason, limit, source).
    '''
    rules_frame = pd.read_excel(path, sheet_name=sheet_name)
    format_column = rules_frame['FORMAT']

    # One bucket per known sentence part; a part with no rows stays empty.
    templates_by_part = {part: [] for part in ('entity', 'action', 'reason', 'limit', 'source')}
    for format_name in list(set(format_column.values)):
        if format_name in templates_by_part:
            selected = rules_frame.loc[format_column == format_name, 'RULES']
            templates_by_part[format_name] = list(selected.values)
        else:
            print("error")

    def fill(templates, words):
        # Insert the word alternation into every template of one part.
        return [template.format(words) for template in templates]

    return (
        templates_by_part['entity'],
        fill(templates_by_part['action'], action_words),
        fill(templates_by_part['reason'], reason_words),
        fill(templates_by_part['limit'], limit_words),
        fill(templates_by_part['source'], source_words),
    )


# pattern combine
def pattern_combine(pattern_entity_ls, pattern_action_ls, pattern_reason_ls, pattern_limit_ls, pattern_source_ls):
    '''
    Concatenate the per-part patterns into full sentence patterns.

    Five sentence shapes are produced, longest first, and within a shape
    the entity varies slowest and the last part fastest:

        1. entity + action + reason + limit + source
        2. entity + action + reason + limit
        3. entity + action + reason
        4. entity + action + limit
        5. entity + action

    A shape contributes nothing when any of the lists it uses is empty.

    Returns:
        One flat list with all combined pattern strings.
    '''
    # Shape 1: entity + action + reason + limit + source
    combined = [
        ent + act + rsn + lim + src
        for ent in pattern_entity_ls
        for act in pattern_action_ls
        for rsn in pattern_reason_ls
        for lim in pattern_limit_ls
        for src in pattern_source_ls
    ]
    # Shape 2: entity + action + reason + limit
    combined += [
        ent + act + rsn + lim
        for ent in pattern_entity_ls
        for act in pattern_action_ls
        for rsn in pattern_reason_ls
        for lim in pattern_limit_ls
    ]
    # Shape 3: entity + action + reason
    combined += [
        ent + act + rsn
        for ent in pattern_entity_ls
        for act in pattern_action_ls
        for rsn in pattern_reason_ls
    ]
    # Shape 4: entity + action + limit
    combined += [
        ent + act + lim
        for ent in pattern_entity_ls
        for act in pattern_action_ls
        for lim in pattern_limit_ls
    ]
    # Shape 5: entity + action
    combined += [
        ent + act
        for ent in pattern_entity_ls
        for act in pattern_action_ls
    ]
    return combined



In [37]:
# sentence_parse
def sentence_parse(sentence_code: str, rules_table: str = 'E:/Workstation/data/NOTAM/NOTAM_table.xlsx'):
    '''
    Parse one sentence into its parts using the rule workbook.

    The combined patterns are tried in the order pattern_combine returns
    them and the first one that matches wins.

    Args:
        sentence_code: The sentence to parse.
        rules_table: Path of the Excel workbook with the 'words_list' and
            'base_rules' sheets. Defaults to the path this function has
            always used.

    Returns:
        A tuple (is_match, rows). 'rows' has one entry per matched sentence;
        each entry is a list of seven whitespace-stripped strings in the order
        entity, action, reason, limit, limit_wings, limit_weight, source.
        'limit_wings' and 'limit_weight' are never filled here and are
        always "". When nothing matches the result is (False, []).
    '''
    # NOTE: the workbook is read and the patterns are rebuilt on every call.
    action_words, reason_words, limit_words, source_words = read_words(path=rules_table, sheet_name='words_list')
    pattern_entity_ls, pattern_action_ls, pattern_reason_ls, pattern_limit_ls, pattern_source_ls = get_item_pattern_list(
        action_words, reason_words, limit_words, source_words, path=rules_table, sheet_name='base_rules')
    pattern_ls = pattern_combine(pattern_entity_ls, pattern_action_ls, pattern_reason_ls, pattern_limit_ls, pattern_source_ls)

    # Splitting with preprocess_sentence_code is currently disabled, so the
    # input is treated as a single sentence.
    sentence_ls = [sentence_code]

    field_names = ('entity', 'action', 'reason', 'limit', 'limit_wings', 'limit_weight', 'source')
    captured_names = ('entity', 'action', 'reason', 'limit', 'source')

    is_match = False
    res_list_ls = []
    for sentence in sentence_ls:
        for pattern in pattern_ls:
            match = re.search(pattern, sentence, flags=re.I)
            if match is None:
                continue
            is_match = True
            # default="" : a named group that exists in the pattern but did
            # not take part in the match would otherwise come back as None
            # and break the concatenation and the strip() below.
            match_dict = match.groupdict(default="")
            res_dict = dict.fromkeys(field_names, "")
            for name in captured_names:
                if name in match_dict:
                    res_dict[name] = match_dict[name]
            if 'entity_supply' in match_dict:
                # Extra subject text captured elsewhere in the sentence is
                # appended to the subject.
                res_dict['entity'] = res_dict['entity'] + ' ' + match_dict['entity_supply']
            res_list_ls.append([value.strip() for value in res_dict.values()])
            # Only the first matching pattern is used.
            break
    return (is_match, res_list_ls)

# test
# Sample sentences used as a quick manual check of sentence_parse.
txt = ["RWY 12/30 U/S FOR LDG",
       "RUNWAY IMMEDIATELY AVAILABLE IN CASE OF AN EMERGENCY",
       "RWY 13/31 USABLE FOR HELICOPTERS ONLY",
       "RWY NOT AVBL FOR OPS DUE REPAIR WIP",
       "RWY 32C AVBL ONLY FOR . ACFT DEP",
       "ACFT WITH MTOW 25 TONS OR ABOVE MUST PERFORM 180 DEG TURN ON TURN",
       "RWY 17 CLSD",
       "DUE TO...",
       ]

# Print each sample's index together with the (is_match, rows) tuple
# that sentence_parse returns for it.
for i, sentence in enumerate(txt):
    print(i, sentence_parse(sentence))


0 (True, [['RWY 12/30 FOR LDG', 'U/S', '', '', '', '', '']])
1 (True, [['RUNWAY IMMEDIATELY', 'AVAILABLE', '', 'IN CASE OF AN EMERGENCY', '', '', '']])
2 (True, [['RWY 13/31', 'USABLE', '', 'FOR HELICOPTERS ONLY', '', '', '']])
3 (True, [['RWY FOR OPS', 'NOT AVBL', 'DUE REPAIR WIP', '', '', '', '']])
4 (True, [['RWY 32C', 'AVBL', '', 'ONLY FOR . ACFT DEP', '', '', '']])
5 (True, [['ACFT WITH MTOW 25 TONS OR ABOVE FORM 180 DEG TURN ON TURN', 'MUST PER', '', '', '', '', '']])
6 (True, [['RWY 17', 'CLSD', '', '', '', '', '']])
7 (False, [])


In [32]:
# Item E, no clause splitting: test
# Run sentence_parse over the "E项" column of the test workbook.
test_file = "E:/Workstation/data/NOTAM/相对简单_10.23_new.xlsx"
df = pd.read_excel(test_file)["E项"]
txt = df.values
# Keep only the cells that hold a string.
txt = [s for s in txt if isinstance(s, str)]
for i, sentence in enumerate(txt):
    print(i, sentence_parse(sentence))

0 (True, [['RWY 30 FOR TKOF AND LDG UNDER FOLLOWING CONDITIONS:X-WIND COMPONENT ON MAIN RWYS', 'AVBL', '', 'EXCEEDS 15 KTS AND ATC IS VISUAL WITH BOTH RWY ENDS RWY 12/30.', '', '', '']])
1 (True, [['RWY 11/29', 'CLOSED ALL TRAINING AND VFR FLIGHTS', '', '', '', '', '']])
2 (True, [['RWY 10R/28L', 'STRENGTH DECREASED - SOAKED RWY.ARR/DEP WITH AD OPERATOR APPROVAL ONLY. TEL +420 602731152', '', '', '', '', '']])
3 (True, [['RWY 13/31', 'NARROWED TO 22,5M', '', 'WHEN WET. FOR LDG/TKOF USE SOUTHERNHALF OF RWY.', '', '', '']])
4 (True, [['RWY 13/31', 'USABLE', '', 'FOR HELICOPTERS ONLY', '', '', '']])
5 (True, [['REF AIP SUP 12/21 PARAS 2.3 - 2.4 AND NOTAM A1171/21,CLOSURE OF CENTRE RWY (07C/25C)', 'IS CNL', '', '', '', '', ''], ['REF AIP SUP 12/21 PARAS 2.3 - 2.4 AND NOTAM A1171/21,CLOSURE OF CENTRE RWY (07C/25C)', 'VHHH ON DUAL RWY OPS DRG 252316 -262315', 'DUE HKIA CARGO STAND RE-DESIGNATION.', '', '', '', '']])
6 (True, [['RWY 06R/24L', 'CLSD. AVBL PPR 30MIN.', '', '', '', '', '']])
7 (