In [1]:
%cd /media/auto203/SSD2/JHY/python/enhanced-subject-verb-object-extraction-master


from typing import List
from deepmultilingualpunctuation import PunctuationModel
from transformers import pipeline
import nltk
from Exiang import chinese_or_not
from Exiang import chinese_svo
import notam
import re
import warnings
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import pandas as pd
import sentence_tools


# Template NOTAM. notam_decode() splices an item-E text into the E) slot of
# this message so that the `notam` package can parse and decode it.
S = """(A1912/15 NOTAMN
Q) LOVV/QWPLW/IV/BO/W/000/130/4809N01610E001
A) LOVV B) 1509261100 C) 1509261230
E) PJE WILL TAKE PLACE AT AREA LAAB IN WALDE
F) GND G) FL130)"""


# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Punctuation-restoration model used by punctuation().
MODEL = PunctuationModel()
# Workbook holding the word lists and the base extraction rules.
RULES_TABLE = "NOTAM_table.xlsx"
# Word lists read from the 'words_list' sheet; action_words is later split on '|'.
action_words, reason_words, limit_words, source_words = sentence_tools.read_words(path=RULES_TABLE, sheet_name='words_list')
# Per-item pattern lists built from the word lists and the 'base_rules' sheet.
pattern_entity_ls, pattern_action_ls, pattern_reason_ls, pattern_limit_ls, pattern_source_ls = sentence_tools.get_item_pattern_list(action_words, reason_words, limit_words, source_words, path=RULES_TABLE, sheet_name='base_rules')
# Combined patterns that sentence_parse() tries in order against each sentence.
pattern_ls = sentence_tools.pattern_combine(pattern_entity_ls, pattern_action_ls, pattern_reason_ls, pattern_limit_ls, pattern_source_ls)


# 解析 TODO: 去除非E项内容
def notam_parse_single(e_option_noisy: str) -> str:
    """Clean one raw NOTAM item-E string before it is decoded.

    Steps, in order: drop the literal ``E)：`` marker, turn line breaks into
    sentence separators, remove parenthesised text, put a space after
    runway/taxiway designators, and collapse runs of whitespace.

    Args:
        e_option_noisy: Raw item-E text.

    Returns:
        The cleaned, single-line text.
    """
    # NOTE(review): only the marker written with a full-width colon is
    # removed; an ASCII "E):" prefix is left in place -- confirm this matches
    # the input data.
    e_option = e_option_noisy.replace('E)：', '')
    # Trailing newlines are dropped first so they do not become a dangling ". ".
    e_option = e_option.rstrip('\n').replace('\n', '. ')
    # Remove parenthesised content. The pattern is a raw string because "\("
    # is an invalid escape sequence in an ordinary string literal.
    e_option = re.sub(r'\(.*?\)', '', e_option)
    # Separate the designator from a directly attached number ("RWY16" -> "RWY 16").
    e_option = re.sub(r"(Runway|RUNWAY|RWY|TWY)([0-9/]*)", r"\1 \2", e_option)
    # Collapse any whitespace runs introduced by the steps above.
    return ' '.join(e_option.split())

# 解码
def notam_decode(e_option: str) -> str:
    """Decode the abbreviations of an item-E text into plain words.

    The text is spliced into the E) slot of the template NOTAM ``S``, parsed
    with ``notam.Notam.from_str`` and decoded; the decoded E) section is then
    extracted and post-processed (extra abbreviations, spacing after commas
    and colons, whitespace collapsing).

    Args:
        e_option: Cleaned item-E text.

    Returns:
        The decoded item-E text on a single line.
    """
    # Rebuild a complete NOTAM around the given item E using the template.
    template_parts = S.split("E)")
    s_new = template_parts[0] + "E) " + e_option + "\nF)" + template_parts[1].split("F)")[1]
    res = notam.Notam.from_str(s_new)
    # NOTE(review): the extraction below assumes the item-E text itself does
    # not contain "E)" or "F)" -- confirm against the data.
    e_option_text = res.decoded().split("E)")[1].split("F)")[0].rstrip("\n").lstrip(' ').replace('\n', ' ')
    # Abbreviations expanded here in addition to what the decoder handles.
    # Only standalone abbreviations are expanded: a plain str.replace would
    # also rewrite letters inside longer words (e.g. the "ILS" in "DETAILS").
    extra_abbreviations = (
        ("MTOW", "Maximum takeoff weight"),
        ("ILS", "Instrument landing system"),
        ("IFR", "Instrument flight rules"),
    )
    for abbreviation, expansion in extra_abbreviations:
        standalone = r"(?<![A-Za-z])" + re.escape(abbreviation) + r"(?![A-Za-z])"
        e_option_text = re.sub(standalone, expansion, e_option_text)
    # Make sure commas and colons are followed by a space.
    e_option_text = e_option_text.replace(',', ", ")
    e_option_text = e_option_text.replace(':', ": ")
    return ' '.join(e_option_text.split())

def bad_case_or_not(e_option_text: str) -> bool:
    """Tell whether an item-E text is a "bad case" to be parsed as a whole.

    A text is a bad case when it contains the upper-case word ``FOLLOWING``
    (the check is case-sensitive).

    Args:
        e_option_text: Item-E text to inspect.

    Returns:
        ``True`` if the text contains ``FOLLOWING``, otherwise ``False``.
    """
    return 'FOLLOWING' in e_option_text

def bad_case_svo(e_option_text: str) -> List[List[str]]:
    """Parse a bad-case item-E text as one unit.

    Args:
        e_option_text: Item-E text to hand to ``sentence_parse``.

    Returns:
        The rows produced by ``sentence_parse`` when it reports a match,
        otherwise a single row of seven empty strings.
    """
    matched, rows = sentence_parse(e_option_text)
    return rows if matched else [[''] * 7]

# 标点
def punctuation(e_option_text: str) -> str:
    """Restore punctuation with ``MODEL`` and undo known wrong sentence breaks.

    Args:
        e_option_text: Decoded item-E text.

    Returns:
        The punctuated text after the rule-based corrections below.
    """
    e_option_text_punc = MODEL.restore_punctuation(e_option_text)
    # Drop a single separator (period, colon, space, ...) directly after a
    # runway/taxiway designator. The character class deliberately excludes
    # letters and digits: a bare "." would match any character and could
    # swallow the first character of the designator number or name.
    e_option_text_punc = re.sub(r"(Runway|RUNWAY|RWY|TWY)[^0-9A-Za-z]?", r"\1 ", e_option_text_punc)
    # Drop punctuation between a designator number and common non-initial words.
    e_option_text_punc = re.sub(r"([0-9LR/]+)[:\. ]*(?=Unserviceable|CLSD|Closed|CLS|closed)", r"\1 ", e_option_text_punc)
    # Collapse whitespace.
    e_option_text_punc = ' '.join(e_option_text_punc.split())
    # Drop a period inside "Aerodrome OPERATOR APPROVAL ONLY".
    e_option_text_punc = re.sub(r"(Aerodrome)[\. ]*(OPERATOR (?:APPROVAL|Approved) ONLY)", r"\1 \2", e_option_text_punc)
    # Drop a period inside "<time unit> Prior notice/permission" phrases.
    # NOTE(review): the first group has an empty alternative, so this also
    # fires when no time unit precedes "Prior ..." -- confirm that is intended.
    e_option_text_punc = re.sub(r"(MINUTES|Minutes|MIN|)[\. ]*((?:Prior|PRIOR) (?:NOTICE|Notice|NOTIFICATION|Permission|PERMISSION))", r"\1 \2", e_option_text_punc)
    # Collapse ".." into "." while protecting "..." behind a placeholder.
    e_option_text_punc = e_option_text_punc.replace("...", "呜呜呜")
    e_option_text_punc = e_option_text_punc.replace("..", '.')
    e_option_text_punc = e_option_text_punc.replace("呜呜呜", "...")
    # Hard-coded corrections for specific known bad cases.
    known_bad_cases = (
        ("IS PROHIBITED. FROM INTERSECTION Taxiway C1.", "IS PROHIBITED FROM INTERSECTION Taxiway C1."),
        ("CLOSED. ALL TRAINING AND VFR FLIGHTS.", "CLOSED ALL TRAINING AND VFR FLIGHTS."),
        ("Refer To METHOD OF WORKING. PLAN 001-22, STAGE 2B.", "Refer To METHOD OF WORKING PLAN 001-22, STAGE 2B."),
    )
    for wrong, corrected in known_bad_cases:
        e_option_text_punc = e_option_text_punc.replace(wrong, corrected)
    return e_option_text_punc

# 判断摘要
def summarization_or_not(e_option_text_punc: str) -> bool:
    """Decide whether a punctuated item-E text should be summarised.

    Currently a stub: the argument is ignored and the answer is always
    ``False``.
    """
    return False

# 摘要
# Summarisation pipeline, created on first use and then reused.
_SUMMARIZER = None


def summarization(text: str) -> str:
    """Summarise ``text`` with the ``sshleifer/distilbart-cnn-12-6`` model.

    The pipeline is built once and cached at module level instead of being
    reconstructed on every call.

    Args:
        text: Text to summarise.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summary = _SUMMARIZER(text, max_length=130, min_length=30, do_sample=False)
    return summary[0]["summary_text"]

# 分句
def sentence_tokenize(text: str) -> List[str]:
    """Split ``text`` into sentences, gluing entity-only sentences forward.

    A sentence that consists only of a runway/taxiway designator (for example
    ``"Runway 16R/34L."``) is not emitted on its own; it is prepended to the
    next sentence. Entity-only sentences that have no following sentence are
    emitted as they are rather than being dropped.

    Args:
        text: Punctuated text.

    Returns:
        The list of sentences.
    """
    # Load the punkt sentence splitter.
    sen_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    pending = ''
    for sentence_ori in sen_tokenizer.tokenize(text):
        if re.match(r"^(?:Runway|RUNWAY|RWY|TWY)[:\. ]?[0-9RL/]*\.$", sentence_ori):
            # Accumulate so several entity-only sentences in a row are all kept.
            pending += sentence_ori
        else:
            # NOTE(review): the fragments are joined without a separating
            # space, as before -- confirm the downstream patterns expect that.
            sentences.append(pending + sentence_ori)
            pending = ''
    if pending:
        # Trailing entity-only sentence(s): keep them instead of losing them.
        sentences.append(pending)
    return sentences

# 编码
def encode(sentence: str) -> str:
    """Return ``sentence`` unchanged (no encoding is applied at present)."""
    return sentence

# 单句解析
def sentence_parse(sentence_code: str):
    """Extract entity/action/reason/limit/source fields from one sentence.

    The input is first split by ``sentence_tools.preprocess_sentence_code``.
    For each resulting sentence, the patterns in ``pattern_ls`` are tried in
    order (case-insensitively) and the first one that matches produces one
    result row.

    Args:
        sentence_code: Sentence (or whole item-E text) to parse.

    Returns:
        A tuple ``(is_match, rows)``: ``is_match`` is ``True`` when at least
        one sentence matched; ``rows`` holds one list per matched sentence in
        the order entity, action, reason, limit, limit_wings, limit_weight,
        source, each value stripped of surrounding whitespace.
    """
    sentence_ls = sentence_tools.preprocess_sentence_code(sentence_code, action_words)
    fields = ['entity', 'action', 'reason', 'limit', 'limit_wings', 'limit_weight', 'source']
    # NOTE(review): 'limit_wings' and 'limit_weight' are never filled from a
    # match and therefore always stay empty -- confirm this is intended.
    captured_fields = ('entity', 'action', 'reason', 'limit', 'source')
    res_list_ls = []
    for sentence in sentence_ls:
        for pattern in pattern_ls:
            match = re.search(pattern, sentence, flags=re.I)
            if not match:
                continue
            # default="" keeps named groups that did not take part in the
            # match from yielding None, which would break the concatenation
            # and the strip() below.
            match_dict = match.groupdict(default="")
            res_dict = dict.fromkeys(fields, "")
            for key in captured_fields:
                if key in match_dict:
                    res_dict[key] = match_dict[key]
            if 'entity_supply' in match_dict:
                res_dict['entity'] = res_dict['entity'] + ' ' + match_dict['entity_supply']
            res_list_ls.append([res_dict[key].strip() for key in fields])
            # Only the first matching pattern is used for a sentence.
            break
    return bool(res_list_ls), res_list_ls

# 合并居中单元格并保存
def to_merge(df, excel_name):
    """Write ``df`` to an Excel file, merging cells of repeated E-item rows.

    Every run of consecutive rows sharing the same ``'E项'`` value has its
    first six columns merged vertically. Only consecutive rows are merged, so
    an E-item value that appears again further down the sheet forms its own
    separate merged range instead of one range spanning unrelated rows.

    Args:
        df: DataFrame to write; must contain an ``'E项'`` column.
        excel_name: Path of the workbook to create.
    """
    wb = Workbook()
    ws = wb.active
    # Write the header followed by every data row.
    for row in dataframe_to_rows(df, index=False, header=True):
        ws.append(row)

    keys = df['E项'].tolist()
    run_start = 0
    for pos in range(1, len(keys) + 1):
        if pos < len(keys) and keys[pos] == keys[run_start]:
            continue
        # Rows run_start .. pos-1 share one key. Single-row runs need no merge.
        if pos - run_start > 1:
            # Worksheet rows are 1-based and row 1 is the header, so data row
            # p lives on worksheet row p + 2.
            for col in range(1, 7):
                ws.merge_cells(start_row=run_start + 2, end_row=pos + 1, start_column=col, end_column=col)
        run_start = pos

    # save
    wb.save(excel_name)
    print('合并成功！')

/media/auto203/SSD2/JHY/python/enhanced-subject-verb-object-extraction-master




In [2]:
# Read the Excel data, then clean and decode every item-E text.
SHIT = "相对复杂"
NOTAM = pd.read_excel("data.xlsx", sheet_name=SHIT)

# Whole-column assignment is used instead of chained `NOTAM[col][i] = ...`
# writes, which go through an intermediate Series and are not guaranteed to
# update the DataFrame.
NOTAM["E项"] = NOTAM["E项"].map(notam_parse_single)
NOTAM["E项-人类语"] = NOTAM["E项"].map(notam_decode)


NOTAM

Unnamed: 0,类型,E项,E项-人类语
0,跑道数据/限制,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...
1,跑道数据/限制,RWY 16R/34L 2469M NORTH END NOT AVBL DUE WIP R...,Runway 16R/34L 2469M NORTH Stop-end NOT Availa...
2,跑道数据/限制,RWY 06/24 AVBL PPR 30 MIN CTC 514-633-3488 EXC...,Runway 06/24 Available Prior Permission Requir...
3,跑道数据/限制,RWY 16R/34L STOPBARS EVERY SECOND LGT NOT AVBL...,Runway 16R/34L STOPBARS EVERY SECOND Lighting ...
4,跑道数据/限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...
...,...,...,...
144,起落限制,UTAE . 1. LANDING CLEARANCE SHOULD BE REQUESTE...,UTAE . 1. LANDING CLEARANCE SHOULD BE REQUESTE...
145,起落限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...
146,起落限制,PILOTS CARRYING OUT FLT FROM/TO KHABAROVSK/NOV...,PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK/...
147,起落限制,RWY 10R/28L CLSD FOR ACFT TKOF AND LDG.,Runway 10R/28L Closed FOR Aircraft Take-off AN...


In [3]:
# Intermediate results: punctuated text, sentence split, and the decision
# (with its reason) on whether the sentence split is used for parsing.

verbs = action_words.split('|')
verbs_human = [notam_decode(verb) for verb in verbs]

# The three new columns are collected in lists and assigned in one go rather
# than written cell by cell with chained `NOTAM[col][i] = ...` assignments,
# which are not guaranteed to update the DataFrame.
punct_col, sentences_col, decision_col = [], [], []
for v, human in zip(NOTAM["E项"], NOTAM["E项-人类语"]):
    v_punct = ''
    sentences = ''
    if chinese_or_not(v):
        decision = "不分句，因为是中文"
    else:
        v_punct = punctuation(human)
        sentences = sentence_tokenize(v_punct)
        if bad_case_or_not(v):
            decision = "不使用分句，因为是bad_case，需要整体解析"
        elif len(sentences) == 1:
            decision = "不使用分句，因为只有一句话"
        elif len(re.findall(r"(RWY|TWY|Runway|RUNWAY)", v)) == 1:
            decision = "不使用分句，因为只有单一实体"
        else:
            # Count the sentences that contain at least one decoded action word.
            cnt = sum(1 for sentence in sentences if any(x in sentence for x in verbs_human))
            if cnt >= 2:
                decision = "使用分句"
            else:
                decision = "不使用分句，因为没有两句话及以上---存在动词表里的动词"
    punct_col.append(v_punct)
    sentences_col.append(sentences)
    decision_col.append(decision)

# dtype=object keeps each sentence list as a single cell value.
NOTAM["E项-人类语标点符号预测"] = pd.Series(punct_col, index=NOTAM.index, dtype=object)
NOTAM["E项-人类语分句"] = pd.Series(sentences_col, index=NOTAM.index, dtype=object)
NOTAM["是否使用分句及原因"] = pd.Series(decision_col, index=NOTAM.index, dtype=object)

NOTAM.to_excel("中间结果-" + SHIT + ".xlsx",index=False)


NOTAM

Unnamed: 0,类型,E项,E项-人类语,E项-人类语标点符号预测,E项-人类语分句,是否使用分句及原因
0,跑道数据/限制,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DECK...,[RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DEC...,使用分句
1,跑道数据/限制,RWY 16R/34L 2469M NORTH END NOT AVBL DUE WIP R...,Runway 16R/34L 2469M NORTH Stop-end NOT Availa...,Runway 16R/34L 2469M- NORTH Stop-end- NOT Avai...,[Runway 16R/34L 2469M- NORTH Stop-end- NOT Ava...,使用分句
2,跑道数据/限制,RWY 06/24 AVBL PPR 30 MIN CTC 514-633-3488 EXC...,Runway 06/24 Available Prior Permission Requir...,Runway 06/24 Available Prior Permission Requir...,[Runway 06/24 Available Prior Permission Requi...,使用分句
3,跑道数据/限制,RWY 16R/34L STOPBARS EVERY SECOND LGT NOT AVBL...,Runway 16R/34L STOPBARS EVERY SECOND Lighting ...,Runway 16R/34L- STOPBARS: EVERY SECOND Lightin...,[Runway 16R/34L- STOPBARS: EVERY SECOND Lighti...,使用分句
4,跑道数据/限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...,Runway 13/31 Open FOR Aircraft Operations WITH...,[Runway 13/31 Open FOR Aircraft Operations WIT...,使用分句
...,...,...,...,...,...,...
144,起落限制,UTAE . 1. LANDING CLEARANCE SHOULD BE REQUESTE...,UTAE . 1. LANDING CLEARANCE SHOULD BE REQUESTE...,UTAE: 1. LANDING CLEARANCE SHOULD BE REQUESTED...,"[UTAE: 1., LANDING CLEARANCE SHOULD BE REQUEST...",使用分句
145,起落限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...,Runway 13/31 Open FOR Aircraft Operations WITH...,[Runway 13/31 Open FOR Aircraft Operations WIT...,使用分句
146,起落限制,PILOTS CARRYING OUT FLT FROM/TO KHABAROVSK/NOV...,PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK/...,PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK/...,[PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK...,不使用分句，因为没有两句话及以上---存在动词表里的动词
147,起落限制,RWY 10R/28L CLSD FOR ACFT TKOF AND LDG.,Runway 10R/28L Closed FOR Aircraft Take-off AN...,Runway 10R/28L Closed FOR Aircraft Take-off AN...,[Runway 10R/28L Closed FOR Aircraft Take-off A...,不使用分句，因为只有一句话


In [4]:
# Final aggregation: parse every E item, then spread the results over one row
# per extracted result and one column per field.

cache_col = []
for v, human, decision in zip(NOTAM["E项"], NOTAM["E项-人类语"], NOTAM["是否使用分句及原因"]):
    if chinese_or_not(v):
        svo_all = chinese_svo(v)
    elif bad_case_or_not(v):
        svo_all = bad_case_svo(v)
    elif decision == "使用分句":
        svo_all = []
        # Parse every sentence of the E item once and collect all results.
        for k in sentence_tokenize(punctuation(human)):
            matched, rows = sentence_parse(k)
            if matched:
                svo_all.extend(rows)
    else:
        svo_all = sentence_parse(v)[1]

    # Serialise the results: the seven fields of one result are joined with
    # "/in/", the results with "/out/". str.join is used rather than
    # prepending "/out/" and calling lstrip("/out/"): lstrip removes a *set of
    # characters*, so it also ate the leading "/" of a result whose entity is
    # empty (shifting every field) and any leading '/', 'o', 'u', 't' of an
    # entity.
    cache_col.append("/out/".join("/in/".join(n[:7]) for n in svo_all))

# Assigned as a whole column instead of chained `NOTAM["cache"][i] = ...`.
NOTAM["cache"] = cache_col

# Split into multiple rows (one per result).
NOTAM["cache"] = NOTAM["cache"].str.split("/out/")
NOTAM = NOTAM.explode("cache")

# Split into multiple columns (one per field).
NOTAM_cache =NOTAM["cache"].str.split('/in/', expand=True)
NOTAM = NOTAM[~NOTAM.index.duplicated(keep="first")].drop(["cache"],axis=1).join(NOTAM_cache, how="right")

# rename
NOTAM.columns = ['类型', 'E项', 'E项-人类语', "E项-人类语标点符号预测", "E项-人类语分句", "是否使用分句及原因", '实体', '动作', "原因", "限制", "限制_翼展", "限制_重量", "来源"]

# Save, then re-read and merge the repeated cells.
excel_name = SHIT + ".xlsx"
NOTAM.to_excel(excel_name, index=False)
to_merge(pd.read_excel(excel_name), excel_name)


NOTAM

合并成功！


Unnamed: 0,类型,E项,E项-人类语,E项-人类语标点符号预测,E项-人类语分句,是否使用分句及原因,实体,动作,原因,限制,限制_翼展,限制_重量,来源
0,跑道数据/限制,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DECK...,[RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DEC...,使用分句,ACTUAL ACTIVITY KNOWLEDGE,"Available ON ATIS, 129125MHZ",,,,,
0,跑道数据/限制,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DECK...,[RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DEC...,使用分句,DURING SLOTS ACTIVITY: - POSSIBLE REGULATION ON,DEPARTURE AND ON ARRIVAL,,,,,
0,跑道数据/限制,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO ON RUNWAY S DECK ...,RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DECK...,[RUNWAY S RESTRICTIONS DUE TO: ON RUNWAY S DEC...,使用分句,Threshold 2M FROM THE Runway LEFT EDGE - UNBAS...,PROHIBITED,,WHEN LANDING MIRROR IN USE,,,
1,跑道数据/限制,RWY 16R/34L 2469M NORTH END NOT AVBL DUE WIP R...,Runway 16R/34L 2469M NORTH Stop-end NOT Availa...,Runway 16R/34L 2469M- NORTH Stop-end- NOT Avai...,[Runway 16R/34L 2469M- NORTH Stop-end- NOT Ava...,使用分句,Runway 16R/34L 2469M- NORTH Stop-end-,NOT Available,DUE Work In Progress,,,,
1,跑道数据/限制,RWY 16R/34L 2469M NORTH END NOT AVBL DUE WIP R...,Runway 16R/34L 2469M NORTH Stop-end NOT Availa...,Runway 16R/34L 2469M- NORTH Stop-end- NOT Avai...,[Runway 16R/34L 2469M- NORTH Stop-end- NOT Ava...,使用分句,From Runway 34L START OF Take-off AND MARKED B...,NOT Available,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,起落限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...,Runway 13/31 Open FOR Aircraft Operations WITH...,[Runway 13/31 Open FOR Aircraft Operations WIT...,使用分句,Runway 31 -,Limited TO Take-off Between 2200-1000 Daily AN...,,,,,
145,起落限制,RWY 13/31 OPN FOR ACFT OPS WITH THE FLW LIMITA...,Runway 13/31 Open FOR Aircraft Operations WITH...,Runway 13/31 Open FOR Aircraft Operations WITH...,[Runway 13/31 Open FOR Aircraft Operations WIT...,使用分句,Runway 13 -,"Limited TO Landing Between 2200-1000 Daily, Li...",,,,,
146,起落限制,PILOTS CARRYING OUT FLT FROM/TO KHABAROVSK/NOV...,PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK/...,PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK/...,[PILOTS CARRYING OUT Flight FROM/TO KHABAROVSK...,不使用分句，因为没有两句话及以上---存在动词表里的动词,,,,,,,
147,起落限制,RWY 10R/28L CLSD FOR ACFT TKOF AND LDG.,Runway 10R/28L Closed FOR Aircraft Take-off AN...,Runway 10R/28L Closed FOR Aircraft Take-off AN...,[Runway 10R/28L Closed FOR Aircraft Take-off A...,不使用分句，因为只有一句话,RWY 10R/28L,CLSD,,,,,
