In [6]:
from bs4 import BeautifulSoup

In [7]:
# test dataset
"""
<doc id=650677>
	<EN-summary>
		Think of the Internet of Things: Sci-fi scripts in life
	</EN-summary>
	<EN-summary-human-corrected>
		think of the internet of things : sci-fi scripts in life
	</EN-summary-human-corrected>
	<Back-Translated-ZH-summary>
		想想物联网：生活中的科幻剧本
	</Back-Translated-ZH-summary>
</doc>
"""
def test_data_parse(filename):
    """Parse a test-split file into document ids and English summaries.

    Every ``<doc>`` element found in the file contributes its ``id``
    attribute and the text of its ``en-summary-human-corrected`` child
    (lower-cased and stripped of surrounding whitespace).

    Args:
        filename: Path of the markup file to read.
            NOTE(review): the file is opened as UTF-8 because the sample
            documents contain Chinese text; confirm the corpus encoding.

    Returns:
        A tuple ``(ids, summaries)`` of two lists in document order; ids are
        the attribute values as returned by BeautifulSoup.

    Raises:
        AttributeError: if a ``<doc>`` has no ``en-summary-human-corrected``
            child (``find`` returns ``None``).
        KeyError: if a ``<doc>`` has no ``id`` attribute.
    """
    # The context manager releases the handle even when parsing fails; the
    # original never closed this file.
    with open(filename, encoding='utf-8') as xml:
        datapoint = BeautifulSoup(xml).find_all('doc')

    test_source_document_ids = []
    test_summary_english = []
    for doc in datapoint:
        test_source_document_ids.append(doc['id'])
        summary_tag = doc.find('en-summary-human-corrected')
        test_summary_english.append(summary_tag.text.lower().strip())
    return test_source_document_ids, test_summary_english

In [8]:
# valid and train dataset
"""
<doc id=1284817>
	<EN-summary>
		Communication, a lifeline that cannot be interrupted
	</EN-summary>
	<Back-Translated-ZH-summary>
		通信，一条不可中断的生命线
	</Back-Translated-ZH-summary>
</doc>
"""
def train_valid_data_parse(filename):
    """Parse a train or validation split file into ids and English summaries.

    Every ``<doc>`` element found in the file contributes its ``id``
    attribute and the text of its ``en-summary`` child (lower-cased and
    stripped of surrounding whitespace).

    Args:
        filename: Path of the markup file to read.
            NOTE(review): the file is opened as UTF-8 because the sample
            documents contain Chinese text; confirm the corpus encoding.

    Returns:
        A tuple ``(ids, summaries)`` of two lists in document order.

    Raises:
        AttributeError: if a ``<doc>`` has no ``en-summary`` child.
        KeyError: if a ``<doc>`` has no ``id`` attribute.
    """
    # `with` closes the file even if BeautifulSoup raises, which the manual
    # open()/close() pair did not guarantee.
    with open(filename, encoding='utf-8') as xml:
        datapoint = BeautifulSoup(xml).find_all('doc')

    train_source_document_ids = []
    train_summary_english = []
    for doc in datapoint:
        train_source_document_ids.append(doc['id'])
        train_summary_english.append(doc.find('en-summary').text.lower().strip())
    return train_source_document_ids, train_summary_english

In [9]:
# LCSTS dataset
"""
<doc id=650677>
    <summary>
        物联网随想：生活中的科幻剧本
    </summary>
    <short_text>
        你走到家门口，大门识别到你口袋里的密匙卡，为你自动解锁。外面很冷，但是门的另一边却是温暖舒适的23度，因为恒温器根据你离家的距离计算出你回家所需时间，提前点燃了壁炉。随着你踏入室内，嵌入式地灯照亮了通往厨房的道路。
    </short_text>
</doc>
"""
def read_source_document(filename):
    """Load a source-corpus file into a dict keyed by document id.

    Args:
        filename: Path of the markup file holding ``<doc>`` elements, each
            with ``<short_text>`` and ``<summary>`` children.
            NOTE(review): opened as UTF-8 since the content is Chinese text;
            confirm the corpus encoding.

    Returns:
        ``{doc_id: {'article': str, 'summary': str}}`` with both texts
        stripped of surrounding whitespace. When an id occurs more than
        once, the first occurrence is kept.

    Raises:
        AttributeError: if a ``<doc>`` lacks ``short_text`` or ``summary``.
        KeyError: if a ``<doc>`` has no ``id`` attribute.
    """
    # `with` closes the file even if parsing raises.
    with open(filename, 'r', encoding='utf-8') as xml:
        documents = BeautifulSoup(xml).find_all('doc')

    documents_dic = {}
    for doc in documents:
        doc_id = doc['id']
        if doc_id in documents_dic:
            # Duplicate id: keep the entry recorded first.
            continue
        entry = documents_dic[doc_id] = {}
        entry['article'] = doc.find('short_text').text.strip()
        entry['summary'] = doc.find('summary').text.strip()
    return documents_dic

def source_document_parse(documents_dic, document_ids):
    """Look up the article and Chinese summary for each requested id.

    Args:
        documents_dic: Mapping ``doc_id -> {'article': ..., 'summary': ...}``.
        document_ids: Iterable of ids to fetch, consumed once, in order.

    Returns:
        A tuple ``(articles, summaries)`` of two lists aligned with
        ``document_ids``.

    Raises:
        KeyError: if an id (or one of the two fields) is missing.
    """
    # Collect (article, summary) pairs in a single pass so that one-shot
    # iterables work, then split the pairs into two parallel lists.
    pairs = [
        (documents_dic[key]['article'], documents_dic[key]['summary'])
        for key in document_ids
    ]
    source_document = [article for article, _ in pairs]
    summary_chinese = [summary for _, summary in pairs]
    return source_document, summary_chinese

In [20]:
def write_files(source_document, summary_english, summary_chinese, phase, output_dir='data/raw'):
    """Write one split of the parallel corpus as three line-per-item files.

    Files created (each overwritten if it already exists):
        ``<output_dir>/<phase>.source.txt``     - stripped source documents
        ``<output_dir>/<phase>.summary.en.txt`` - English summaries with
            newlines removed, lower-cased and stripped
        ``<output_dir>/<phase>.summary.zh.txt`` - stripped Chinese summaries

    Args:
        source_document: Iterable of source document strings.
        summary_english: Iterable of English summary strings.
        summary_chinese: Iterable of Chinese summary strings.
        phase: Split name used as the file-name prefix (e.g. ``'train'``).
        output_dir: Target directory; defaults to the original hard-coded
            ``'data/raw'``. It is created if it does not exist.

    Note:
        Passing an empty iterable still truncates the corresponding file.
    """
    import os  # local import so the cell does not depend on a separate import cell

    # The original failed with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    def _write_lines(suffix, lines):
        # Explicit UTF-8: the default locale encoding may be unable to
        # represent the Chinese text on some platforms.
        path = os.path.join(output_dir, phase + suffix)
        with open(path, 'w', encoding='utf-8') as f:
            for line in lines:
                f.write('{}\n'.format(line))

    # write source document file
    _write_lines('.source.txt', (doc.strip() for doc in source_document))
    # write english summary file
    _write_lines(
        '.summary.en.txt',
        (summary.replace('\n', '').lower().strip() for summary in summary_english),
    )
    # write chinese summary file
    _write_lines('.summary.zh.txt', (summary.strip() for summary in summary_chinese))

In [11]:
# Parse the test split: document ids plus the lower-cased, human-corrected
# English summaries, in file order.
test_source_document_ids, test_summary_english = test_data_parse('./test/ZH2ENSUM_test.txt')

In [None]:
# Parse the train and validation splits: document ids plus lower-cased
# English summaries, in file order.
train_source_document_ids, train_summary_english = train_valid_data_parse('./drive-download-20191111T160803Z-001/ZH2ENSUM_train.txt')
valid_source_document_ids,valid_summary_english = train_valid_data_parse('./drive-download-20191111T160803Z-001/ZH2ENSUM_valid.txt')

In [82]:
# Load every <doc> element of the first source-corpus file into `documents`.
documents_dic = {}
filename='./LCSTS2.0/DATA/PART_I.txt'
# `with` closes the file even if parsing fails; UTF-8 is stated explicitly
# because the content is Chinese text (NOTE(review): confirm corpus encoding).
with open(filename, 'r', encoding='utf-8') as xml:
    documents = BeautifulSoup(xml).find_all('doc')

In [94]:
# Number of <doc> elements currently held in `documents`.
# NOTE(review): the recorded output comes from execution count 94, i.e. it was
# produced after the cells below had run, so it may not be the PART_I count alone.
len(documents)

2400591

In [90]:
# Load every <doc> element of the second source-corpus file into `documents2`.
filename='./LCSTS2.0/DATA/PART_1a.txt'
# `with` closes the file even if parsing fails; UTF-8 is stated explicitly
# because the content is Chinese text (NOTE(review): confirm corpus encoding).
with open(filename, 'r', encoding='utf-8') as xml:
    documents2 = BeautifulSoup(xml).find_all('doc')

In [91]:
# Number of <doc> elements parsed from PART_1a.txt.
len(documents2)

319379

In [92]:
# Append the PART_1a documents to the PART_I list, mutating `documents` in place.
# NOTE: not idempotent; re-running this cell appends `documents2` again.
documents.extend(documents2)

In [116]:
# Manually rebuild the <doc> with id 2081211: its summary contains an ampersand,
# which is not read correctly when the corpus file is parsed as HTML, so the
# element is re-parsed here from markup with the ampersand escaped as &amp;.
# NOTE(review): `documents[2081211]` addresses the list by position, not by the
# `id` attribute; presumably position and id coincide in this list. Confirm.
markup = '''<doc id=2081211>
    <summary>
        RIRI&amp;#M.A.C#圣诞限量彩妆系列
    </summary>
    <short_text>
        今个系列特别推出指甲油、亮泽防水眼线液、炫目珍珠眼影组合、古铜蜜粉饼，与及双头设计的眼影扫。以独特的珍珠白色为包装，与RiRi签名的玫瑰金装饰细节点缀。12月13日限量登场！
    </short_text>
</doc>'''
soup = BeautifulSoup(markup)
documents[2081211]=soup.find('doc')

In [119]:
# Index the parsed <doc> elements by id; the first occurrence of an id wins.
documents_dic = {}
for doc in documents:
    doc_id = doc['id']
    if doc_id in documents_dic:
        continue
    record = documents_dic[doc_id] = {}
    record['article'] = doc.find('short_text').text.strip()
    record['summary'] = doc.find('summary').text.strip()

## Writing raw data to files

In [125]:
# For each split: look up the source article and Chinese summary of every
# document id in `documents_dic`, then write the three aligned raw files
# (source, English summary, Chinese summary) via `write_files`.
# A KeyError is raised if an id is absent from `documents_dic`.
# creating raw train files
train_source_document, train_summary_chinese = source_document_parse(documents_dic, train_source_document_ids)
write_files(train_source_document, train_summary_english, train_summary_chinese, 'train')
# creating raw validation files
valid_source_document, valid_summary_chinese = source_document_parse(documents_dic, valid_source_document_ids)
write_files(valid_source_document, valid_summary_english, valid_summary_chinese, 'valid')
# creating raw test files
test_source_document, test_summary_chinese = source_document_parse(documents_dic, test_source_document_ids)
write_files(test_source_document, test_summary_english, test_summary_chinese, 'test')

In [130]:
# Cache the id -> {'article', 'summary'} dict on disk so later sessions can
# reload it with pickle instead of re-parsing the corpus files.
import pickle
with open('documents_dic.pkl','wb') as f:
    pickle.dump(documents_dic,f)

In [1]:
import pickle

In [12]:
# Reload the cached dict written by the pickle.dump cell.
# NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
with open('documents_dic.pkl','rb') as f:
    data = pickle.load(f)

In [16]:
# Rebind the unpickled object to the name the rest of the notebook uses.
documents_dic = data

In [18]:
# Re-parse the train split in this session.
# NOTE(review): this reads './test/ZH2ENSUM_train.txt', whereas the earlier cell
# read the train file from './drive-download-20191111T160803Z-001/'. Confirm
# which location is intended.
train_source_document_ids, train_summary_english = train_valid_data_parse('./test/ZH2ENSUM_train.txt')


In [19]:
# Number of English summaries parsed from the train split.
len(train_summary_english)

1693713

In [21]:
# Rewrite the train files using only the English summaries; the lookup of the
# source articles and Chinese summaries is disabled below.
# NOTE(review): `write_files` opens all three files in 'w' mode, so passing empty
# lists truncates train.source.txt and train.summary.zh.txt to empty files.
# Confirm that this is intended.
#train_source_document, train_summary_chinese = source_document_parse(documents_dic, train_source_document_ids)
train_source_document, train_summary_chinese = [],[]
write_files(train_source_document, train_summary_english, train_summary_chinese, 'train')