In [26]:
import os
import re
import string
import spacy
import json
import pandas
import numpy as np
import pickle as pkl
import xml.etree.ElementTree as ET

from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from process_text import process_text

In [27]:
nlp = spacy.load('en')
relevance2label = {'Good': 0, 'PotentiallyUseful': 1, 'Bad': 2}

In [28]:
def SemEval16or17_sample(filename):
    root = ET.parse(filename).getroot()
    for thread in root.findall('Thread'):
        question = thread.find('RelQuestion')
        
        q_id = question.get('RELQ_ID')
        q_category = question.get('RELQ_CATEGORY')
        q_date = question.get('RELQ_DATE')
        q_userid = question.get('RELQ_USERID')
        q_username = question.get('RELQ_USERNAME')
        
        q_subject = question.find('RelQSubject').text or ''
        q_body = question.find('RelQBody').text or ''
        
        for relcomment in thread.findall('RelComment'):
            c_id = relcomment.get('RELC_ID')
            c_date = relcomment.get('RELC_DATE')
            c_userid = relcomment.get('RELC_USERID')
            c_username = relcomment.get('RELC_USERNAME')
            Relevance = relcomment.get('RELC_RELEVANCE2RELQ')
            cTEXT = relcomment.find('RelCText').text or ''
#             if len(q_subject) == 0 or len(q_body) == 0 or len(cTEXT) == 0:
#                 print(c_id, len(q_subject), len(q_body), len(cTEXT))
#                 continue
            yield [q_id, c_id, q_category, q_subject, q_body, cTEXT, Relevance, q_userid, c_userid]

In [29]:
# create dataFrame
columns = ['q_id', 'c_id', 'q_category', 'q_subject', 'q_body', 'cTEXT', 'Relevance', 'q_userid', 'c_userid']

In [30]:
filepath = '../raw_data'
all_samples = {}
data_filename_list = os.listdir(filepath)

for name in data_filename_list:
    print('\n\t\t处理文件：%s\n' % name)
    filename = os.path.join(filepath, name)
    samples = SemEval16or17_sample(filename)
#     samples = [sample for sample in tqdm(samples)]
    all_samples[name] = pandas.DataFrame(columns=columns, data=samples)
    all_samples[name].head()


		处理文件：15dev.xml


		处理文件：15test.xml


		处理文件：15train.xml


		处理文件：16dev.xml


		处理文件：16test.xml


		处理文件：16train1.xml


		处理文件：16train2.xml


		处理文件：17test.xml



In [31]:
all_samples['15dev.xml'].head(5)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604
2,Q2481,Q2481_C3,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If the company is from Oil and Gas Industry or...,Good,U8902,U2316
3,Q2481,Q2481_C4,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,Transport in the city is a nightmare.,Good,U8902,U5547
4,Q2481,Q2481_C5,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,And life her is so relaxed that bachelors are ...,Bad,U8902,U5547


In [32]:
import copy


keys_lists = all_samples.keys()

for key in keys_lists:
    print(key)
    samples = all_samples[key]
    samples['cate_index'] = samples['q_category'].apply(lambda x: Qcategory_dic[x])
    samples['rel_index'] = samples['Relevance'].apply(lambda x: Relevance_dic[x])
    Relevance_dic_v2 = copy.copy(Relevance_dic)
    Relevance_dic_v2['Bad'] = 1
    samples['Rrel_index'] = samples['Relevance'].apply(lambda x: Relevance_dic_v2[x])
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [33]:
print(all_samples['15dev.xml'].iloc[1, :])

q_id                                                      Q2481
c_id                                                   Q2481_C2
q_category                                        Life in Qatar
q_subject                                   from DUBAI to QATAR
q_body        i am currently working here in dubai and i got...
cTEXT         depends on where the accommodation is.. how ma...
Relevance                                                  Good
q_userid                                                  U8902
c_userid                                                   U604
cate_index                                                    9
rel_index                                                     0
Rrel_index                                                    0
Name: 1, dtype: object


In [34]:
# tokenize (with lemm)
import nltk

wn_lemmatizer = nltk.stem.WordNetLemmatizer()

for key in keys_lists:
    print(key)
    samples = all_samples[key]
    
    def _tokenizer(x):
        if x is None:
            return ["[SEP]"]*2
        sent = nltk.word_tokenize(x.lower())
        return ["[SEP]"] + sent + ["[SEP]"]
    
    samples['q_sub_token'] = samples['q_subject'].apply(_tokenizer)
    samples['q_body_token'] = samples['q_body'].apply(_tokenizer)
    samples['cTEXT_token'] = samples['cTEXT'].apply(_tokenizer)

    
    samples['q_sub_lemma'] = samples['q_sub_token'].apply(lambda tokens: [wn_lemmatizer.lemmatize(x) for x in tokens])
    samples['q_body_lemma'] = samples['q_body_token'].apply(lambda tokens: [wn_lemmatizer.lemmatize(x) for x in tokens])
    samples['cTEXT_lemma'] = samples['cTEXT_token'].apply(lambda tokens: [wn_lemmatizer.lemmatize(x) for x in tokens])
    
    def _process_text(x):
        if x is None:
            return ["[SEP]"]*2
        sent = process_text(x)
        return ["[SEP]"] + sent + ["[SEP]"]
    
    samples['q_sub_lemma_pro'] = samples['q_subject'].apply(_process_text)
    samples['q_body_lemma_pro'] = samples['q_body'].apply(_process_text)
    samples['cTEXT_lemma_pro'] = samples['cTEXT'].apply(_process_text)
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [35]:
all_samples['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,cate_index,...,Rrel_index,q_sub_token,q_body_token,cTEXT_token,q_sub_lemma,q_body_lemma,cTEXT_lemma,q_sub_lemma_pro,q_body_lemma_pro,cTEXT_lemma_pro
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,9,...,1,"[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, its, ok, y...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, it, ok, yo...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, it, ok, yo..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,9,...,0,"[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation..."


In [36]:
char2index = {key: value+1 for value, key in enumerate(string.ascii_letters + string.digits + string.punctuation)}
char2index['//'] = len(char2index) + 1

def count_word_number(text, word_count):
    for token in text:
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1

def char_tokenizer(text):
    char_text = []
    for token in text:
        token_ = []
        if token == '[SEP]':
            token_.append(char2index['//'])
        else:
            for c in token:
                if c not in char2index:
                    char2index[c] = len(char2index)+1
                token_.append(char2index[c])
        char_text.append(token_)
    return char_text

def load_glove(filename):
    '''

    2018-11-14: add supporting word2vector

    '''
    word_dic = {}
    if 'glove' in filename:
        print('\nload word dictionary starting!')

        with open(filename, encoding='utf-8') as fr:
            lines = [line for line in fr]
            for line in lines:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                word_dic[word] = coefs

        print('load word dictionary ending!\n')
    else:
        print('\nload word dictionary starting!')
        with open(filename, 'rb') as fr:
            word_dic = pkl.load(fr, encoding='bytes')
        print('load word dictionary ending!\n')

    return word_dic

In [37]:
word_count_token = {}
word_count_lemma = {}
word_count_lemma_pro = {}

word_dic = load_glove('../assist/glove.vectors.window20.txt')

for key in keys_lists:
    print(key)
    samples = all_samples[key]
    samples['q_sub_token'].apply(count_word_number, args=(word_count_token,))
    samples['q_body_token'].apply(count_word_number, args=(word_count_token,))
    samples['cTEXT_token'].apply(count_word_number, args=(word_count_token,))
    
    samples['q_sub_lemma'].apply(count_word_number, args=(word_count_lemma,))
    samples['q_body_lemma'].apply(count_word_number, args=(word_count_lemma,))
    samples['cTEXT_lemma'].apply(count_word_number, args=(word_count_lemma,))
    
    samples['q_sub_lemma_pro'].apply(count_word_number, args=(word_count_lemma_pro,))
    samples['q_body_lemma_pro'].apply(count_word_number, args=(word_count_lemma_pro,))
    samples['cTEXT_lemma_pro'].apply(count_word_number, args=(word_count_lemma_pro,))


load word dictionary starting!
load word dictionary ending!

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [38]:
savepath = '../assistFORtri'
max_vocab = 10000  # 最大词汇数量

print('\n--------------------------------------')
word_count_ = sorted(word_count_token.items(), key=lambda x: x[1], reverse=True)
print('\t获得word2index并保存, 总词数为{}'.format(len(word_count_)))
with open(os.path.join(savepath, 'word_count_token.json'), 'w') as fw:
    json.dump(word_count_, fw)

word_count_ = word_count_[:max_vocab]
word2index_token = {word: index + 1 for index, (word, _) in enumerate(word_count_)}
print('\t\t总词数为：%d' % len(word2index_token))
with open(os.path.join(savepath, 'word2index_token.json'), 'w') as fw:
    json.dump(word2index_token, fw)

    
print('\n--------------------------------------')
word_count_ = sorted(word_count_lemma.items(), key=lambda x: x[1], reverse=True)
print('\t获得word2index并保存, 总词数为{}'.format(len(word_count_)))
with open(os.path.join(savepath, 'word_count_lemma.json'), 'w') as fw:
    json.dump(word_count_, fw)

word_count_ = word_count_[:max_vocab]
word2index_lemma = {word: index + 1 for index, (word, _) in enumerate(word_count_)}
print('\t\t总词数为：%d' % len(word2index_lemma))
with open(os.path.join(savepath, 'word2index_lemma.json'), 'w') as fw:
    json.dump(word2index_lemma, fw)

    
print('\n--------------------------------------')
word_count_ = sorted(word_count_lemma_pro.items(), key=lambda x: x[1], reverse=True)
print('\t获得word2index并保存, 总词数为{}'.format(len(word_count_)))
with open(os.path.join(savepath, 'word_count_lemma_pro.json'), 'w') as fw:
    json.dump(word_count_, fw)

word_count_ = word_count_[:max_vocab]
word2index_lemma_pro = {word: index + 1 for index, (word, _) in enumerate(word_count_)}
print('\t\t总词数为：%d' % len(word2index_lemma_pro))
with open(os.path.join(savepath, 'word2index_lemma_pro.json'), 'w') as fw:
    json.dump(word2index_lemma_pro, fw)


--------------------------------------
	获得word2index并保存, 总词数为66616
		总词数为：10000

--------------------------------------
	获得word2index并保存, 总词数为62767
		总词数为：10000

--------------------------------------
	获得word2index并保存, 总词数为41544
		总词数为：10000


In [39]:
all_samples['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,cate_index,...,Rrel_index,q_sub_token,q_body_token,cTEXT_token,q_sub_lemma,q_body_lemma,cTEXT_lemma,q_sub_lemma_pro,q_body_lemma_pro,cTEXT_lemma_pro
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,9,...,1,"[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, its, ok, y...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, it, ok, yo...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, it, ok, yo..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,9,...,0,"[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation...","[[SEP], from, dubai, to, qatar, [SEP]]","[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation..."


In [40]:
for key in keys_lists:
    print(key)
    samples = all_samples[key]
    token_count = len(word2index_token)
    token_replace_func = lambda sent: [word2index_token.get(x, token_count+1) 
                                       for x in sent]
    samples['q_sub_token_index'] = samples['q_sub_token'].apply(token_replace_func)
    samples['q_body_token_index'] = samples['q_body_token'].apply(token_replace_func)
    samples['cTEXT_token_index'] = samples['cTEXT_token'].apply(token_replace_func)
    
    lemma_count = len(word2index_lemma)
    lemma_replace_func = lambda sent: [word2index_lemma.get(x, lemma_count+1)
                                       for x in sent]
    samples['q_sub_lemma_index'] = samples['q_sub_lemma'].apply(lemma_replace_func)
    samples['q_body_lemma_index'] = samples['q_body_lemma'].apply(lemma_replace_func)
    samples['cTEXT_lemma_index'] = samples['cTEXT_lemma'].apply(lemma_replace_func)
    
    lemma_pro_count = len(word2index_lemma_pro)
    lemma_pro_replace_func = lambda sent: [word2index_lemma_pro.get(x, lemma_pro_count+1)
                                           for x in sent]
    samples['q_sub_lemma_pro_index'] = samples['q_sub_lemma_pro'].apply(lemma_pro_replace_func)
    samples['q_sub_lemma_pro_index'] = samples['q_sub_lemma_pro'].apply(lemma_pro_replace_func)
    samples['cTEXT_lemma_pro_index'] = samples['cTEXT_lemma_pro'].apply(lemma_pro_replace_func)

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [41]:
all_samples['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,cate_index,...,q_body_lemma_pro,cTEXT_lemma_pro,q_sub_token_index,q_body_token_index,cTEXT_token_index,q_sub_lemma_index,q_body_lemma_index,cTEXT_lemma_index,q_sub_lemma_pro_index,cTEXT_lemma_pro_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,9,...,"[[SEP], i, am, currently, working, here, in, d...","[[SEP], if, you, are, single, then, it, ok, yo...","[1, 42, 210, 5, 23, 1]","[1, 6, 58, 434, 151, 50, 9, 210, 10, 6, 125, 7...","[1, 24, 12, 21, 426, 119, 97, 345, 12, 18, 550...","[1, 42, 219, 5, 23, 1]","[1, 6, 57, 457, 156, 50, 9, 219, 10, 6, 132, 7...","[1, 24, 12, 21, 429, 124, 14, 350, 12, 18, 572...","[1, 42, 218, 5, 23, 1]","[1, 24, 13, 20, 421, 122, 15, 308, 13, 17, 569..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,9,...,"[[SEP], i, am, currently, working, here, in, d...","[[SEP], depends, on, where, the, accommodation...","[1, 42, 210, 5, 23, 1]","[1, 6, 58, 434, 151, 50, 9, 210, 10, 6, 125, 7...","[1, 634, 29, 60, 4, 682, 5502, 48, 129, 110, 3...","[1, 42, 219, 5, 23, 1]","[1, 6, 57, 457, 156, 50, 9, 219, 10, 6, 132, 7...","[1, 655, 29, 59, 4, 668, 5050, 48, 134, 111, 3...","[1, 42, 218, 5, 23, 1]","[1, 659, 28, 60, 3, 663, 11, 2, 49, 136, 111, ..."


In [42]:
print('\n--------------------------------------')
print('\t作嵌入矩阵并保存\n')
dim = word_dic['word'].shape[0]
embedding_matrix_token = np.random.randn(len(word2index_token) + 1, dim)
embedding_matrix_token[0] = np.zeros((dim, ), dtype='float32')
for word, index in word2index_token.items():
    if word in word_dic:
        embedding_matrix_token[index] = word_dic[word]
with open(os.path.join(savepath, 'embedding_matrix_token.pkl'), 'wb') as fw:
    pkl.dump(embedding_matrix_token, fw)

    
print('\n--------------------------------------')
print('\t作嵌入矩阵并保存\n')
embedding_matrix_lemma = np.random.randn(len(word2index_lemma) + 1, dim)
embedding_matrix_lemma[0] = np.zeros((dim, ), dtype='float32')
for word, index in word2index_lemma.items():
    if word in word_dic:
        embedding_matrix_lemma[index] = word_dic[word]
with open(os.path.join(savepath, 'embedding_matrix_lemma.pkl'), 'wb') as fw:
    pkl.dump(embedding_matrix_lemma, fw)

    
print('\n--------------------------------------')
print('\t作嵌入矩阵并保存\n')
embedding_matrix_lemma_pro = np.random.randn(len(word2index_lemma_pro) + 1, dim)
embedding_matrix_lemma_pro[0] = np.zeros((dim, ), dtype='float32')
for word, index in word2index_lemma_pro.items():
    if word in word_dic:
        embedding_matrix_lemma_pro[index] = word_dic[word]
with open(os.path.join(savepath, 'embedding_matrix_lemma_pro.pkl'), 'wb') as fw:
    pkl.dump(embedding_matrix_lemma_pro, fw)


--------------------------------------
	作嵌入矩阵并保存


--------------------------------------
	作嵌入矩阵并保存


--------------------------------------
	作嵌入矩阵并保存



In [43]:
for key in keys_lists:
    print(key)
    samples = all_samples[key]
    process_char = lambda sent: pad_sequences(char_tokenizer(sent), maxlen=20, padding='post', truncating='post')
    samples['q_sub_token_char_index'] = samples['q_sub_token'].apply(process_char)
    samples['q_body_token_char_index'] = samples['q_body_token'].apply(process_char)
    samples['cTEXT_token_char_index'] = samples['cTEXT_token'].apply(process_char)
    
    samples['q_sub_lemma_char_index'] = samples['q_sub_lemma'].apply(process_char)
    samples['q_body_lemma_char_index'] = samples['q_body_lemma'].apply(process_char)
    samples['cTEXT_lemma_char_index'] = samples['cTEXT_lemma'].apply(process_char)
    
    samples['q_sub_lemma_pro_char_index'] = samples['q_sub_lemma_pro'].apply(process_char)
    samples['q_body_lemma_pro_char_index'] = samples['q_body_lemma_pro'].apply(process_char)
    samples['cTEXT_lemma_pro_char_index'] = samples['cTEXT_lemma_pro'].apply(process_char)

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [45]:
all_samples['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,cate_index,...,cTEXT_lemma_pro_index,q_sub_token_char_index,q_body_token_char_index,cTEXT_token_char_index,q_sub_lemma_char_index,q_body_lemma_char_index,cTEXT_lemma_char_index,q_sub_lemma_pro_char_index,q_body_lemma_pro_char_index,cTEXT_lemma_pro_char_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,9,...,"[1, 24, 13, 20, 421, 122, 15, 308, 13, 17, 569...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,9,...,"[1, 659, 28, 60, 3, 663, 11, 2, 49, 136, 111, ...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [46]:
print('\n--------------------------------------')
print('\tsave char2index')
print('\t\tthe number of charactor：%d' % len(char2index))
with open(os.path.join(savepath, 'char2index.json'), 'w') as fw:
    json.dump(char2index, fw)


--------------------------------------
	save char2index
		the number of charactor：334


In [48]:
for key in keys_lists:
    print(key)
    samples = all_samples[key]
    
    samples['q_sub_len'] = samples['q_sub_token'].apply(len)
    samples['q_body_len'] = samples['q_body_token'].apply(len)
    samples['cTEXT_len'] = samples['cTEXT_token'].apply(len)
    
    samples['q_sub_pro_len'] = samples['q_sub_lemma_pro'].apply(len)
    samples['q_body_pro_len'] = samples['q_body_lemma_pro'].apply(len)
    samples['cTEXT_pro_len'] = samples['cTEXT_lemma_pro'].apply(len)

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [50]:
for key in keys_lists:
    print('\n\n', key)
    samples = all_samples[key]
    print('------  q_sub length -------')
    print(samples['q_sub_len'].describe([0.8, 0.9, 0.95, 0.98]))
    print('------  q_body length -------')
    print(samples['q_body_len'].describe([0.8, 0.9, 0.95, 0.98]))
    print('------  cTEXT length -------')
    print(samples['cTEXT_len'].describe([0.8, 0.9, 0.95, 0.98]))
    
    print("==================================================================")
    print('------  q_sub lemma pro length -------')
    print(samples['q_sub_pro_len'].describe([0.8, 0.9, 0.95, 0.98]))
    print('------  q_body lemma pro length -------')
    print(samples['q_body_pro_len'].describe([0.8, 0.9, 0.95, 0.98]))
    print('------  cTEXT lemma pro length -------')
    print(samples['cTEXT_pro_len'].describe([0.8, 0.9, 0.95, 0.98]))



 15dev.xml
------  q_sub length -------
count    1529.000000
mean        7.604971
std         3.755223
min         3.000000
50%         7.000000
80%        11.000000
90%        12.200000
95%        14.000000
98%        18.000000
max        23.000000
Name: q_sub_len, dtype: float64
------  q_body length -------
count    1529.000000
mean       42.107260
std        21.675353
min         5.000000
50%        39.000000
80%        63.000000
90%        75.000000
95%        81.000000
98%        85.000000
max       106.000000
Name: q_body_len, dtype: float64
------  cTEXT length -------
count    1529.000000
mean       36.385219
std        48.396906
min         3.000000
50%        25.000000
80%        52.000000
90%        77.000000
95%        98.000000
98%       132.000000
max      1348.000000
Name: cTEXT_len, dtype: float64
------  q_sub lemma pro length -------
count    1529.000000
mean        7.647482
std         3.789192
min         3.000000
50%         7.000000
80%        10.000000
90%    

Name: q_sub_pro_len, dtype: float64
------  q_body lemma pro length -------
count    3270.000000
mean       50.097859
std        26.400617
min         2.000000
50%        48.000000
80%        77.000000
90%        87.000000
95%        94.000000
98%       100.000000
max       112.000000
Name: q_body_pro_len, dtype: float64
------  cTEXT lemma pro length -------
count    3270.000000
mean       39.858716
std        37.750403
min         3.000000
50%        28.000000
80%        60.000000
90%        86.000000
95%       115.000000
98%       161.000000
max       238.000000
Name: cTEXT_pro_len, dtype: float64


 16train1.xml
------  q_sub length -------
count    14110.000000
mean         8.439405
std          3.583256
min          3.000000
50%          8.000000
80%         11.000000
90%         13.000000
95%         15.000000
98%         18.000000
max         28.000000
Name: q_sub_len, dtype: float64
------  q_body length -------
count    14110.000000
mean        52.907158
std         26.236666

In [51]:
with open(os.path.join('../data', 'datasetFORtri.pkl'), 'wb') as fw:
    pkl.dump(all_samples, fw)