In [34]:
import os
import re
import string
import spacy
import json
import pandas
import numpy as np
import pickle as pkl
import xml.etree.ElementTree as ET

from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from process_text import process_text

In [35]:
# Load the spaCy pipeline registered under the name 'en'.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en')
# Relevance label -> integer id.
# NOTE(review): the visible cells read the label mapping from Relevance_dic.json instead of this dict.
relevance2label = {'Good': 0, 'PotentiallyUseful': 1, 'Bad': 2}


In [36]:
def SemEval16or17_sample(filename):
    """Yield one flat record per (question, comment) pair found in an XML file.

    Parameters
    ----------
    filename : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``. The root element
        is searched for ``Thread`` children; each thread is expected to hold a
        ``RelQuestion`` element and any number of ``RelComment`` elements.

    Yields
    ------
    list
        ``[q_id, c_id, q_category, q_subject, q_body, cTEXT, Relevance,
        q_userid, c_userid]``. Values read from XML attributes are ``None``
        when the attribute is absent. The three text fields are ``''`` when
        the corresponding child element is empty or missing.

    Raises
    ------
    AttributeError
        If a ``Thread`` has no ``RelQuestion`` child.
    """
    root = ET.parse(filename).getroot()
    for thread in root.findall('Thread'):
        question = thread.find('RelQuestion')

        q_id = question.get('RELQ_ID')
        q_category = question.get('RELQ_CATEGORY')
        q_userid = question.get('RELQ_USERID')

        # findtext() gives None for a missing child instead of failing on
        # `.text`, so `or ''` covers both "missing" and "present but empty".
        q_subject = question.findtext('RelQSubject') or ''
        q_body = question.findtext('RelQBody') or ''

        for relcomment in thread.findall('RelComment'):
            c_id = relcomment.get('RELC_ID')
            c_userid = relcomment.get('RELC_USERID')
            Relevance = relcomment.get('RELC_RELEVANCE2RELQ')
            cTEXT = relcomment.findtext('RelCText') or ''
            yield [q_id, c_id, q_category, q_subject, q_body, cTEXT, Relevance, q_userid, c_userid]

In [37]:
# Column names for the per-file DataFrames, listed in the same order as the
# fields of each record yielded by SemEval16or17_sample.
columns = ['q_id', 'c_id', 'q_category', 'q_subject', 'q_body', 'cTEXT', 'Relevance', 'q_userid', 'c_userid']

In [38]:
# Parse every file found in the raw-data directory into its own DataFrame,
# keyed by file name.
filepath = '../raw_data'
all_samples = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing order (and the key order of all_samples) stable between runs.
data_filename_list = sorted(os.listdir(filepath))

for name in data_filename_list:
    print('\n\t\t处理文件：%s\n' % name)
    filename = os.path.join(filepath, name)
    # Materialise the generator so the frame is built from a concrete list of rows.
    samples = list(SemEval16or17_sample(filename))
    all_samples[name] = pandas.DataFrame(columns=columns, data=samples)


		处理文件：15dev.xml


		处理文件：15test.xml


		处理文件：15train.xml


		处理文件：16dev.xml


		处理文件：16test.xml


		处理文件：16train1.xml


		处理文件：16train2.xml


		处理文件：17test.xml



In [39]:
# Preview the first five parsed rows of '15dev.xml'.
all_samples['15dev.xml'].head(5)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604
2,Q2481,Q2481_C3,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If the company is from Oil and Gas Industry or...,Good,U8902,U2316
3,Q2481,Q2481_C4,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,Transport in the city is a nightmare.,Good,U8902,U5547
4,Q2481,Q2481_C5,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,And life her is so relaxed that bachelors are ...,Bad,U8902,U5547


In [40]:
def mark_func(x):
    """Return 1 when the row's comment author equals the question author, else 0."""
    return 1 if x['q_userid'] == x['c_userid'] else 0


def mark_func2(x):
    """Return 1 when the question author's own comment is labelled 'Good', else 0."""
    same_user = x['q_userid'] == x['c_userid']
    return 1 if same_user and x['Relevance'] == 'Good' else 0


keys_lists = all_samples.keys()
all_samples_concate = {}
for key in keys_lists:
    print(key)
    samples = all_samples[key]

    qc_mark = samples.apply(mark_func, axis=1)
    qc_mark2 = samples.apply(mark_func2, axis=1)
    # Only the first flag is stored on the frame; the second one is just
    # counted in the report below.
    samples['qc_mark'] = qc_mark

    separator = '-----------------------------------'
    print(separator)
    print(qc_mark.sum(), qc_mark2.sum())
    print(separator + '\n\n')

15dev.xml
-----------------------------------
191 23
-----------------------------------


15test.xml
-----------------------------------
264 31
-----------------------------------


15train.xml
-----------------------------------
2199 318
-----------------------------------


16dev.xml
-----------------------------------
393 40
-----------------------------------


16test.xml
-----------------------------------
560 54
-----------------------------------


16train1.xml
-----------------------------------
2152 229
-----------------------------------


16train2.xml
-----------------------------------
584 58
-----------------------------------


17test.xml
-----------------------------------
480 67
-----------------------------------




In [41]:
# For every file, list the ids of the comments flagged with qc_mark == 1
# (comment written by the question's own author).
keys_lists = all_samples.keys()
all_samples_concate = {}
for key in keys_lists:
    print(key)
    samples = all_samples[key]
    # A boolean mask selects the same rows as taking group 1 of a groupby, but
    # prints an empty Series instead of raising KeyError when a file has no
    # flagged comment.
    print(samples.loc[samples['qc_mark'] == 1, 'c_id'])
    print('===================\n\n')

15dev.xml
10      Q2481_C11
16       Q2483_C1
50       Q2491_C2
58       Q2492_C6
60       Q2492_C8
61       Q2492_C9
65       Q2493_C2
67       Q2493_C4
71       Q2494_C2
73       Q2494_C4
79       Q2497_C1
81       Q2497_C3
84       Q2498_C3
95       Q2501_C3
122      Q2507_C7
162      Q2514_C5
186      Q2522_C3
197      Q2526_C2
198      Q2526_C3
200      Q2526_C5
205      Q2527_C3
220      Q2532_C2
235      Q2535_C5
261      Q2538_C3
280      Q2541_C4
282      Q2542_C2
283      Q2542_C3
291      Q2544_C1
298      Q2544_C8
299      Q2544_C9
          ...    
1270    Q2724_C16
1284     Q2729_C2
1289     Q2730_C5
1292     Q2731_C3
1307     Q2733_C8
1308     Q2733_C9
1312     Q2734_C4
1324     Q2737_C2
1325     Q2737_C3
1328     Q2737_C6
1333     Q2738_C4
1344    Q2739_C10
1345    Q2739_C11
1351     Q2740_C5
1359     Q2743_C3
1387     Q2747_C6
1393     Q2749_C1
1395     Q2749_C3
1411     Q2753_C2
1412     Q2753_C3
1415     Q2753_C6
1417     Q2753_C8
1429     Q2755_C9
1431     Q2756_C1


# Concatenate q_subject and q_body

In [42]:
# Add qTEXT = "<q_subject> S_E_P <q_body>" to every frame and register the
# frame under the same key in all_samples_concate.
keys_lists = all_samples.keys()
all_samples_concate = {}
for key, samples in all_samples.items():
    print(key)
    subject_with_separator = samples['q_subject'] + ' S_E_P '
    samples['qTEXT'] = subject_with_separator + samples['q_body']
    # The very same DataFrame object is stored (no copy), so columns added
    # later are visible through both all_samples and all_samples_concate.
    all_samples_concate[key] = samples
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [43]:
# Print every column of the row at position 1 of '15dev.xml' to check the new qTEXT field.
print(all_samples_concate['15dev.xml'].iloc[1, :])

q_id                                                      Q2481
c_id                                                   Q2481_C2
q_category                                        Life in Qatar
q_subject                                   from DUBAI to QATAR
q_body        i am currently working here in dubai and i got...
cTEXT         depends on where the accommodation is.. how ma...
Relevance                                                  Good
q_userid                                                  U8902
c_userid                                                   U604
qc_mark                                                       0
qTEXT         from DUBAI to QATAR S_E_P i am currently worki...
Name: 1, dtype: object


In [44]:
# Lookup tables read from JSON: question category -> index and
# relevance label -> index.
assist_path = '../assist'

with open(os.path.join(assist_path, 'Qcategory_dic.json')) as fq:
    Qcategory_dic = json.load(fq)

with open(os.path.join(assist_path, 'Relevance_dic.json')) as fr:
    Relevance_dic = json.load(fr)

In [45]:
import copy

# Second relevance mapping in which 'Bad' is re-pointed to index 1.
# It does not depend on the file being processed, so it is built once, before
# the loop, instead of on every iteration.
Relevance_dic_v2 = copy.copy(Relevance_dic)
Relevance_dic_v2['Bad'] = 1

for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    # Plain dict indexing keeps the KeyError on a category/label that is
    # missing from the lookup tables (a .map() would silently produce NaN).
    samples['cate_index'] = samples['q_category'].apply(lambda x: Qcategory_dic[x])
    samples['rel_index'] = samples['Relevance'].apply(lambda x: Relevance_dic[x])
    samples['Rrel_index'] = samples['Relevance'].apply(lambda x: Relevance_dic_v2[x])
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [46]:
# Preview the first two rows of '15dev.xml' with the index columns added so far.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,qTEXT,cate_index,rel_index,Rrel_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,from DUBAI to QATAR S_E_P i am currently worki...,9,1,1
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,from DUBAI to QATAR S_E_P i am currently worki...,9,0,0


In [47]:
# Summary statistics of the two label encodings for '16train1.xml':
# rel_index first, then Rrel_index (the variant with 'Bad' mapped to 1).
print(all_samples_concate['16train1.xml']['rel_index'].describe())
print(all_samples_concate['16train1.xml']['Rrel_index'].describe())

count    14110.000000
mean         1.076187
std          0.905449
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          2.000000
Name: rel_index, dtype: float64
count    14110.000000
mean         0.625301
std          0.484062
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: Rrel_index, dtype: float64


In [48]:
# Inspect the rows of question Q136 in '15train.xml'. The recorded output
# shows an empty q_subject there, so qTEXT starts directly with the separator.
test_samples = all_samples_concate['15train.xml']
cond = test_samples['q_id'] == 'Q136'
print(test_samples[cond])


     q_id     c_id           q_category q_subject  \
784  Q136  Q136_C1  Qatar Living Lounge             

                                                q_body       cTEXT Relevance  \
784  Mick Hucknall apologises to 1;000 women he sle...  Who knows?       Bad   

    q_userid c_userid  qc_mark  \
784     U268    U5547        0   

                                                 qTEXT  cate_index  rel_index  \
784   S_E_P Mick Hucknall apologises to 1;000 women...           3          2   

     Rrel_index  
784           1  


In [49]:
# Tokenise and lemmatise the question / comment texts
import nltk

wn_lemmatizer = nltk.stem.WordNetLemmatizer()


def seg_transformer(sent):
    """Build segment ids for a question token list.

    Tokens up to and including the first 's_e_p' get 0, all later tokens get 1.
    Raises ValueError (from ``list.index``) when 's_e_p' is not in ``sent``.
    """
    index = sent.index('s_e_p')
    first_segment_len = index + 1
    return [0] * first_segment_len + [1] * (len(sent) - first_segment_len)


def _tokenize_lower(text):
    """Lower-case ``text`` and split it with nltk.word_tokenize."""
    return nltk.word_tokenize(text.lower())


def _lemmatize_tokens(tokens):
    """Run every token through the WordNet lemmatizer."""
    return [wn_lemmatizer.lemmatize(token) for token in tokens]


def _position_ids(tokens):
    """Return [0, 1, ..., len(tokens) - 1]."""
    return list(range(len(tokens)))


for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    samples['qTEXT_token'] = samples['qTEXT'].apply(_tokenize_lower)
    samples['cTEXT_token'] = samples['cTEXT'].apply(_tokenize_lower)

    # Position of every token inside its own sequence
    samples['q_position_index'] = samples['qTEXT_token'].apply(_position_ids)
    samples['c_position_index'] = samples['cTEXT_token'].apply(_position_ids)

    # Segment ids (part before / after the separator) of the question
    samples['q_segment_index'] = samples['qTEXT_token'].apply(seg_transformer)

    samples['qTEXT_lemma'] = samples['qTEXT_token'].apply(_lemmatize_tokens)
    samples['cTEXT_lemma'] = samples['cTEXT_token'].apply(_lemmatize_tokens)

    # NOTE(review): the recorded output shows process_text returning the
    # separator as the five tokens 's', '_', 'e', '_', 'p', so the *_lemma_pro
    # columns contain no single 's_e_p' token — confirm this is intended.
    samples['qTEXT_lemma_pro'] = samples['qTEXT'].apply(process_text)
    samples['cTEXT_lemma_pro'] = samples['cTEXT'].apply(process_text)
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [50]:
# Preview the token, position, segment and lemma columns that were just added.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,Rrel_index,qTEXT_token,cTEXT_token,q_position_index,c_position_index,q_segment_index,qTEXT_lemma,cTEXT_lemma,qTEXT_lemma_pro,cTEXT_lemma_pro
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,1,"[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, its, ok, you, can...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, it, ok, you, can,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[if, you, are, single, then, it, ok, you, can,..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,0,"[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[depends, on, where, the, accommodation, is, ...."


In [51]:
# Character vocabulary: ASCII letters, digits and punctuation are numbered
# from 1 upwards in that order.
char2index = {
    char: position
    for position, char in enumerate(string.ascii_letters + string.digits + string.punctuation, start=1)
}
# '//' takes the next free id; char_tokenizer uses it for the 's_e_p' token.
char2index['//'] = 1 + len(char2index)

def count_word_number(text, word_count):
    """Add the frequency of every token in ``text`` to ``word_count`` in place.

    Parameters
    ----------
    text : iterable
        Tokens to count.
    word_count : dict
        Running token -> count mapping; it is modified, nothing is returned.
    """
    for token in text:
        word_count[token] = word_count.get(token, 0) + 1

def char_tokenizer(text):
    """Turn a list of tokens into a list of character-id lists.

    The token 's_e_p' is encoded as the single id stored under '//' in the
    module-level ``char2index``. For every other token each character is
    looked up in ``char2index``; a character that is not present yet is
    registered with the next free id, so ``char2index`` grows as a side effect.

    Parameters
    ----------
    text : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list of list of int
        One id list per token, in the original token order.
    """
    def encode_token(token):
        if token == 's_e_p':
            return [char2index['//']]
        ids = []
        for char in token:
            # setdefault only inserts when the character is unseen
            ids.append(char2index.setdefault(char, len(char2index) + 1))
        return ids

    return [encode_token(token) for token in text]

def load_glove(filename):
    """Load a word -> vector dictionary from disk.

    When the substring 'glove' occurs in ``filename`` the file is read as
    UTF-8 text with one entry per line: the word followed by whitespace
    separated float values. Otherwise the file is unpickled and its content
    is returned as is (2018-11-14: added to support word2vec dumps).

    Parameters
    ----------
    filename : str
        Path of the vector file.

    Returns
    -------
    dict
        For text files: word -> 1-D float32 numpy array. For pickles:
        whatever object the pickle contains.
    """
    print('\nload word dictionary starting!')
    if 'glove' in filename:
        word_dic = {}
        with open(filename, encoding='utf-8') as fr:
            # Iterate over the file directly instead of first copying every
            # line into a list; the vector files can be large.
            for line in fr:
                values = line.split()
                if not values:
                    # Blank line (e.g. trailing newline at the end of the file)
                    continue
                word_dic[values[0]] = np.asarray(values[1:], dtype='float32')
    else:
        # NOTE: unpickling can execute arbitrary code — only load trusted files.
        with open(filename, 'rb') as fr:
            word_dic = pkl.load(fr, encoding='bytes')
    print('load word dictionary ending!\n')

    return word_dic

In [52]:
# One frequency table per text variant, filled from the question and comment
# columns of every file.
word_count_token = {}
word_count_lemma = {}
word_count_lemma_pro = {}

word_dic = load_glove('../assist/glove.vectors.pro2.win20.txt')

_count_targets = (
    ('token', word_count_token),
    ('lemma', word_count_lemma),
    ('lemma_pro', word_count_lemma_pro),
)

for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    for variant, counter in _count_targets:
        # Question column first, then comment column
        for side in ('q', 'c'):
            for tokens in samples['%sTEXT_%s' % (side, variant)]:
                count_word_number(tokens, counter)



load word dictionary starting!
load word dictionary ending!

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [53]:
savepath = '../assist'
max_vocab = 10000  # maximum number of words kept in each vocabulary


def _build_and_save_word2index(word_count, suffix):
    """Rank words by frequency, save the counts and the truncated word -> index map.

    Writes ``word_count_<suffix>.json`` (every (word, count) pair, most
    frequent first) and ``word2index_<suffix>.json`` (the ``max_vocab`` most
    frequent words numbered from 1) into ``savepath``.

    Parameters
    ----------
    word_count : dict
        word -> frequency.
    suffix : str
        Variant name used in the two output file names.

    Returns
    -------
    dict
        word -> index for the kept words; index 1 is the most frequent word.
    """
    print('\n--------------------------------------')
    # sorted() is stable, so equally frequent words keep their insertion order
    ranked = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    print('\t获得word2index并保存, 总词数为{}'.format(len(ranked)))
    with open(os.path.join(savepath, 'word_count_%s.json' % suffix), 'w') as fw:
        json.dump(ranked, fw)

    word2index = {word: index + 1 for index, (word, _) in enumerate(ranked[:max_vocab])}
    print('\t\t总词数为：%d' % len(word2index))
    with open(os.path.join(savepath, 'word2index_%s.json' % suffix), 'w') as fw:
        json.dump(word2index, fw)
    return word2index


word2index_token = _build_and_save_word2index(word_count_token, 'token')
word2index_lemma = _build_and_save_word2index(word_count_lemma, 'lemma')
word2index_lemma_pro = _build_and_save_word2index(word_count_lemma_pro, 'lemma_pro')


--------------------------------------
	获得word2index并保存, 总词数为66584
		总词数为：10000

--------------------------------------
	获得word2index并保存, 总词数为62735
		总词数为：10000

--------------------------------------
	获得word2index并保存, 总词数为41543
		总词数为：10000


In [54]:
# Preview the first two rows of '15dev.xml' again (the vocabulary cell above adds no columns).
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,Rrel_index,qTEXT_token,cTEXT_token,q_position_index,c_position_index,q_segment_index,qTEXT_lemma,cTEXT_lemma,qTEXT_lemma_pro,cTEXT_lemma_pro
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,1,"[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, its, ok, you, can...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, it, ok, you, can,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[if, you, are, single, then, it, ok, you, can,..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,0,"[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[depends, on, where, the, accommodation, is, ...."


In [55]:
def _make_index_encoder(word2index):
    """Return a function mapping a token list to its list of vocabulary indices.

    Tokens missing from ``word2index`` are encoded as ``len(word2index) + 1``.
    NOTE(review): the embedding matrices built in this notebook have
    ``len(word2index) + 1`` rows, i.e. no row for that out-of-vocabulary id —
    confirm the consumer accounts for it.
    """
    oov_index = len(word2index) + 1

    def encode(sent):
        return [word2index.get(token, oov_index) for token in sent]

    return encode


_index_encoders = (
    ('token', _make_index_encoder(word2index_token)),
    ('lemma', _make_index_encoder(word2index_lemma)),
    ('lemma_pro', _make_index_encoder(word2index_lemma_pro)),
)

for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    for variant, encode in _index_encoders:
        # Question column first, then comment column
        for side in ('q', 'c'):
            source_col = '%sTEXT_%s' % (side, variant)
            samples[source_col + '_index'] = samples[source_col].apply(encode)

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [56]:
# Preview the *_index columns that were just added.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,qTEXT_lemma,cTEXT_lemma,qTEXT_lemma_pro,cTEXT_lemma_pro,qTEXT_token_index,cTEXT_token_index,qTEXT_lemma_index,cTEXT_lemma_index,qTEXT_lemma_pro_index,cTEXT_lemma_pro_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,"[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, it, ok, you, can,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[if, you, are, single, then, it, ok, you, can,...","[42, 210, 4, 23, 13, 5, 58, 433, 151, 50, 8, 2...","[24, 11, 21, 426, 119, 97, 345, 11, 18, 550, 1]","[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[24, 11, 21, 429, 124, 13, 350, 11, 18, 572, 1]","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[27, 13, 23, 421, 125, 17, 308, 13, 20, 568, 1]"
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,"[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[depends, on, where, the, accommodation, is, ....","[42, 210, 4, 23, 13, 5, 58, 433, 151, 50, 8, 2...","[634, 30, 60, 3, 682, 5502, 48, 129, 110, 38, ...","[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[655, 30, 59, 3, 668, 5050, 48, 134, 111, 38, ...","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[658, 31, 63, 2, 662, 11, 1, 52, 139, 114, 42,..."


In [57]:
EMBEDDING_SEED = 0  # fixes the random vectors of words that have no pretrained embedding

dim = word_dic['word'].shape[0]
# A dedicated generator makes the saved matrices reproducible between runs
# without touching numpy's global random state.
_embedding_rng = np.random.RandomState(EMBEDDING_SEED)


def _build_and_save_embedding_matrix(word2index, filename):
    """Build the embedding matrix of one vocabulary and pickle it into ``savepath``.

    Row 0 is all zeros. The row of every word found in ``word_dic`` is that
    word's vector; the remaining rows keep their standard-normal
    initialisation. The row of the word 's_e_p' is set to ``word_dic['[SEP]']``.

    NOTE(review): the matrix has ``len(word2index) + 1`` rows, while the
    index-encoding cell maps out-of-vocabulary tokens to
    ``len(word2index) + 1`` — that id has no row here; confirm the consumer
    accounts for it.

    Parameters
    ----------
    word2index : dict
        word -> row index (indices start at 1).
    filename : str
        Name of the pickle file written into ``savepath``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word2index) + 1, dim)``.
    """
    print('\n--------------------------------------')
    print('\t作嵌入矩阵并保存\n')
    matrix = _embedding_rng.randn(len(word2index) + 1, dim)
    matrix[0] = np.zeros((dim, ), dtype='float32')
    for word, index in word2index.items():
        if word in word_dic:
            matrix[index] = word_dic[word]
        if word == "s_e_p":
            matrix[index] = word_dic["[SEP]"]
    with open(os.path.join(savepath, filename), 'wb') as fw:
        pkl.dump(matrix, fw)
    return matrix


embedding_matrix_token = _build_and_save_embedding_matrix(
    word2index_token, 'embedding_matrix_token.pkl')
embedding_matrix_lemma = _build_and_save_embedding_matrix(
    word2index_lemma, 'embedding_matrix_lemma.pkl')
embedding_matrix_lemma_pro = _build_and_save_embedding_matrix(
    word2index_lemma_pro, 'embedding_matrix_lemma_pro.pkl')


--------------------------------------
	作嵌入矩阵并保存


--------------------------------------
	作嵌入矩阵并保存


--------------------------------------
	作嵌入矩阵并保存



In [58]:
# Preview the first two rows of '15dev.xml' again (the embedding cell above adds no columns).
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,qTEXT_lemma,cTEXT_lemma,qTEXT_lemma_pro,cTEXT_lemma_pro,qTEXT_token_index,cTEXT_token_index,qTEXT_lemma_index,cTEXT_lemma_index,qTEXT_lemma_pro_index,cTEXT_lemma_pro_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,"[from, dubai, to, qatar, s_e_p, i, am, current...","[if, you, are, single, then, it, ok, you, can,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[if, you, are, single, then, it, ok, you, can,...","[42, 210, 4, 23, 13, 5, 58, 433, 151, 50, 8, 2...","[24, 11, 21, 426, 119, 97, 345, 11, 18, 550, 1]","[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[24, 11, 21, 429, 124, 13, 350, 11, 18, 572, 1]","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[27, 13, 23, 421, 125, 17, 308, 13, 20, 568, 1]"
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,"[from, dubai, to, qatar, s_e_p, i, am, current...","[depends, on, where, the, accommodation, is..,...","[from, dubai, to, qatar, s, _, e, _, p, i, am,...","[depends, on, where, the, accommodation, is, ....","[42, 210, 4, 23, 13, 5, 58, 433, 151, 50, 8, 2...","[634, 30, 60, 3, 682, 5502, 48, 129, 110, 38, ...","[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[655, 30, 59, 3, 668, 5050, 48, 134, 111, 38, ...","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[658, 31, 63, 2, 662, 11, 1, 52, 139, 114, 42,..."


In [59]:
def process_char(sent):
    """Encode a token list as character ids, padded/truncated to 20 ids per token.

    Both padding and truncation happen at the end of each token ('post').
    """
    return pad_sequences(char_tokenizer(sent), maxlen=20, padding='post', truncating='post')


for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    # The column order matters: char_tokenizer hands out ids to unseen
    # characters in the order in which it meets them.
    for variant in ('token', 'lemma', 'lemma_pro'):
        for side in ('q', 'c'):
            source_col = '%sTEXT_%s' % (side, variant)
            samples[source_col + '_char_index'] = samples[source_col].apply(process_char)
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [60]:
# Preview the *_char_index columns that were just added.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,qTEXT_lemma_index,cTEXT_lemma_index,qTEXT_lemma_pro_index,cTEXT_lemma_pro_index,qTEXT_token_char_index,cTEXT_token_char_index,qTEXT_lemma_char_index,cTEXT_lemma_char_index,qTEXT_lemma_pro_char_index,cTEXT_lemma_pro_char_index
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,"[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[24, 11, 21, 429, 124, 13, 350, 11, 18, 572, 1]","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[27, 13, 23, 421, 125, 17, 308, 13, 20, 568, 1]","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,"[42, 219, 4, 23, 14, 5, 57, 457, 156, 50, 8, 2...","[655, 30, 59, 3, 668, 5050, 48, 134, 111, 38, ...","[45, 221, 4, 26, 15, 6, 18, 6, 16, 3, 60, 458,...","[658, 31, 63, 2, 662, 11, 1, 52, 139, 114, 42,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,..."


In [61]:
# Persist the character vocabulary as JSON. By this point it also holds every
# extra character that char_tokenizer registered while encoding the texts.
print('\n--------------------------------------')
print('\tsave char2index')
print('\t\tthe number of charactor：%d' % len(char2index))
with open(os.path.join(savepath, 'char2index.json'), 'w') as fw:
    json.dump(char2index, fw)


--------------------------------------
	save char2index
		the number of charactor：334


In [62]:
# (new length column, token-list column it is measured on)
_length_columns = (
    ('qTEXT_len', 'qTEXT_token'),
    ('cTEXT_len', 'cTEXT_token'),
    ('qTEXT_pro_len', 'qTEXT_lemma_pro'),
    ('cTEXT_pro_len', 'cTEXT_lemma_pro'),
)

for key in keys_lists:
    print(key)
    samples = all_samples_concate[key]
    for length_col, token_col in _length_columns:
        samples[length_col] = samples[token_col].apply(len)
    

15dev.xml
15test.xml
15train.xml
16dev.xml
16test.xml
16train1.xml
16train2.xml
17test.xml


In [63]:
# Preview the length columns that were just added.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,qTEXT_token_char_index,cTEXT_token_char_index,qTEXT_lemma_char_index,cTEXT_lemma_char_index,qTEXT_lemma_pro_char_index,cTEXT_lemma_pro_char_index,qTEXT_len,cTEXT_len,qTEXT_pro_len,cTEXT_pro_len
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,"[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",52,11,60,11
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,"[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...",52,61,60,64


In [64]:
# Percentiles reported in addition to describe()'s defaults
_length_percentiles = [0.8, 0.9, 0.95, 0.98]
# (heading printed before the statistics, column described)
_length_reports = (
    ('------  qTEXT length -------', 'qTEXT_len'),
    ('------  cTEXT length -------', 'cTEXT_len'),
    ('------  qTEXT lemma pro length -------', 'qTEXT_pro_len'),
    ('------  cTEXT lemma pro length -------', 'cTEXT_pro_len'),
)

for key in keys_lists:
    print('\n\n', key)
    samples = all_samples_concate[key]
    for heading, length_col in _length_reports:
        print(heading)
        print(samples[length_col].describe(_length_percentiles))



 15dev.xml
------  qTEXT length -------
count    1529.000000
mean       46.712230
std        21.728775
min         9.000000
50%        45.000000
80%        68.000000
90%        79.000000
95%        84.000000
98%        91.000000
max       108.000000
Name: qTEXT_len, dtype: float64
------  cTEXT length -------
count    1529.000000
mean       34.386527
std        48.396544
min         1.000000
50%        23.000000
80%        50.000000
90%        75.000000
95%        96.000000
98%       130.000000
max      1346.000000
Name: cTEXT_len, dtype: float64
------  qTEXT lemma pro length -------
count    1529.000000
mean       51.325049
std        22.003680
min        13.000000
50%        49.000000
80%        73.000000
90%        84.200000
95%        90.000000
98%        96.000000
max       118.000000
Name: qTEXT_pro_len, dtype: float64
------  cTEXT lemma pro length -------
count    1529.000000
mean       33.523872
std        48.462712
min         1.000000
50%        22.000000
80%        49.00

In [65]:
# Pickle the whole dict of per-file DataFrames (with every column added above)
# to ../data/dataset.pkl.
with open(os.path.join('../data', 'dataset.pkl'), 'wb') as fw:
    pkl.dump(all_samples_concate, fw)

In [66]:
# Final preview of the first two rows of '15dev.xml' after saving.
all_samples_concate['15dev.xml'].head(2)

Unnamed: 0,q_id,c_id,q_category,q_subject,q_body,cTEXT,Relevance,q_userid,c_userid,qc_mark,...,qTEXT_token_char_index,cTEXT_token_char_index,qTEXT_lemma_char_index,cTEXT_lemma_char_index,qTEXT_lemma_pro_char_index,cTEXT_lemma_pro_char_index,qTEXT_len,cTEXT_len,qTEXT_pro_len,cTEXT_pro_len
0,Q2481,Q2481_C1,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,If you are single then its ok you can enjoy.,PotentiallyUseful,U8902,U7263,0,...,"[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",52,11,60,11
1,Q2481,Q2481_C2,Life in Qatar,from DUBAI to QATAR,i am currently working here in dubai and i got...,depends on where the accommodation is.. how ma...,Good,U8902,U604,0,...,"[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...","[[6, 18, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4, 5, 16, 5, 14, 4, 19, 0, 0, 0, 0, 0, 0, 0,...",52,61,60,64
