In [1]:
import pandas as pd
import pycorrector as correct
import random
import jieba
import os
from pyltp import Segmentor
from random import shuffle

In [2]:
# Read the tab-separated query and reply tables; the files have no header row,
# so the column names are assigned right after loading.
train_query = pd.read_csv('./train/train.query.tsv', sep='\t', header=None)
train_query.columns = ['id', 'query']

train_reply = pd.read_csv('./train/train.reply.tsv', sep='\t', header=None)
train_reply.columns = ['id', 'idx', 'reply', 'label']

# Left-join the replies onto the queries by `id`, then replace any missing
# reply text with the placeholder '好的'.
train_data = (
    train_query
    .merge(train_reply, how='left', on='id')
    .assign(reply=lambda frame: frame['reply'].fillna('好的'))
)

In [3]:
# Display the merged query/reply training table.
train_data

Unnamed: 0,id,query,idx,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,采荷一小是分校吧,1,是的,0
2,0,采荷一小是分校吧,2,这是5楼,0
3,1,毛坯吗？,0,因为公积金贷款贷的少,0
4,1,毛坯吗？,1,是呢,0
...,...,...,...,...,...
21580,5998,您好，我正在看尚林家园的房子,1,有啊,0
21581,5998,您好，我正在看尚林家园的房子,2,我带你看看,0
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1
21583,5999,今天可以安排看房子吗？,1,可以看，你几点有时间过来呢？,1


In [4]:
# Keep only the rows whose label equals 1 (the positive query/reply pairs).
pos_data=train_data[train_data['label'].isin([1])]

In [5]:
# Display the positive-only subset.
pos_data

Unnamed: 0,id,query,idx,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
9,2,你们的佣金费大约是多少和契税是多少。,1,所有费用下来654万,1
10,2,你们的佣金费大约是多少和契税是多少。,2,包含着税费和我们的服务费和房款,1
14,3,靠近川沙路嘛？,1,有一点靠近川沙路,1
16,4,这套房源价格还有优惠空间吗？,0,有,1
...,...,...,...,...,...
21567,5994,价格能谈多少,1,满五年唯一住房，能谈,1
21572,5996,对口什么小学,0,应该是长丰小学,1
21575,5997,可以贷公积金吗,1,可以的,1
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1


In [None]:
# count=0
# for index,row in pos_data.iterrows():
#     correct_query, detail=correct.correct(row['query'])
#     correct_reply, detail=correct.correct(row['reply'])
#     if correct_query!=row['query'] or correct_reply!=row['reply']:
# #         print('oriquery:{}  orireply:{}'.format(row['query'],row['reply']))
# #         print('correctquery:{}  correctreply:{}'.format(correct_query,correct_reply))
#         count+=1
# print(count)

In [6]:
class NearFormReplacer:
    """Typo-style augmenter: swaps one character for a similar-looking one.

    The near-form table is a list of groups; each group is a list of
    characters that are considered interchangeable look-alikes.
    """

    def __init__(self, NearForm_file_path):
        """Load the near-form table from ``NearForm_file_path``."""
        self.NearForm = self.load_NearForm(NearForm_file_path)

    def segment(self, sentence):
        """Split a sentence into a list of single characters."""
        return list(sentence)

    def load_NearForm(self, file_path):
        """
        Load the near-form character table.

        :param file_path: path of a UTF-8 text file with one group per line,
            entries separated by a single space
        :return: the table as ``[[xx, xx], [xx, xx], ...]``
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip().split(' ') for line in file]

    def get_nearform_sents_list(self, input_sentence):
        """
        Produce a copy of the sentence with one randomly chosen character
        replaced by a near-form character.

        :param input_sentence: non-empty sentence to corrupt
        :return: a one-element list holding the modified sentence, or the
            original sentence when the chosen character has no replacement
        :raises AssertionError: if ``input_sentence`` is empty
        """
        assert len(input_sentence) > 0, "Length of sentence must greater than 0."
        chars = self.segment(input_sentence)

        # Pick a single random position to turn into a "typo".
        position = random.randrange(len(chars))
        target = chars[position]

        for group in self.NearForm:
            if target not in group:
                continue
            # Build the candidate list without touching the shared table:
            # removing the character from `group` in place would permanently
            # shrink the table for every later call on this instance.
            # Empty entries (from doubled spaces in the file) are skipped.
            candidates = [char for char in group if char and char != target]
            if candidates:
                chars[position] = random.choice(candidates)
            # A character may occur in several groups; only the first is used.
            return [''.join(chars)]

        # The chosen character is not in any group: return the input unchanged.
        return [input_sentence]


class SynonymsReplacer:
    """Synonym-replacement augmenter backed by a pyltp word segmentor.

    The synonym table is a list of groups; each group is a list of words that
    are treated as interchangeable.
    """

    def __init__(self, synonyms_file_path, cws_model_path):
        """Load the synonym table and the pyltp segmentation model."""
        self.synonyms = self.load_synonyms(synonyms_file_path)
        self.segmentor = self.load_segmentor(cws_model_path)

    def __del__(self):
        """Release the pyltp segmentation model when the object is destroyed."""
        # `segmentor` is missing if __init__ failed before assigning it
        # (for example when the synonym file could not be opened).
        segmentor = getattr(self, 'segmentor', None)
        if segmentor is not None:
            segmentor.release()

    def load_segmentor(self, cws_model_path):
        """
        Load the ltp word-segmentation model.

        :param cws_model_path: path of the segmentation model
        :return: the loaded segmentor object
        """
        segmentor = Segmentor()
        segmentor.load(cws_model_path)
        return segmentor

    def segment(self, sentence):
        """Segment a str sentence with pyltp and return the words as a list."""
        return list(self.segmentor.segment(sentence))

    def load_synonyms(self, file_path):
        """
        Load the synonym table.

        :param file_path: path of a UTF-8 text file with one group per line,
            entries separated by a single space
        :return: the table as ``[[xx, xx], [xx, xx], ...]``
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip().split(' ') for line in file]

    def get_syno_sents_list(self, input_sentence):
        """
        Produce a copy of the sentence with one randomly chosen word replaced
        by a synonym.

        :param input_sentence: non-empty sentence to rewrite
        :return: the rewritten sentence as a single string (the segmented
            words joined back together); the text is unchanged when the
            chosen word has no synonym
        :raises AssertionError: if ``input_sentence`` is empty
        """
        assert len(input_sentence) > 0, "Length of sentence must greater than 0."
        words = self.segment(input_sentence)
        if not words:
            # Nothing was segmented, so there is no word to replace
            # (random.randrange would raise on an empty range).
            return input_sentence

        # Pick a single random word to replace.
        position = random.randrange(len(words))
        target = words[position]

        for group in self.synonyms:
            if target not in group:
                continue
            # Build the candidate list without touching the shared table:
            # removing the word from `group` in place would permanently
            # shrink the table for every later call on this instance.
            # Empty entries (from doubled spaces in the file) are skipped.
            candidates = [word for word in group if word and word != target]
            if candidates:
                words[position] = random.choice(candidates)
                break
            # A group holding only the word itself offers no replacement;
            # keep looking in the remaining groups.
        return ''.join(words)

In [7]:
# Data augmentation
class NlpEda:
    """Facade that applies synonym or near-form (typo) augmentation to text.

    NOTE(review): a fresh replacer is constructed on every call, so the
    resource files (and the segmentation model) are reloaded each time.
    Reusing one replacer would be faster, but is only safe if the replacer
    leaves its loaded table unmodified between calls.
    """

    def __init__(self, ltp_data_dir='./ltp_data_v3.4.0/'):
        """
        :param ltp_data_dir: directory holding `cws.model`, `cilin_ex_nonum`
            and `newShape.txt`; defaults to the previously hard-coded path
        """
        self.ltp_data_dir = ltp_data_dir

    def synonyms(self, segment):
        """Return `segment` with one word replaced by a synonym (a str)."""
        replacer = SynonymsReplacer(
            synonyms_file_path=os.path.join(self.ltp_data_dir, 'cilin_ex_nonum'),  # synonym table
            cws_model_path=os.path.join(self.ltp_data_dir, 'cws.model'),  # segmentation model
        )
        return replacer.get_syno_sents_list(segment)

    def nearform(self, segment):
        """Return a list with `segment` after one near-form character swap."""
        replacer = NearFormReplacer(
            NearForm_file_path=os.path.join(self.ltp_data_dir, 'newShape.txt'),  # near-form table
        )
        return replacer.get_nearform_sents_list(segment)

    # Batch helpers
    def synonyms_eda_list(self, seg_list):
        """Apply `synonyms` to every item; returns one string per input."""
        return [self.synonyms(seg) for seg in seg_list]

    def nearform_eda_list(self, seg_list):
        """Apply `nearform` to every item and flatten the returned lists."""
        ret_list = []
        for seg in seg_list:
            ret_list.extend(self.nearform(seg))
        return ret_list

In [13]:
# Display the combined (augmented + original) table.
# NOTE(review): this cell (execution count 13) is placed before the cell that
# assigns `final_data` (execution count 12), so running the visible cells
# top to bottom on a fresh kernel would fail here with NameError.
final_data

Unnamed: 0,id,query,idx,reply,label
0,0,采荷一小是分校吧,0,杭州市采荷第一完小钱江苑校区，杭州市钱江新城实验学校。,1
1,2,你们的佣金费大约是多少和契税是多少。,1,所有费用下束654万,1
2,2,你们的佣金费大约是多少和数契税是多少。,2,带有着税费和我们的服务费和房款,1
3,3,靠近川沙路嘛？,1,有一点靠近川沙路,1
4,4,这套房源价钱还有优惠空间吗？,0,部分,1
...,...,...,...,...,...
21580,5998,您好，我正在看尚林家园的房子,1,有啊,0
21581,5998,您好，我正在看尚林家园的房子,2,我带你看看,0
21582,5999,今天可以安排看房子吗？,0,我约下房东，稍后回你,1
21583,5999,今天可以安排看房子吗？,1,可以看，你几点有时间过来呢？,1


In [12]:
# Stack the augmented rows on top of the original training rows.
# NOTE(review): `df_enhance` is assigned in a cell that appears further down
# (execution count 9), so this cell depends on out-of-order execution.
final_data=pd.concat([df_enhance,train_data],axis=0)

In [14]:
# Write the combined table to ./train/train.csv without the index column.
final_data.to_csv('./train/train.csv',index=False)

In [8]:
# Create the augmentation helper used by the cells below.
eda = NlpEda()

In [9]:

def _augment_text(text):
    """Return `text` rewritten with synonyms, with a near-form typo, or
    unchanged; each of the three outcomes is drawn with equal probability."""
    mode = random.randint(1, 3)
    if mode == 1:
        return eda.synonyms(text)
    if mode == 2:
        # `nearform` returns a list; its first item is the rewritten text.
        return eda.nearform(text)[0]
    return text


# Build one augmented copy of every positive row. Only the variant that is
# actually selected gets computed, and the rows are collected in a list and
# turned into a frame once: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic.
enhanced_records = []
for _, row in pos_data.iterrows():
    enhanced_records.append({
        'id': row['id'],
        'query': _augment_text(row['query']),  # query is drawn first
        'idx': row['idx'],
        'reply': _augment_text(row['reply']),  # then the reply, independently
        'label': 1,
    })

df_enhance = pd.DataFrame(enhanced_records, columns=['id','query','idx','reply','label'])


In [None]:
# pos_query=pos_data['query'].to_list()
# pos_reply=pos_data['reply'].to_list()

In [None]:
# eda_pos_query=eda.synonyms_eda_list(pos_query)

In [None]:
# eda_pos_reply=eda.synonyms_eda_list(pos_reply)

In [None]:
# eda_pos_query_nearform=eda.nearform_eda_list(pos_query) 

In [None]:
# eda_pos_reply_nearform=eda.nearform_eda_list(pos_reply)

In [None]:
# len(eda_pos_reply_nearform) 

In [None]:
# df_enhance = pd.DataFrame(columns=['query','reply','label'])
# for i in range(5043):
#     #首先生成query
#     mode=random.randint(1, 3)
#     if mode==1:
#         query=eda_pos_query[i]
#     elif mode==2:
#         query=eda_pos_query_nearform[i]
#     else:
#         query=pos_query[i]
#     #这里生成reply
#     mode=random.randint(1, 3)
#     if mode==1:
#         reply=eda_pos_reply[i]
#     elif mode==2:
#         reply=eda_pos_reply_nearform[i]
#     else:
#         reply=pos_reply[i]
        
#     df_enhance=df_enhance.append({'query':query,'reply':reply,'label':1},ignore_index=True)
        
    

In [None]:
# Display the augmented rows.
df_enhance

In [None]:
# Stack the augmented rows on top of the original training rows.
final_data=pd.concat([df_enhance,train_data],axis=0)

In [None]:
# Shuffle all rows (sample with frac=1) and renumber the index from 0.
# NOTE(review): no random_state is passed, so the row order presumably differs
# between runs unless a global seed is set elsewhere.
final_data=final_data.sample(frac=1).reset_index(drop=True)

In [None]:
# Display the shuffled combined table.
final_data

In [None]:
# Write the shuffled table to ./train/train.csv without the index column.
final_data.to_csv('./train/train.csv',index=False)

In [None]:
# Scratch cell: an empty frame with three named columns.
# NOTE(review): this rebinds `df_enhance`, replacing the augmented rows built
# earlier with an empty frame that lacks the `id` and `idx` columns.
df_enhance = pd.DataFrame(columns=['query','reply','label'])

In [None]:
# Display the scratch frame.
df_enhance

In [None]:
# Add one row to the frame. DataFrame.append was removed in pandas 2.0, so the
# row is wrapped in a one-row frame and concatenated instead.
df_enhance = pd.concat(
    [df_enhance, pd.DataFrame([{'query': 1, 'reply': 1, 'label': 1}])],
    ignore_index=True,
)

In [None]:
# Display the scratch frame after the row was added.
df_enhance

In [None]:
# Scratch experiment: add a dict as a single row to an empty frame.
# DataFrame.append was removed in pandas 2.0, so the dict is wrapped in a
# one-row frame and concatenated instead.
data = pd.DataFrame()
a = {"x":1,"y":2}
data = pd.concat([data, pd.DataFrame([a])], ignore_index=True)
print(data)

In [None]:
# Scratch value: rebinds `a` (a dict in the cell above) to a string.
a='sdf'


In [None]:
# Display a one-element list wrapping `a`.
[a]