<a href="https://colab.research.google.com/github/anqi-guo/duplicated_complaints_identification/blob/main/3_process_labeled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import os

# Third-party
import pandas as pd
from google.colab import drive

# Mount Google Drive so its files are reachable under /content/drive.
drive.mount('/content/drive')

# Root folder of the data files read and written by this notebook.
data_path = '/content/drive/My Drive/work/重复线索识别/data'

# Read data -> option 1 (build the dataset from the individual labeled Excel files)

In [19]:
# Manually labeled samples: every workbook found in labeled_data/xlsx.
xlsx_dir = f'{data_path}/labeled_data/xlsx'
df_list = []
# sorted(): os.listdir() returns entries in arbitrary order, and the later
# de-duplication keeps the first occurrence, so row order must be deterministic.
for file in sorted(os.listdir(xlsx_dir)):
  file_path = f'{xlsx_dir}/{file}'
  # Skip Office lock files ("~$..."), hidden entries and sub-folders, which
  # pd.read_excel cannot open.
  if file.startswith(('~$', '.')) or not os.path.isfile(file_path):
    continue
  dff = pd.read_excel(file_path)
  dff.columns = ['label', 'sentence1', 'sentence2']
  df_list.append(dff)

# Samples the model predicted incorrectly (first three columns, no header row).
df_ = pd.read_excel(f'{data_path}/model_outputs/similar_sentence_pairs.xlsx', usecols=[0,1,2], header=None)
df_.columns = ['label', 'sentence1', 'sentence2']
# Keep only the rows whose label is 0 or 1.
df_ = df_[df_['label'].isin([0,1])]
df_list.append(df_)

# Merge everything; ignore_index gives the combined frame unique row labels.
df = pd.concat(df_list, ignore_index=True)

# Drop pairs in which either sentence contains one of these phrases
# (roughly: "earlier work order", "original work order", "existing work order",
# "previous order number").
# na=False: missing / non-string cells count as "no match" instead of producing NaN
# in the boolean mask.
ref_pattern = '前期工单|原工单|已有工单|前单号'
mentions_other_order = (
  df['sentence1'].str.contains(ref_pattern, na=False)
  | df['sentence2'].str.contains(ref_pattern, na=False)
)
# reset_index() returns a fresh frame, so later column assignments do not act on a slice.
df = df[~mentions_other_order].reset_index(drop=True)

df.shape

(11377, 3)

# Read data -> option 2 (alternative to option 1: start from the previously saved final_data.xlsx)

In [None]:
# Previously prepared dataset.
df = pd.read_excel(f'{data_path}/labeled_data/final_data.xlsx')

# Samples the model predicted incorrectly (first three columns, no header row).
df_ = pd.read_excel(f'{data_path}/model_outputs/similar_sentence_pairs.xlsx', usecols=[0,1,2], header=None)
df_.columns = ['label', 'sentence1', 'sentence2']
# Keep only the rows whose label is 0 or 1.
df_ = df_[df_['label'].isin([0,1])]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. ignore_index gives the combined frame unique row labels.
df = pd.concat([df, df_], ignore_index=True)

# Delete duplicate sentence pairs

In [16]:
# Remove duplicated sentence pairs
def sort_list(sent1, sent2):
  """Build an order-insensitive key for a pair of sentences.

  The characters of both sentences are pooled, sorted and joined back into a
  single string, so (a, b) and (b, a) yield the same key.
  """
  return ''.join(sorted([*sent1, *sent2]))

# read_excel yields NaN for empty cells and numbers for purely numeric cells; neither
# can be iterated character by character, so normalise both sentence columns to str first.
df = df.assign(
  sentence1=df['sentence1'].fillna('').astype(str),
  sentence2=df['sentence2'].fillna('').astype(str),
)

# One key per row, built positionally (also works on an empty frame, where
# DataFrame.apply(axis=1) would return a DataFrame instead of a Series).
df = df.assign(
  word_list=[sort_list(s1, s2) for s1, s2 in zip(df['sentence1'], df['sentence2'])]
)

# Keep the first row of every group of pairs that share the same key.
df = df.drop_duplicates(subset=['word_list'], keep='first')

df.shape

(10995, 4)

# Extract the key sentence (the problem description)

In [20]:
import regex as re
import jieba
from tqdm.auto import tqdm
tqdm.pandas()

# Stopword sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path):
  """Return the stopwords listed in `path` (one per line) as a frozenset, reading the file only once."""
  if path not in _STOPWORDS_CACHE:
    # Explicit UTF-8: the list is Chinese text and the platform default encoding
    # is not guaranteed to match.
    with open(path, encoding='utf-8') as f:
      _STOPWORDS_CACHE[path] = frozenset(f.read().splitlines())
  return _STOPWORDS_CACHE[path]


def extract_problem(text):
  """Reduce a raw complaint text to its cleaned problem description.

  Steps:
    1. Strip punctuation, whitespace, the "信息保密"/"信息不保密" markers, digit runs
       (with an optional trailing m/米/年/月/日) and caller phrases such as
       "X先生(来电)反映" / "X女士(来电)反映" / "市民(来电)反映".
    2. If both "问题描述" and "诉求目的" are present, keep only the text between the
       first "问题描述" and the last "诉求目的"; if they appear in the wrong order the
       text is left as it is.
    3. Segment with jieba and drop every token found in the stopword list.

  Args:
    text: the raw text. Missing values (None/NaN) are treated as an empty string and
      other non-string values are converted with str().

  Returns:
    The remaining tokens joined without separators (possibly an empty string).
  """
  if not isinstance(text, str):
    text = '' if pd.isna(text) else str(text)

  # Remove punctuation. \p{P} needs the third-party `regex` module, which this
  # notebook imports under the name `re`.
  text = re.sub(r'\p{P}+', '', text)
  # Remove whitespace, newlines, confidentiality markers, distances / dates / digits
  # and the complainant phrases.
  text = re.sub(r'(\s|\n|信息[不]?保密|\d+[m米年月日]?|(.{1}先生|.{1}女士|市民)(来电)?反映)', '', text)

  if '问题描述' in text and '诉求目的' in text:
    # Both markers being present does not guarantee their order, so the search can fail.
    match = re.search(r'(?<=问题描述).*(?=诉求目的)', text)
    if match:
      text = match.group()

  # Remove stopwords (the list is read once and kept as a set for fast lookups).
  stopwords = _load_stopwords(f'{data_path}/stopwords/cn_stopwords.txt')
  return ''.join(s for s in jieba.cut(text) if s not in stopwords)

# Clean both sentences of every pair, showing a progress bar for each column.
for column in ('sentence1', 'sentence2'):
  df[column] = df[column].progress_apply(extract_problem)

# save file

In [21]:
# Write the processed frame (all of its columns, without the row index) to the
# workbook that the "option 2" loading cell reads back.
df.to_excel(
  excel_writer=f'{data_path}/labeled_data/final_data.xlsx',
  index=False,
)