<a href="https://colab.research.google.com/github/anqi-guo/duplicated_complaints_identification/blob/main/2_preprocess_unlabeled_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the project's data folders are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Project data directories on the mounted Drive; later cells read the raw export,
# the stopword list and the housing-estate JSON files from these folders and
# write their results to `unlabeled_data_path`.
raw_data_path = '/content/drive/My Drive/work/重复线索识别/data/raw_data'
stopwords_path = '/content/drive/My Drive/work/重复线索识别/data/stopwords'
unlabeled_data_path = '/content/drive/My Drive/work/重复线索识别/data/unlabeled_data'

import pandas as pd
# Load the raw export; the trailing expression displays its (rows, columns) shape.
orig_df = pd.read_csv(f'{raw_data_path}/raw_data.csv')
orig_df.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  exec(code_obj, self.user_global_ns, self.user_ns)


(80101, 16)

In [None]:
# Drop complaints whose text refers back to an earlier ticket (previous /
# original / existing ticket, or a previous ticket number) and keep only the
# columns used by the rest of the notebook.
PRIOR_TICKET_PATTERN = '前期工单|原工单|已有工单|前单号'

# na=False keeps the mask strictly boolean: without it a missing XSNR yields NaN
# in the mask and the `~` negation raises. Rows without any text are excluded
# explicitly because every later step needs a string to work on.
mentions_prior_ticket = orig_df['XSNR'].str.contains(PRIOR_TICKET_PATTERN, na=False)
has_text = orig_df['XSNR'].notna()

df = orig_df.loc[has_text & ~mentions_prior_ticket, ['BH','XSBH','WRLX','DSMC','XZQH','XSNR']]
df.shape

(71849, 6)

In [None]:
# Install the NLP dependencies used below: HanLP (tokenizer + NER models) and
# sentence-transformers (sentence embeddings / paraphrase mining).
# NOTE(review): hanlp and sentence_transformers are not version-pinned, so a
# fresh runtime may pull different releases than the ones this notebook ran with.
!pip install hanlp[full]
!pip install sentence_transformers
# NOTE(review): folium and imgaug are not imported anywhere in the visible
# notebook; presumably these pins work around a dependency conflict in the
# Colab image - confirm whether they are still needed.
!pip install folium==0.2.1
!pip install imgaug==0.2.5

Collecting folium==0.2.1
  Downloading folium-0.2.1.tar.gz (69 kB)
[K     |████████████████████████████████| 69 kB 3.5 MB/s 
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25l[?25hdone
  Created wheel for folium: filename=folium-0.2.1-py3-none-any.whl size=79808 sha256=dea0c5990716740bf0090e2d4415b3f91e2be9d20e795b1e637f3981918cb6f7
  Stored in directory: /root/.cache/pip/wheels/9a/f0/3a/3f79a6914ff5affaf50cabad60c9f4d565283283c97f0bdccf
Successfully built folium
Installing collected packages: folium
  Attempting uninstall: folium
    Found existing installation: folium 0.8.3
    Uninstalling folium-0.8.3:
      Successfully uninstalled folium-0.8.3
Successfully installed folium-0.2.1
Collecting imgaug==0.2.5
  Downloading imgaug-0.2.5.tar.gz (562 kB)
[K     |████████████████████████████████| 562 kB 5.4 MB/s 
Building wheels for collected packages: imgaug
  Building wheel for imgaug (setup.py) ... [?25l[?25hdone
  Created wheel for img

In [None]:
import hanlp
# Pretrained HanLP components shared by extract_location():
#   tok - tokenizer loaded from the COARSE_ELECTRA_SMALL_ZH identifier
#   ner - named-entity recogniser loaded from the MSRA_NER_ELECTRA_SMALL_ZH identifier
# Both are loaded once here and reused as module-level callables.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
ner = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)

import regex as re
import difflib
from itertools import product
import json
import jieba

def extract_location(sentence):
    """Return the place-like named entities mentioned in ``sentence``.

    The sentence is tokenized with ``tok`` and tagged with ``ner``. Entities
    labelled ORGANIZATION or LOCATION are kept, except those whose text ends in
    one of the administrative suffixes 市 / 镇 / 区 / 县 (city, town, district,
    county).

    Returns a list of entity strings in the order the tagger produced them.
    """
    wanted_labels = ('ORGANIZATION', 'LOCATION')
    locations = []
    for entity in ner(tok(sentence)):
        # Each entity is indexed as (text, label, ...).
        if entity[1] not in wanted_labels:
            continue
        name = entity[0]
        # Skip bare administrative-division names.
        if re.search('[市镇区县]$', name):
            continue
        locations.append(name)

    return locations

# Stopword sets keyed by file path, so the file is read once per path instead
# of once per complaint.
_STOPWORDS_CACHE = {}

def _load_stopwords(path):
  """Return the stopwords stored one per line in ``path`` as a cached frozenset."""
  if path not in _STOPWORDS_CACHE:
    # Explicit encoding so the result does not depend on the runtime's locale.
    with open(path, encoding='utf-8') as f:
      _STOPWORDS_CACHE[path] = frozenset(f.read().splitlines())
  return _STOPWORDS_CACHE[path]

def extract_problem(text):
  """Reduce a raw complaint text to its core problem description.

  Steps:
    1. remove all punctuation;
    2. remove whitespace, the "信息保密" / "信息不保密" confidentiality markers,
       numbers (with an optional m/米/年/月/日 unit) and the complainant lead-in
       ("X先生" / "X女士" / "市民", optionally followed by "来电", then "反映");
    3. if both the "问题描述" and "诉求目的" markers are present, keep only the
       text between them;
    4. segment with jieba and drop stopwords listed in
       ``{stopwords_path}/cn_stopwords.txt``.

  Args:
    text: raw complaint text (str).

  Returns:
    The cleaned text with the remaining segments concatenated without separators.
  """
  # Remove punctuation (\p{P} relies on the `regex` module imported as `re`).
  text = re.sub(r'\p{P}+', '', text)
  # Remove whitespace, confidentiality markers, distances/dates/numbers and the
  # complainant lead-in.
  text = re.sub(r'(\s|\n|信息[不]?保密|\d+[m米年月日]?|(.{1}先生|.{1}女士|市民)(来电)?反映)', '', text)

  if '问题描述' in text and '诉求目的' in text:
    described = re.search(r'(?<=问题描述).*(?=诉求目的)', text)
    # Both markers can be present in the opposite order, in which case there is
    # no match; keep the whole text instead of failing on None.
    if described is not None:
      text = described.group()

  # Remove stopwords.
  stopwords = _load_stopwords(f'{stopwords_path}/cn_stopwords.txt')
  return ''.join(s for s in jieba.cut(text) if s not in stopwords)

# Merged housing-estate tables keyed by data directory, so the two JSON files
# are parsed once per directory instead of once per complaint.
_LOUPAN_CACHE = {}

def _load_loupan(data_dir):
  """Return the cached merge of 新楼盘.json and 旧楼盘.json found in ``data_dir``."""
  if data_dir not in _LOUPAN_CACHE:
    with open(f'{data_dir}/新楼盘.json', encoding='utf-8') as json_file:
      loupan = json.load(json_file)
    with open(f'{data_dir}/旧楼盘.json', encoding='utf-8') as json_file:
      # Entries from the old-estate file override same-named new-estate entries.
      loupan.update(json.load(json_file))
    _LOUPAN_CACHE[data_dir] = loupan
  return _LOUPAN_CACHE[data_dir]

def extract_loupan(text):
  """Find the known housing estates (loupan) whose name occurs in ``text``.

  The estate table is loaded from ``raw_data_path``; each key is matched as a
  plain substring of ``text``.
  (Presumably the JSON maps estate name -> address; verify against the files.)

  Args:
    text: text to search (str).

  Returns:
    ``(loupan_list, address_list)``: the matching estate names and, at the same
    positions, their table values. The values are shared with the cache, so
    callers should treat them as read-only.
  """
  loupan_list = []
  address_list = []

  for lp, addr in _load_loupan(raw_data_path).items():
    if lp in text:
      loupan_list.append(lp)
      address_list.append(addr)

  return loupan_list, address_list

def get_similar_ratio(list_1, list_2):
    """Count the cross pairs of the two lists that are near-identical strings.

    Every element of ``list_1`` is compared with every element of ``list_2``
    using ``difflib.SequenceMatcher``; a pair counts when its ratio is strictly
    greater than 0.8.

    Returns the number of such pairs as an int (0 when either list is empty).
    """
    matches = 0
    for left, right in product(list_1, list_2):
        if difflib.SequenceMatcher(None, left, right).ratio() > .8:
            matches += 1
    return matches

def find_similar_pairs(row1, row2):
    """Decide whether two complaint rows are a candidate duplicate pair.

    Rows with identical ``XSNR`` text are never reported. Otherwise the pair is
    a candidate when any of the following holds (checked in this order, stopping
    at the first hit):
      * the rows share at least one housing estate (``loupan``);
      * they share more than 2 identical ``location`` entries;
      * more than 3 cross pairs of their ``location`` entries are similar;
      * more than 2 cross pairs of row1's ``location`` and row2's
        ``loupan_address`` entries are similar.

    Returns ``[row1['problem'], row2['problem']]`` for a candidate pair,
    otherwise ``None``.
    """
    # Identical complaint text: not reported as a candidate.
    if row1['XSNR'] == row2['XSNR']:
        return None

    # Wrapped in lambdas so later (more expensive) checks only run when the
    # earlier ones fail.
    criteria = (
        lambda: len(set(row1['loupan']) & set(row2['loupan'])) > 0,
        lambda: len(set(row1['location']) & set(row2['location'])) > 2,
        lambda: get_similar_ratio(row1['location'], row2['location']) > 3,
        lambda: get_similar_ratio(row1['location'], row2['loupan_address']) > 2,
    )
    if any(criterion() for criterion in criteria):
        return [row1['problem'], row2['problem']]
    return None



In [None]:
from collections import OrderedDict

# Number of rows per XZQH value. A missing (NaN) district never equals itself,
# so it is recorded with a count of 0.
region_sizes = {region: len(df[df['XZQH'] == region]) for region in df['XZQH'].unique()}

# Order from the smallest group to the largest; the NaN entry (count 0) therefore
# comes first, which the pair-mining cell relies on when it skips entry 0.
xzqh_cnt = OrderedDict(sorted(region_sizes.items(), key=lambda item: item[1]))
xzqh_cnt

OrderedDict([(nan, 0),
             ('宝应湖农场', 5),
             ('清江浦区淮安现代商务集聚区管委会', 18),
             ('金港路街道', 20),
             ('清江浦区清河经济开发区管委会', 23),
             ('金湖经济开发区', 29),
             ('岔庙镇', 33),
             ('老子山镇', 34),
             ('大东镇', 37),
             ('高铁商务区街道', 39),
             ('南集镇', 40),
             ('唐集镇', 43),
             ('金南镇', 48),
             ('前锋镇', 49),
             ('石湖镇', 49),
             ('清江浦区工业园', 53),
             ('黄营镇', 53),
             ('陈师街道', 53),
             ('吕良镇', 55),
             ('成集镇', 56),
             ('东胡集镇', 56),
             ('戴楼街道', 58),
             ('淮洪路街道', 62),
             ('流均镇', 64),
             ('洪泽经济开发区', 67),
             ('三河镇', 68),
             ('宁连路街道', 69),
             ('金北街道', 73),
             ('梁岔镇', 75),
             ('桂五镇', 77),
             ('张码街道', 77),
             ('岔河镇', 78),
             ('五港镇', 80),
             ('保滩街道', 80),
             ('顺河镇', 91),
             ('范集镇', 92),
             

In [None]:
# Preview the first rows of the filtered frame (columns BH, XSBH, WRLX, DSMC, XZQH, XSNR).
df.head()

Unnamed: 0,BH,XSBH,WRLX,DSMC,XZQH,XSNR
0,004b0f4e1ca7a031db791fb17f30af77,201907030015,,淮安市,淮阴区,淮阴区丁集镇镇北村8支桥下二大沟属区级管护沟，且从上游口流下大量的生活垃圾及水花生腾堵在桥下...
1,005042ac1dc448d43b4189b0bc6ca250,201908090037,大气污染,淮安市,新港街道,青年西路新港义务制学校工地内工房前裸土未覆盖
2,0050a0aeb4e0cdcbf462440c731e3721,202011240020,水污染,淮安市,市工业园区,淮安市朗坤污水处理有限公司（淮安市第三污水处理有限公司）\tPH仪器已安装联网。存在问题：废...
3,0050ee5e1847655e1a0865bb291b98ea,201911210010,大气污染,淮安市,生态文旅区,万康路与星辰路交叉口东南100m黄土露天堆放未覆盖，生活垃圾露天堆放未覆盖。
4,005494bc2c1bd85cab5da437bbebee50,202005020091,固废污染,淮安市,淮阴区,经巡查，发现当事人周雷驾驶渣土车在淮阴区回春路沿途泄露、抛洒。


In [None]:
from tqdm.auto import tqdm
# Registers the progress_apply() method on pandas objects.
tqdm.pandas()

# Work on an independent copy so the feature columns added below do not touch `df`.
df_sample = df.copy(deep=True)
# Cleaned problem description extracted from the raw complaint text.
df_sample['problem'] = df_sample['XSNR'].progress_apply(extract_problem)

  0%|          | 0/71849 [00:00<?, ?it/s]

In [None]:
# Housing estates (loupan): extract_loupan returns a (names, addresses) tuple per
# row; zip(*...) splits those tuples into the two new columns.
# NOTE(review): matching runs on the cleaned 'problem' text (digits and stopwords
# already removed), not on the raw XSNR - confirm that estate names survive the cleaning.
df_sample['loupan'], df_sample['loupan_address'] = zip(*df_sample['problem'].progress_apply(extract_loupan))

  0%|          | 0/71849 [00:00<?, ?it/s]

In [None]:
# Locations: LOCATION / ORGANIZATION entities found by the NER model in the
# cleaned problem text (one list per row).
df_sample['location'] = df_sample['problem'].progress_apply(extract_location)

  0%|          | 0/71849 [00:00<?, ?it/s]

In [None]:
# Persist the frame with the derived columns (problem, loupan, loupan_address,
# location) to an Excel file; the index is not written.
df_sample.to_excel(f'{unlabeled_data_path}/clean_data.xlsx', index=False)

In [None]:
# Candidate duplicate pairs, mined per district (XZQH): two complaints are paired
# when they mention the same or similar named entities (LOCATION/ORGANIZATION)
# or the same housing estate. One sheet per district is written to ner.xlsx.
import os
from openpyxl import load_workbook

ner_pairs_path = f'{unlabeled_data_path}/ner.xlsx'

for region in xzqh_cnt:
    # The missing-district (NaN) key selects no rows; skip it explicitly instead
    # of assuming it is always the first entry.
    if pd.isna(region):
        continue

    sentence_pairs = []
    df_i = df_sample[df_sample['XZQH']==region]
    for idx1, row1 in tqdm(df_i.iterrows(), total=len(df_i), desc=region):
        for idx2, row2 in df_i.iterrows():
            # Never pair a row with itself; checked first so the similarity
            # scoring is not run for pairs that would be discarded anyway.
            if idx1 == idx2:
                continue
            # Both orderings are evaluated on purpose: the location-vs-estate-address
            # criterion in find_similar_pairs is not symmetric.
            result = find_similar_pairs(row1, row2)
            if result is not None:
                sentence_pairs.append(result)

    if sentence_pairs:
        # Append mode needs an existing workbook, and `if_sheet_exists` is only
        # accepted in append mode, so the very first write creates the file.
        if os.path.exists(ner_pairs_path):
            writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
        else:
            writer_options = {'mode': 'w'}
        with pd.ExcelWriter(ner_pairs_path, engine="openpyxl", **writer_options) as writer:
            pd.DataFrame(sentence_pairs).to_excel(writer, sheet_name=region)

宝应湖农场:   0%|          | 0/5 [00:00<?, ?it/s]

清江浦区淮安现代商务集聚区管委会:   0%|          | 0/18 [00:00<?, ?it/s]

金港路街道:   0%|          | 0/20 [00:00<?, ?it/s]

清江浦区清河经济开发区管委会:   0%|          | 0/23 [00:00<?, ?it/s]

金湖经济开发区:   0%|          | 0/29 [00:00<?, ?it/s]

岔庙镇:   0%|          | 0/33 [00:00<?, ?it/s]

老子山镇:   0%|          | 0/34 [00:00<?, ?it/s]

大东镇:   0%|          | 0/37 [00:00<?, ?it/s]

高铁商务区街道:   0%|          | 0/39 [00:00<?, ?it/s]

南集镇:   0%|          | 0/40 [00:00<?, ?it/s]

唐集镇:   0%|          | 0/43 [00:00<?, ?it/s]

金南镇:   0%|          | 0/48 [00:00<?, ?it/s]

前锋镇:   0%|          | 0/49 [00:00<?, ?it/s]

石湖镇:   0%|          | 0/49 [00:00<?, ?it/s]

清江浦区工业园:   0%|          | 0/53 [00:00<?, ?it/s]

黄营镇:   0%|          | 0/53 [00:00<?, ?it/s]

陈师街道:   0%|          | 0/53 [00:00<?, ?it/s]

吕良镇:   0%|          | 0/55 [00:00<?, ?it/s]

成集镇:   0%|          | 0/56 [00:00<?, ?it/s]

东胡集镇:   0%|          | 0/56 [00:00<?, ?it/s]

戴楼街道:   0%|          | 0/58 [00:00<?, ?it/s]

淮洪路街道:   0%|          | 0/62 [00:00<?, ?it/s]

流均镇:   0%|          | 0/64 [00:00<?, ?it/s]

洪泽经济开发区:   0%|          | 0/67 [00:00<?, ?it/s]

三河镇:   0%|          | 0/68 [00:00<?, ?it/s]

宁连路街道:   0%|          | 0/69 [00:00<?, ?it/s]

金北街道:   0%|          | 0/73 [00:00<?, ?it/s]

梁岔镇:   0%|          | 0/75 [00:00<?, ?it/s]

桂五镇:   0%|          | 0/77 [00:00<?, ?it/s]

张码街道:   0%|          | 0/77 [00:00<?, ?it/s]

岔河镇:   0%|          | 0/78 [00:00<?, ?it/s]

五港镇:   0%|          | 0/80 [00:00<?, ?it/s]

保滩街道:   0%|          | 0/80 [00:00<?, ?it/s]

顺河镇:   0%|          | 0/91 [00:00<?, ?it/s]

范集镇:   0%|          | 0/92 [00:00<?, ?it/s]

鲍集镇:   0%|          | 0/98 [00:00<?, ?it/s]

官滩镇:   0%|          | 0/99 [00:00<?, ?it/s]

天泉湖镇:   0%|          | 0/100 [00:00<?, ?it/s]

富城路街道:   0%|          | 0/113 [00:00<?, ?it/s]

黄花塘镇:   0%|          | 0/115 [00:00<?, ?it/s]

淮河镇:   0%|          | 0/115 [00:00<?, ?it/s]

红窑镇:   0%|          | 0/116 [00:00<?, ?it/s]

塔集镇:   0%|          | 0/119 [00:00<?, ?it/s]

施河镇:   0%|          | 0/120 [00:00<?, ?it/s]

古桑街道:   0%|          | 0/125 [00:00<?, ?it/s]

朱坝街道:   0%|          | 0/125 [00:00<?, ?it/s]

清河街道:   0%|          | 0/126 [00:00<?, ?it/s]

管仲镇:   0%|          | 0/127 [00:00<?, ?it/s]

河桥镇:   0%|          | 0/133 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x7f343b2d2290>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


银涂镇:   0%|          | 0/135 [00:00<?, ?it/s]

盐河街道:   0%|          | 0/143 [00:00<?, ?it/s]

复兴镇:   0%|          | 0/151 [00:00<?, ?it/s]

博里镇:   0%|          | 0/156 [00:00<?, ?it/s]

穆店镇:   0%|          | 0/157 [00:00<?, ?it/s]

南马厂街道:   0%|          | 0/158 [00:00<?, ?it/s]

朱桥镇:   0%|          | 0/165 [00:00<?, ?it/s]

平桥镇:   0%|          | 0/167 [00:00<?, ?it/s]

朱码街道:   0%|          | 0/167 [00:00<?, ?it/s]

黄集街道:   0%|          | 0/173 [00:00<?, ?it/s]

浦楼街道:   0%|          | 0/176 [00:00<?, ?it/s]

漕运镇:   0%|          | 0/177 [00:00<?, ?it/s]

苏嘴镇:   0%|          | 0/181 [00:00<?, ?it/s]

徐溜镇:   0%|          | 0/183 [00:00<?, ?it/s]

福地路街道:   0%|          | 0/196 [00:00<?, ?it/s]

三树镇:   0%|          | 0/202 [00:00<?, ?it/s]

清江街道:   0%|          | 0/205 [00:00<?, ?it/s]

黄码镇:   0%|          | 0/206 [00:00<?, ?it/s]

新港街道:   0%|          | 0/221 [00:00<?, ?it/s]

东双沟镇:   0%|          | 0/225 [00:00<?, ?it/s]

新渡口街道:   0%|          | 0/227 [00:00<?, ?it/s]

渔沟镇:   0%|          | 0/234 [00:00<?, ?it/s]

太和街道:   0%|          | 0/239 [00:00<?, ?it/s]

车桥镇:   0%|          | 0/242 [00:00<?, ?it/s]

石塘镇:   0%|          | 0/244 [00:00<?, ?it/s]

闸口街道:   0%|          | 0/255 [00:00<?, ?it/s]

高沟镇:   0%|          | 0/258 [00:00<?, ?it/s]

长东街道:   0%|          | 0/276 [00:00<?, ?it/s]

马坝镇:   0%|          | 0/285 [00:00<?, ?it/s]

刘老庄乡:   0%|          | 0/288 [00:00<?, ?it/s]

山阳街道:   0%|          | 0/288 [00:00<?, ?it/s]

古清口街道:   0%|          | 0/289 [00:00<?, ?it/s]

钦工镇:   0%|          | 0/295 [00:00<?, ?it/s]

南陈集镇:   0%|          | 0/298 [00:00<?, ?it/s]

长西街道:   0%|          | 0/333 [00:00<?, ?it/s]

马头镇:   0%|          | 0/356 [00:00<?, ?it/s]

淮高镇:   0%|          | 0/391 [00:00<?, ?it/s]

高家堰镇:   0%|          | 0/393 [00:00<?, ?it/s]

徐杨街道:   0%|          | 0/425 [00:00<?, ?it/s]

枚乘街道:   0%|          | 0/429 [00:00<?, ?it/s]

黎城街道:   0%|          | 0/429 [00:00<?, ?it/s]

高良涧街道:   0%|          | 0/449 [00:00<?, ?it/s]

城南街道:   0%|          | 0/455 [00:00<?, ?it/s]

淮海街道:   0%|          | 0/456 [00:00<?, ?it/s]

府前街道:   0%|          | 0/465 [00:00<?, ?it/s]

水渡口街道:   0%|          | 0/467 [00:00<?, ?it/s]

丁集镇:   0%|          | 0/486 [00:00<?, ?it/s]

河下街道:   0%|          | 0/558 [00:00<?, ?it/s]

柳树湾街道:   0%|          | 0/575 [00:00<?, ?it/s]

盱城街道:   0%|          | 0/587 [00:00<?, ?it/s]

淮城街道:   0%|          | 0/603 [00:00<?, ?it/s]

涟城街道:   0%|          | 0/734 [00:00<?, ?it/s]

钵池街道:   0%|          | 0/741 [00:00<?, ?it/s]

王家营街道:   0%|          | 0/749 [00:00<?, ?it/s]

长江路街道:   0%|          | 0/763 [00:00<?, ?it/s]

市工业园区:   0%|          | 0/787 [00:00<?, ?it/s]

清浦街道:   0%|          | 0/800 [00:00<?, ?it/s]

生态文旅区:   0%|          | 0/1155 [00:00<?, ?it/s]

淮安市:   0%|          | 0/3228 [00:00<?, ?it/s]

涟水县:   0%|          | 0/3741 [00:00<?, ?it/s]

洪泽区:   0%|          | 0/3773 [00:00<?, ?it/s]

金湖县:   0%|          | 0/3934 [00:00<?, ?it/s]

经济技术开发区:   0%|          | 0/4277 [00:00<?, ?it/s]

淮安区:   0%|          | 0/5893 [00:00<?, ?it/s]

淮阴区:   0%|          | 0/6841 [00:00<?, ?it/s]

盱眙县:   0%|          | 0/7067 [00:00<?, ?it/s]

清江浦区:   0%|          | 0/8711 [00:00<?, ?it/s]

In [None]:
# Candidate duplicate pairs by sentence-transformers cosine similarity, mined per
# district (XZQH). One sheet per district is written to sbert.xlsx.
import os
from sentence_transformers import SentenceTransformer, util
model_noft = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

sbert_pairs_path = f'{unlabeled_data_path}/sbert.xlsx'

for region in xzqh_cnt:
    # The missing-district (NaN) key selects no rows and is not a usable sheet name.
    if pd.isna(region):
        continue
    print(region)
    sentences = df_sample[df_sample['XZQH']==region]['problem'].tolist()
    paraphrases = util.paraphrase_mining(model_noft, sentences, show_progress_bar=True, max_pairs=5000, top_k=50)

    # Each result is (score, index_a, index_b). The indices get their own names so
    # they cannot overwrite the district loop variable used as the sheet name below.
    # Pairs scoring 0.99 or higher are left out.
    sentence_pairs = [
        [score, sentences[idx_a], sentences[idx_b]]
        for score, idx_a, idx_b in paraphrases
        if score < .99
    ]

    # Append mode needs an existing workbook, and `if_sheet_exists` is only
    # accepted in append mode, so the very first write creates the file.
    if os.path.exists(sbert_pairs_path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'new'}
    else:
        writer_options = {'mode': 'w'}
    with pd.ExcelWriter(sbert_pairs_path, engine="openpyxl", **writer_options) as writer:
        pd.DataFrame(sentence_pairs).to_excel(writer, sheet_name=region)