-
Notifications
You must be signed in to change notification settings - Fork 0
/
segmentation.py
128 lines (113 loc) · 5.38 KB
/
segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import csv
from pyhanlp import *
import pickle
import pandas as pd
import time
from jpype import *
import pickle
filename = 'OGI2016_noduplicate_winloss.csv'
#filename = '/Users/wuxiaohan/Desktop/UCSC_database_project/data/test_data.csv'
pos_list = ['an','g','gb','gbc','gc','gg','gi','gm','gp','i','j','l','n','nb',\
'nba','nbc','nbp','nf','ng','nh','nhd','nhm','ni','nic','nis','nit'\
'nl','nm','nmc','nn','nnd','nnt','nt','ntc','nto','nz','vn']
stopword = ['原告','被告','汉族','判决书','判决','法院','当事人','审判长','审判员',\
'人民陪审员','代理审判员','书记员','对方','当事人','人数','副本','案件'\
'被告方','代表人','终字','原被告','。案','上诉人','确认','法>']
##read file
texts = pd.read_csv(filename)
ids = list(texts['id'])
category = list(texts['category'])
facts = list(texts['facts'])
# with open(filename) as csvfile:
# reader = csv.reader(csvfile)
# next(reader)
# ids=[]
# facts = []
# for item in reader:
# ids.append(item[0])
# facts.append(item[19])
# csvfile.close()
##remove spaces tab \n
def cleaning(files):
for i in range(len(files)):
files[i] = files[i].replace(' ','').replace(' ','').replace('\n','')
return files
facts = cleaning(facts)
print('cleaning done')
# """ 演示用户词典的动态增删
# TO-DO:
# DoubleArrayTrie分词
# 首字哈希之后二分的trie树分词
# >>> text = "攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰"
# >>> demo_custom_dictionary(text)
# [攻城/vi, 狮/ng, 逆袭/nz, 单身/n, 狗/n, ,/w, 迎娶/v, 白富美/nr, ,/w, 走上/v, 人生/n, 巅峰/n]
# [攻城狮/nz, 逆袭/nz, 单身狗/nz, ,/w, 迎娶/v, 白富美/nz, ,/w, 走上/v, 人生/n, 巅峰/n]
# """
#print(HanLP.segment(text))
# CustomDictionary.insert(newphrase, "nz 1024") # 动态增加
#CustomDictionary.insert("白富美", "nz 1024") # 强行插入
#CustomDictionary.remove("攻城狮") # 删除词语(注释掉试试)
#CustomDictionary.add("单身狗", "nz 1024 n 1")
#CustomDictionary.remove("单身狗") # 删除词语(注释掉试试)
#CustomDictionary.remove("白富美") # 删除词语(注释掉试试)
#print(CustomDictionary.get("单身狗"))
# CustomDictionary.insert("审判委员会", "nz 1024") #其他名词
# """ NLP分词,更精准的中文分词、词性标注与命名实体识别
# 标注集请查阅 https://github.com/hankcs/HanLP/blob/master/data/dictionary/other/TagPKU98.csv
# 或者干脆调用 Sentence#translateLabels() 转为中文
# #>>> demo_NLP_segment()
# [我/r, 新造/v, 一个/m, 词/n, 叫/v, 幻想乡/ns, 你/r, 能/v, 识别/v, 并/c, 正确/ad, 标注/v, 词性/n, 吗/y, ?/w]
# 我/代词 的/助词 希望/名词 是/动词 希望/动词 张晚霞/人名 的/助词 背影/名词 被/介词 晚霞/名词 映/动词 红/形容词
# 支援/v 臺灣/ns 正體/n 香港/ns 繁體/n :/w [微软/nt 公司/n]/nt 於/p 1975年/t 由/p 比爾·蓋茲/n 和/c 保羅·艾倫/nr 創立/v 。/w
# """
# print(NLPTokenizer.segment("我新造一个词叫幻想乡你能识别并正确标注词性吗?")) # “正确”是副形词。
# 注意观察下面两个“希望”的词性、两个“晚霞”的词性
# print(NLPTokenizer.analyze("我的希望是希望张晚霞的背影被晚霞映红").translateLabels())
# print(NLPTokenizer.analyze("支援臺灣正體香港繁體:微软公司於1975年由比爾·蓋茲和保羅·艾倫創立。"))
start = time.clock()
# Run segmentation in JVM
startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=/anaconda3/lib/python3.7/site-packages/pyhanlp/static/hanlp-portable-1.6.0.jar")
NLPTokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
segmentedText = []
segmentedPOS = []
for i in range(len(facts)):
# for i in range(10):
textSegments = []
posSegments = []
for segment in NLPTokenizer.segment(facts[i]):
# split on the last occurence of '/'
word, pos = str(segment).rsplit('/',1)
textSegments.append(word)
posSegments.append(pos)
segmentedText.append(textSegments)
segmentedPOS.append(posSegments)
if i%100 == 0:
print(i,'files done, time used:{}'.format(time.clock()-start))
print('segmentation done')
shutdownJVM()
# Shutdown JVM
d = {'id': ids, 'segmentedText': segmentedText, 'segmentedPOS':segmentedPOS, 'category':category}
out_list = pd.DataFrame(data=d)
out_list.to_pickle('All_OGI_segmented_facts.pickle')
# out_list.to_csv('All_OGI_segmented_facts.csv')
# out_list = []
# for item in segmented:
# noun_list = []
# for file in item:
# if str(file.nature) in pos_list and len(file.word) > 1 and file.word not in stopword:
# noun_list.append(file.word)
# out_list.append(noun_list)
# print('Selected nouns, remove stopwords')
#
# all_stems = sum(out_list, [])
# stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) < 5)
# texts = [[stem for stem in text if stem not in stems_once] for text in out_list]
# print(texts[0])
# filehandler = open("/Users/wuxiaohan/Desktop/2019SpringRaWork/All_OGI/All_OGI_segmented","wb")
# pickle.dump(out_list,filehandler)
# filehandler.close()
# test=''
# for item in res:
# test = test+item
# print(HanLP.extractPhrase(test,20))
# print(CustomDictionary.get("审判委员会"))