In [138]:
import gensim
from gensim.models import word2vec
from gensim.similarities import WmdSimilarity

import csv
import numpy as np
import pandas as pd
import jieba
import Levenshtein

In [165]:
# Notebook configuration: input data, the case to query, and how many
# recommendations to show.
csv_file = 'data/power-facts.csv'
case_id = '44bc15c0-83b6-44de-8017-0d682537c63a'
topn = 10
# Read the CSV directly: readData is defined in a later cell, so calling it
# here fails with NameError on a fresh kernel run from top to bottom.
df = pd.read_csv(csv_file)

In [151]:
def readData(csv_file):
    """
    Load the case collection from a CSV file.

    Inputs:
        csv_file: path to the CSV file
    Outputs:
        the parsed cases as a pandas DataFrame
    """
    return pd.read_csv(csv_file)

In [152]:
def segWithDict(case_list):
    """
    Segment the fact text of every case with jieba and remove stop words.

    Inputs:
        case_list: sequence of case rows; the text at position 4 of each row
            is segmented (presumably the ILLEGAL_FACTS column -- TODO confirm
            against the CSV schema).
    Outputs:
        corpus: list with one token list per case, e.g. [['w1', 'w2', ...], ...]

    Whitespace-only tokens are dropped. Stop words are read from
    'data/stopwords.txt' (one per line); '\\n', '\\r' and '\\r\\n' are always
    treated as stop words.
    """
    userdict = 'data/userdict.txt'

    # Register the custom dictionary before any text is cut; loading it after
    # the segmentation loop leaves that segmentation unaffected by it.
    jieba.load_userdict(userdict)

    # The context manager closes the file; a set gives O(1) membership tests.
    with open('data/stopwords.txt', 'r', encoding='utf-8') as stopword_file:
        stopwords = {line.strip() for line in stopword_file}
    stopwords.update(['\n', '\r', '\r\n'])

    corpus = []
    for cases in case_list:
        tokens = jieba.cut(cases[4], cut_all=False, HMM=True)
        # word.strip() filters blank tokens, which a join/split round-trip on
        # ' ' would otherwise turn into empty-string entries.
        corpus.append([word for word in tokens
                       if word.strip() and word not in stopwords])
    return corpus


In [1]:
def getCaseList(case_id,counts, case_list, df1):
    """
    Rank the cases in case_list by WmdSimilarity against one query case.

    Inputs:
        case_id: identifier of the query case; must equal element 0 of a row
        counts: forwarded to WmdSimilarity as its third positional argument
        case_list: list of case rows (element 0 is the id, element 4 the text
            that gets segmented and returned)
        df1: not used by this function
    Outputs:
        list of [case_id, fact_text] pairs, in the order returned by the
        similarity query
    """
    tokenized_cases = segWithDict(case_list)
    w2v_model = word2vec.Word2Vec.load('models/low_1000.model')
    wmd_index = WmdSimilarity(tokenized_cases, w2v_model, counts)

    # Position of the query case inside case_list; list.index raises
    # ValueError when the id is absent.
    all_ids = [row[0] for row in case_list]
    query_pos = all_ids.index(case_id)

    matches = wmd_index[tokenized_cases[query_pos]]

    # Element 0 of each match is used as the position of the matched case in
    # case_list; the remaining elements are ignored here.
    return [[case_list[match[0]][0], case_list[match[0]][4]] for match in matches]

In [154]:
'''示例：   
    r_df=by_jaro_winkler('c0814c02-0504-46fe-ac8c-e6087d4cd01f')
    print(r_df)
'''
def  recommender_by_id(case_id,df1):
    """
    Rank the cases in df1 by Jaro-Winkler similarity to the facts of case_id.

    The query text is read from the notebook-level ``df`` (first row whose
    CASE_ID equals ``case_id``); every row of ``df1`` is scored against it.

    Inputs:
        case_id: CASE_ID of the query case; must be present in ``df``.
        df1: candidate cases. Column 0 is used as the case id and column 4 as
            the fact text (the result labels them CASE_ID / ILLEGAL_FACTS).
    Outputs:
        DataFrame with columns ['CASE_ID', 'ILLEGAL_FACTS'], most similar
        first, one row per distinct case id, keeping the row labels of df1.
        The query case itself is part of the ranking when df1 contains it.
    Raises:
        ValueError: if case_id does not occur in ``df``.
    """
    target_rows = df[df['CASE_ID'] == case_id]
    if target_rows.empty:
        raise ValueError('case_id not found: {}'.format(case_id))
    # Take the cell value itself rather than its to_string() rendering, which
    # goes through display formatting and concatenates multiple matches.
    target_line = str(target_rows['ILLEGAL_FACTS'].iloc[0]).strip()

    scores = {}      # case id -> similarity (last row wins for a repeated id)
    first_pos = {}   # case id -> position of its first row in df1
    for pos, (cid, fact) in enumerate(zip(df1.iloc[:, 0], df1.iloc[:, 4])):
        scores[cid] = Levenshtein.jaro_winkler(target_line, fact.strip())
        first_pos.setdefault(cid, pos)

    # sorted() is stable, so equally similar cases keep their df1 order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Select all ranked rows in one positional indexing step instead of
    # growing the frame with DataFrame.append inside a loop (quadratic, and
    # removed in pandas 2.0). This also works for an empty df1.
    r_df = df1.iloc[[first_pos[cid] for cid, _ in ranked], [0, 4]].copy()
    r_df.columns = ['CASE_ID', 'ILLEGAL_FACTS']
    return r_df

def by_jaro_winkler(case_id, min_cases=20):
    """
    Recommend similar cases for case_id among the cases sharing its POWER_ID.

    Inputs:
        case_id: CASE_ID of the query case, looked up in the notebook-level
            ``df``.
        min_cases: minimum number of cases with the same POWER_ID required to
            produce a recommendation (default 20).
    Outputs:
        The DataFrame returned by recommender_by_id, or None (after printing
        a message) when case_id is unknown or too few cases share its
        POWER_ID.
    """
    query_rows = df[df['CASE_ID'] == case_id]
    if not query_rows.empty:
        # Compare with the stored value, not its to_string() rendering: a
        # string never equals a numeric POWER_ID, which would leave the
        # filter below empty.
        power_id = query_rows['POWER_ID'].iloc[0]
        df1 = df[df['POWER_ID'] == power_id]
        if df1.shape[0] >= min_cases:
            return recommender_by_id(case_id, df1)
    print("案件数量太少，无法推荐")
    return None

In [173]:
# Rank-based score from the Jaro-Winkler ordering: 200 for the best match,
# 10 less per rank, never below 0.
jaroDf = by_jaro_winkler(case_id)
if jaroDf is None:
    # by_jaro_winkler returns None when it cannot recommend; stop with a clear
    # error instead of an AttributeError on the next statement.
    raise ValueError('no Jaro-Winkler recommendation for case_id {}'.format(case_id))
# Renumber rows 0..n-1 so the row position is the rank.
jaroDf = jaroDf.reset_index(drop=True)
jaroDf['SCORE1'] = np.maximum(200 - 10 * np.arange(len(jaroDf)), 0)
jaroDf

Unnamed: 0,CASE_ID,ILLEGAL_FACTS,SCORE1
0,0f1812dd-fb6a-4ddb-bfe9-34fe03351cfc,在京密引水渠温泉倒虹吸下游左岸20米处水域垂钓,200
1,ec74500e-f252-4f61-9b1c-408582324bff,在京密引水渠温泉倒虹吸下游右岸300米处水域垂钓,190
2,b4227d87-ac64-496e-9881-3fbb1639c3eb,在京密引水渠温泉倒虹吸下游左岸300米处水域垂钓,180
3,3463cea2-169a-42e8-9c11-899e9d213cfc,在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼,170
4,44bc15c0-83b6-44de-8017-0d682537c63a,在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼,160
5,19c405d6-d889-4b17-95d2-52dc2de73aae,在京密引水渠温泉倒虹吸下游左岸500米处水域游泳,150
6,5edb4506-05ee-48e7-9984-52563224208a,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,140
7,e33b075e-75fd-4990-89c1-3a07c6c73bc8,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,130
8,e27c2c3f-e53f-42b9-b22b-13e0c07fc4ff,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,120
9,c91f3c96-6f5f-4e28-89a6-466b408be0af,在京密引水渠温泉倒虹吸下游左岸5米处水域游泳,110


In [175]:
# Combine both rankings: merge the two frames on the columns they share, then
# average SCORE1 and SCORE2 into SCORE and sort best-first.
# NOTE(review): wmDf is not defined in any cell shown here -- presumably the
# WMD-based ranking carrying a SCORE2 column; confirm where it is built.
df2 = pd.merge(jaroDf, wmDf)
df2['SCORE'] = (df2['SCORE1'] + df2['SCORE2']) / 2
df2 = df2.drop(['SCORE1', 'SCORE2'], axis=1).sort_values('SCORE', ascending=False)
df2

Unnamed: 0,CASE_ID,ILLEGAL_FACTS,SCORE
3,3463cea2-169a-42e8-9c11-899e9d213cfc,在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼,190
0,0f1812dd-fb6a-4ddb-bfe9-34fe03351cfc,在京密引水渠温泉倒虹吸下游左岸20米处水域垂钓,185
1,ec74500e-f252-4f61-9b1c-408582324bff,在京密引水渠温泉倒虹吸下游右岸300米处水域垂钓,185
2,b4227d87-ac64-496e-9881-3fbb1639c3eb,在京密引水渠温泉倒虹吸下游左岸300米处水域垂钓,185
4,44bc15c0-83b6-44de-8017-0d682537c63a,在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼,180
6,5edb4506-05ee-48e7-9984-52563224208a,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,150
5,19c405d6-d889-4b17-95d2-52dc2de73aae,在京密引水渠温泉倒虹吸下游左岸500米处水域游泳,140
7,e33b075e-75fd-4990-89c1-3a07c6c73bc8,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,140
8,e27c2c3f-e53f-42b9-b22b-13e0c07fc4ff,在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓,130
9,c91f3c96-6f5f-4e28-89a6-466b408be0af,在京密引水渠温泉倒虹吸下游左岸5米处水域游泳,115


In [177]:
# Print the id and facts of the best-ranked merged results.
result_list = df2.values.tolist()
# Slice instead of indexing up to topn: fewer than topn + 1 merged rows would
# otherwise raise IndexError. topn + 1 rows are shown -- presumably because
# the query case itself is part of the ranking; TODO confirm.
for i, row in enumerate(result_list[:topn + 1]):
    print('=======================',i,'===========================')
    print(row[0])
    print(row[1], '\n')
    

3463cea2-169a-42e8-9c11-899e9d213cfc
在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼 

0f1812dd-fb6a-4ddb-bfe9-34fe03351cfc
在京密引水渠温泉倒虹吸下游左岸20米处水域垂钓 

ec74500e-f252-4f61-9b1c-408582324bff
在京密引水渠温泉倒虹吸下游右岸300米处水域垂钓 

b4227d87-ac64-496e-9881-3fbb1639c3eb
在京密引水渠温泉倒虹吸下游左岸300米处水域垂钓 

44bc15c0-83b6-44de-8017-0d682537c63a
在京密引水渠温泉倒虹吸下游左岸300米处水域捕鱼 

5edb4506-05ee-48e7-9984-52563224208a
在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓 

19c405d6-d889-4b17-95d2-52dc2de73aae
在京密引水渠温泉倒虹吸下游左岸500米处水域游泳 

e33b075e-75fd-4990-89c1-3a07c6c73bc8
在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓 

e27c2c3f-e53f-42b9-b22b-13e0c07fc4ff
在京密引水渠温泉倒虹吸下游右岸600米处水域垂钓 

c91f3c96-6f5f-4e28-89a6-466b408be0af
在京密引水渠温泉倒虹吸下游左岸5米处水域游泳 

c8a785a6-c73f-453c-a428-ac6daa8c717f
在京密引水渠温泉倒虹吸下游左岸5米处水域游泳 



In [None]:
def case_recommernder(case_id):
    """
    Look up the facts of case_id and fetch the cases under its predicted power.

    Inputs:
        case_id: CASE_ID of the query case in the notebook-level ``df``.
    Outputs:
        None. NOTE(review): the values returned by getPowerlist are unpacked
        but neither used nor returned yet -- the function looks unfinished.
    Raises:
        ValueError: if case_id does not occur in ``df``.
    """
    case_rows = df[df.CASE_ID == case_id]
    if case_rows.empty:
        raise ValueError('case_id not found: {}'.format(case_id))
    # Use the stored value rather than its to_string() display rendering.
    illegal_fact = str(case_rows.ILLEGAL_FACTS.iloc[0]).strip()
    # ASCII commas throughout: a full-width comma in this unpacking target is
    # a SyntaxError.
    power_id, power_name, case_list, counts, df1 = getPowerlist(illegal_fact)
    

In [None]:
from law Predict import LawPredictor
def getPowerlist(illegal_fact):
    """
    Predict the power for a fact text and collect the cases filed under it.

    Inputs:
        illegal_fact: fact text handed to the classifier.
    Outputs:
        (power_id, power_name, case_list, counts, df1) where df1 holds the
        rows of the notebook-level ``df`` whose POWER_ID equals the predicted
        power_id, case_list is df1 as a list of row lists, and counts is the
        number of non-null values in df1's first column.
    """
    # NOTE(review): LawPredictor comes from the import just above this
    # function, whose module name is written with a space -- confirm the real
    # module name.
    test=LawPredictor.PowerClassifier('anjian')
    # Classify the text passed in by the caller; `fact` was never defined and
    # the facts must not be re-read through the global case_id.
    power_id,power_name=test.power_predict(illegal_fact)
    df1 = df[(df.POWER_ID==power_id)]
    counts = int(df1.iloc[:, 0].count())
    case_list = df1.values.tolist()
    return power_id,power_name,case_list,counts,df1


In [None]:
def wmn_recommender(case_list):
    """
    Run the WmdSimilarity-based ranking (getCaseList) over case_list.

    Inputs:
        case_list: list of case rows forwarded to getCaseList.
    Outputs:
        None -- the ranking is computed but not returned.

    NOTE(review): case_id, counts and df1 are read from notebook-level names;
    no cell shown here assigns counts or df1 at that level -- confirm.
    """
    wmn_list = getCaseList(case_id=case_id, counts=counts, case_list=case_list, df1=df1)
    