In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **資料蒐集**

In [None]:
issue_list = [
                '妨害風化',
                '恐嚇危害安全',
                '公然侮辱',
                '誹謗',
                '妨害秘密',
                '妨害電腦使用'
]

In [None]:
import time
import random
import requests
import pandas as pd

HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
}

def get_case(issue, page1, page2):
  link_list = []
  for page in range(page1, page2):
    url = f"https://www.lawplus.com.tw/rest/search/report?querySentence=網路&keyword=網路&prevKeyword=網路&date=&money=&sentence=&caseNum=&caseTypes=&courts=&levels=&jtypes=&tags=&issue={issue}&main=&judge=&judgeTypes=&lawyer=&litigant=&prosecutor=&clerk=&rows=10&page={page+1}&sortField=&_=1631628688160"
        
    r = requests.get(url, headers=HEADERS)
    base_data = r.json()
    for each in base_data['rows']:
      link_list.append('https://www.lawplus.com.tw/rest/search/report/' + each['identifier'])
      
    time.sleep(1)

  case_list = []
  for link in link_list:
    case_r = requests.get(link, headers=HEADERS)
    case_data = case_r.json()
    case_list.append({
      'caseNum': case_data['response']['reportBase']['caseNum'],  #案號
      'court': case_data['response']['reportBase']['courtCode'],  #法院
      'issue': issue,
      'content': case_data['response']['reportBase']['content'],
      'defendant': case_data['response']['report']['defendant'],  #被告
      'main': case_data['response']['report']['main'],  #主文
      'level': case_data['response']['report']['level'],  #審級
      'previousCaseNum': case_data['response']['report']['previousCaseNum']  #前審案號
    })

    time.sleep(1)
  
  return case_list


def crawler():
  judgement = []

  for issue in issue_list:
    psize_url = f"https://www.lawplus.com.tw/rest/search/report?querySentence=網路&keyword=網路&prevKeyword=網路&date=&money=&sentence=&caseNum=&caseTypes=&courts=&levels=&jtypes=&tags=&issue={issue}&main=&judge=&judgeTypes=&lawyer=&litigant=&prosecutor=&clerk=&rows=10&page=1&sortField=&_=1631628688160"  #加上全文檢索「網路」以縮小搜尋範圍
    psize_r = requests.get(psize_url, headers=HEADERS)
    psize_data = psize_r.json()
    psize = psize_data['total']
    
    for row in range((psize//100)+1):
      if row > (psize//100):
        case_list = get_case(issue, (row*100), psize)
      else:
        case_list = get_case(issue, (row*100), (row*100+100))
  
      judgement.extend(case_list)
      time.sleep(1)

  df = pd.DataFrame(judgement)
  df.to_csv('/content/drive/MyDrive/legal_predictor/data/judgement.csv')

In [None]:
#get_case('妨害風化', 0, 2)

crawler()

# **資料前處理**

In [None]:
import pandas as pd

raw = pd.read_csv('/content/drive/MyDrive/legal_predictor/data/judgement.csv', index_col=0)

raw

In [None]:
import re

def rm_space(data):
  a = re.compile(r'\n|&nbsp|\xa0|\\xa0|\u3000|\\u3000|\u0020|\\u0020|\t|\r')
  data = a.sub('', data)
  return data


def clean_cont(data):  # 裁判書全文處理 (content)
  cont = []

  for content in data['content']:
    fact = re.search(r'\n\s+[\u2e80-\u9fff]{0,2}(\u4e8b\s*\u5be6|\u7406\s*\u7531)[\s|\S]*\u4e2d\s+\u83ef\s+\u6c11\s+\u570b', content)
    try:
      rms_cont = rm_space(fact.group())
    except:
      rms_cont = rm_space(content)

    try: 
      rms_cont1 = rms_cont.split('一、')[1]
      rms_cont2 = rms_cont1.split('二、')[0]
    except:
      rms_cont2 = rms_cont

    if ('引用' in rms_cont2 and '附件' in rms_cont2) or '如附件' in rms_cont2:
      try: 
        cont1 = content.split('附件：')[1]
        cont2 = cont1.split('一、')[1]
        cont3 = cont2.split('二、')[0]
      except:
        pass
    elif '檢察官聲請' in rms_cont2:
      cont3 = ''
    else:
      cont3 = rms_cont2
    
    cont.append(cont3)

  data['content_clean'] = cont
  

  return data

#clean_cont(raw)

In [None]:
def clean_all(raw):
  raw1 = raw.drop(raw[raw['main'].isin(['上訴駁回。'])].index)  # 刪除上訴駁回案件
  raw2 = raw1.dropna(subset=['main']) # 刪除主文為空值的資料
  raw3 = raw2[~raw2['main'].str.contains('不受理。')]  # 刪除不受理案件
  raw3.reset_index(drop=True, inplace=True)

  main_clean = []  # 處理主文空格
  lawsuit = []  # 標記被告是否有罪，有罪為1，無罪為0
  for main in raw3['main']:
    main_c = rm_space(main)
    main_clean.append(main_c)
    if '無罪' in main:
      lawsuit.append(0)
    else:
      lawsuit.append(1)

  raw3['main_clean'] = main_clean
  raw3['lawsuit'] = lawsuit

  clean = clean_cont(raw3)

  return clean


#clean_all(raw)

In [None]:
def save_process(raw):
  raw = clean_all(raw)
  df = pd.DataFrame({
      'text': raw['content_clean'],
      'label': raw['lawsuit']
  })
      
  df.drop(df.loc[df['text']==''].index, inplace=True)
  df.reset_index(drop=True, inplace=True)

  row=0
  for data in df['text']:
    if len(data) < 25:
      df = df.drop(index=[row])
    row += 1

  df1 = df.drop_duplicates(subset=['text'])
  df1.reset_index(drop=True, inplace=True)

  df1.to_csv('/content/drive/MyDrive/legal_predictor/data/process.csv', encoding='utf-8-sig')
  #return len(df)

In [None]:
save_process(raw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# **模型**

https://reurl.cc/yeLp2O

In [None]:
%pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
import joblib
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import train_test_split

In [None]:
data = pd.read_csv('process.csv', index_col=0)
# df = data[:1000]

Downloading...
From: https://drive.google.com/uc?id=1-A0KVzHjng55QlBFgzeqC-1ArBE4RnJm
To: /content/process.csv
  0% 0.00/4.85M [00:00<?, ?B/s]100% 4.85M/4.85M [00:00<00:00, 75.8MB/s]


In [None]:
### DistilBERT
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

### BERT
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# import transformers as ppb  # pytorch transformers
# Load pretrained model/tokenizer
# tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# model = model_class.from_pretrained(pretrained_weights)

###################################
###################################

# ckip繁中
from transformers import (
   BertTokenizerFast,
   AutoModelForMaskedLM,
   AutoModelForCausalLM,
   AutoModelForTokenClassification,
)

# masked language model (ALBERT, BERT)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = AutoModelForMaskedLM.from_pretrained('ckiplab/albert-tiny-chinese')

In [None]:
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=)))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Padding

In [None]:
max_len = 0
for i in tokenized:
   if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized])
np.array(padded).shape

(1000, 50)

### Masking

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1000, 50)

### Model : And Now, Deep Learning!

In [None]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# Slice the output for the first position for all the sequences, take all hidden unit outputs
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = df['label']

In [None]:
np.save('/content/drive/My Drive/legal_predictor/model/feature_test', features)
labels.to_csv('/content/drive/My Drive/legal_predictor/model/labels_test', encoding="utf-8-sig")

# **打包**

In [None]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.2 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 68.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 21.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [None]:
def load_file():
  import numpy as np
  import pandas as pd

  features = np.load('feature_test.npy')
  labels = pd.read_csv('labels_test', index_col=0)

  return features, labels


def lawsuit_predict(input_text):
  input = [input_text]

  import numpy as np
  import pandas as pd
  import torch
  from sklearn.linear_model import LogisticRegression

  from transformers import (
   BertTokenizerFast,
   AutoModelForMaskedLM,
   AutoModelForCausalLM,
   AutoModelForTokenClassification,
  )

  tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
  model = AutoModelForMaskedLM.from_pretrained('ckiplab/albert-tiny-chinese')

  features, labels = load_file()

  test_df = pd.DataFrame({ 'text': input })
  tokenized = test_df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=50)))

  max_len = 0
  for i in tokenized:
    if len(i) > max_len:
          max_len = len(i)

  padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized])

  attention_mask = np.where(padded != 0, 1, 0)
  attention_mask.shape

  input_ids = torch.tensor(padded)
  attention_mask = torch.tensor(attention_mask)

  with torch.no_grad():
      last_hidden_states = model(input_ids, attention_mask=attention_mask)

  input_feature = last_hidden_states[0][:,0,:].numpy()

  lr_clf = LogisticRegression()
  lr_clf.fit(features, labels)

  result = lr_clf.predict_proba(input_feature)

  return result[0][0], result[0][1]  #無罪率,有罪率 

In [None]:
lawsuit_predict('民國110年11月1日，被告在臉書平台公開社團「小可愛」，公然貼文「你是不是有問題」等毀損他人名譽文字並標記甲，使甲感到不適')

Downloading...
From: https://drive.google.com/uc?id=1-35qrkycCUtk0nXy1cP4GpqKL7vy_FvQ
To: /content/feature_test.npy
100% 84.5M/84.5M [00:00<00:00, 182MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-7LPSBU38ISrptSG9JkvRqnTvnaJn_g4
To: /content/labels_test
100% 5.90k/5.90k [00:00<00:00, 9.44MB/s]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.5068488628460055, 0.49315113715399445)