In [4]:
import preprocess
import pandas as pd
from pandarallel import pandarallel

# Initialise pandarallel (parallel pandas operations) and ask it to display a
# progress bar while parallel work is running.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
import os

# Cached, tab-separated copies of the train / test splits.
train_path = './data/train_splited.csv'
test_path = './data/test_splited.csv'

# Set to an int to work with only the first `nrows` rows of each split
# (handy for quick experiments); None means "use everything".
nrows = None

if os.path.exists(train_path) and os.path.exists(test_path):
    # Cache hit: read the previously prepared splits.
    df_tr = pd.read_csv(train_path, sep='\t', nrows=nrows)
    df_te = pd.read_csv(test_path, sep='\t', nrows=nrows)
else:
    # Cache miss: load the training and test sets. The full data is always
    # loaded here so that the cache files are never a truncated sample.
    df_tr, df_te = preprocess.load_data(nrows=None)

    # Shift the penalty labels down by one
    # (presumably to make them zero-based -- confirm against preprocess.load_data).
    df_tr['penalty'] = df_tr['penalty'] - 1
    df_te['penalty'] = df_te['penalty'] - 1

    # Write to the same paths that the existence check above looks at.
    df_tr.to_csv(train_path, sep='\t', index=False)
    df_te.to_csv(test_path, sep='\t', index=False)

    # Honour `nrows` on a cache miss as well, so both branches yield the
    # same number of rows.
    if nrows is not None:
        df_tr = df_tr.head(nrows)
        df_te = df_te.head(nrows)

In [None]:
# Inspect the column names of the training set
df_tr.columns

In [None]:
# Show the (rows, columns) dimensions of the training set
df_tr.shape

In [None]:
# Print pandas' summary of the training-set frame
df_tr.info()

In [9]:
# Combine the training and test sets into one frame
# (ignore_index is not set, so each frame's own row labels are kept)
df_all = pd.concat([df_tr, df_te])

In [10]:
# Evaluation metric for the penalty-category predictions
from sklearn.metrics import f1_score


def micro_avg_f1(y_true, y_pred):
    """Return the micro-averaged F1 score of ``y_pred`` against ``y_true``.

    Both arguments are forwarded unchanged to ``sklearn.metrics.f1_score``
    with ``average='micro'``.
    """
    score = f1_score(y_true, y_pred, average='micro')
    return score

In [11]:
# tf-idf vectoriser for the case texts (it is fitted in a later cell)
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df / max_df: given as a float, terms whose document frequency is below
# min_df / above max_df (as a proportion of documents) are dropped; given as a
# positive integer, the threshold is an absolute number of documents.
#
# token_pattern: with the default word analyzer only tokens of two or more
# characters are matched and punctuation is always a separator, which splits
# amounts such as "1,000" and decimals such as "1.5". The \w+ pattern below
# keeps single-character tokens but still cannot match decimals.
# TODO: improve the tokenisation; kept as-is to get the baseline running.
tfv = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.95,
)

In [12]:
# Fitting the tf-idf transformer on all data (train + test) is not appropriate;
# the transformer should be fitted on the training set only.
# tfv.fit(df_all.text.values.tolist())
tfv.fit(df_tr.text.values.tolist())

TfidfVectorizer(max_df=0.95, min_df=3, ngram_range=(1, 3),
                token_pattern='(?u)\\b\\w+\\b')

In [13]:
train_x = tfv.transform(df_tr.text.values.tolist())      # convert the input texts to their tf-idf representation
train_y = df_tr.penalty.values                           # penalty labels, used as the classification target
# penalty_classes = len(df_tr.penalty.unique())            # number of distinct penalty categories

In [14]:
import time


def log(stri):
    """Print ``stri`` (converted with ``str``) prefixed by the local time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS`` followed by one space.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(' '.join([str(timestamp), str(stri)]))

In [15]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def model_cv(model, skf, train_x, train_y):
    """Cross-validate ``model`` and return its out-of-fold class probabilities.

    For every (train, validation) split produced by ``skf`` the model is
    re-fitted on the training part, scored on the validation part with
    ``micro_avg_f1`` and its ``predict_proba`` output is stored in the rows of
    the validation samples.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    skf : splitter exposing ``split(train_x, train_y)``.
    train_x : feature matrix, indexable by arrays of row indices.
    train_y : label array, indexable by arrays of row indices.

    Returns
    -------
    numpy.ndarray of shape ``(n_samples, n_distinct_labels)``.
        Column order is whatever ``predict_proba`` returns.
        NOTE(review): this assumes every training fold contains all labels, so
        that ``predict_proba`` yields ``len(set(train_y))`` columns -- confirm.
    """
    # Materialise the splits so the fold count comes from the splitter that
    # was actually passed in, not from a module-level variable.
    folds = list(skf.split(train_x, train_y))
    fold_count = len(folds)

    penalty_classes = len(set(train_y))
    stack = np.zeros((train_x.shape[0], penalty_classes))
    score_va = 0

    for i, (tr, va) in enumerate(folds):
        log('stack: %d/%d' % ((i + 1), fold_count))
        model.fit(train_x[tr], train_y[tr])
        # Class probabilities for the held-out validation rows.
        predict_va = model.predict_proba(train_x[va])
        # Score the fold once and reuse the value for logging and averaging.
        fold_score = micro_avg_f1(train_y[va], model.predict(train_x[va]))
        log('va acc:%f' % fold_score)
        score_va += fold_score
        stack[va] += predict_va

    score_va /= fold_count
    log('va avg acc:%f' % score_va)

    return stack

# Number of cross-validation folds used to build the splitter
n_folds = 5
# Stratified K-fold splitter (no shuffle or random_state is passed, so the library defaults apply)
skf = StratifiedKFold(n_splits=n_folds)

In [None]:
# Logistic regression (one-vs-rest, SAG solver) on the tf-idf features.
lr = LogisticRegression(C=2, n_jobs=-1, solver='sag', multi_class='ovr')
stack_lr = model_cv(lr, skf, train_x, train_y)

# One column per class, holding the out-of-fold probabilities.
lr_columns = ['tfidf_lr_{}'.format(k) for k in range(stack_lr.shape[1])]
df_stack = pd.DataFrame(stack_lr, columns=lr_columns)

df_stack.to_csv('./data/lr_prob.csv', index=None, encoding='utf8')

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the tf-idf features.
bnb = BernoulliNB()
stack_bnb = model_cv(bnb, skf, train_x, train_y)

# One column per class, holding the out-of-fold probabilities. The columns
# carry the `bnb` tag so they do not collide with the LinearSVC columns
# (`tfidf_svc_*`) once the per-model files are combined.
df_stack = pd.DataFrame(index=range(len(stack_bnb)))
for i in range(stack_bnb.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_bnb[:, i]

df_stack.to_csv('./data/bnb_prob.csv', index=None, encoding='utf8')

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the tf-idf features.
mnb = MultinomialNB()
stack_mnb = model_cv(mnb, skf, train_x, train_y)

# One column per class, holding the out-of-fold probabilities.
mnb_columns = ['tfidf_mnb_{}'.format(k) for k in range(stack_mnb.shape[1])]
df_stack = pd.DataFrame(stack_mnb, columns=mnb_columns)

df_stack.to_csv('./data/mnb_prob.csv', index=None, encoding='utf8')

In [17]:
from sklearn import svm

# Linear SVM on the tf-idf features. LinearSVC is scored through
# `decision_function` here instead of `predict_proba`, so this cell runs its
# own cross-validation loop rather than calling model_cv.
svc = svm.LinearSVC(loss='hinge', tol=0.000001, C=0.5, verbose=1, random_state=2020, max_iter=5000)
classes = len(set(train_y))
stack = np.zeros((train_x.shape[0], classes))
score_va = 0

# Materialise the splits so the fold count always matches the splitter in use.
svc_folds = list(skf.split(train_x, train_y))
svc_fold_count = len(svc_folds)

for i, (tr, va) in enumerate(svc_folds):
    log('stack: %d/%d' % ((i + 1), svc_fold_count))
    svc.fit(train_x[tr], train_y[tr])
    # Per-class decision scores for the validation rows (these are margins,
    # not probabilities).
    predict_va = svc.decision_function(train_x[va])
    # Score the fold once and reuse the value for logging and averaging.
    fold_score = micro_avg_f1(train_y[va], svc.predict(train_x[va]))
    log('va acc:%f' % fold_score)
    score_va += fold_score
    stack[va] += predict_va

score_va /= svc_fold_count
log('va avg acc:%f' % score_va)

# One column per class, holding the out-of-fold decision scores.
stack_svc = stack
df_stack = pd.DataFrame(index=range(len(stack_svc)))
for i in range(stack_svc.shape[1]):
    df_stack['tfidf_svc_{}'.format(i)] = stack_svc[:, i]

df_stack.to_csv('./data/svc_prob.csv', index=None, encoding='utf8')

2020-09-21 16:52:30 stack: 1/5
[LibLinear]



2020-09-21 17:44:01 va acc:0.631680
2020-09-21 17:44:01 stack: 2/5
[LibLinear]2020-09-21 18:41:02 va acc:0.517656
2020-09-21 18:41:03 stack: 3/5
[LibLinear]2020-09-21 19:41:32 va acc:0.523516
2020-09-21 19:41:32 stack: 4/5
[LibLinear]2020-09-21 20:38:38 va acc:0.509805
2020-09-21 20:38:39 stack: 5/5
[LibLinear]2020-09-21 21:31:49 va acc:0.511914
2020-09-21 21:31:49 va avg acc:0.538914


In [36]:
# Statistical features from the case text: use regular expressions to pull
# every monetary amount out of each case's text, then summarise the amounts
# per case with: sum, min, max, range (max - min), mean and standard deviation.
# TODO: number of words (length) of the case text; keywords such as drunk
#       driving or drugs; keywords such as dates and places.

import re
import numpy as np

# Raw strings keep `\d` and `\.` as regex escapes instead of (invalid) string
# escapes. The first pattern captures a number directly followed by 元, the
# second a number followed by 万元 (scaled by 10000 below).
# NOTE(review): a thousands separator is not handled -- "1,000元" is captured
# as "000". Confirm whether the texts contain such amounts.
yuan_pattern = re.compile(r'(\d*\.?\d+)元')
wan_yuan_pattern = re.compile(r'(\d*\.?\d+)万元')

train_raw = pd.read_csv('./data/train.csv', sep='\t')
train_raw = train_raw.reset_index(drop=True)

# One [ID, amount] record per amount found in a case text.
amt_list = []
for i, (case_id, text) in enumerate(zip(train_raw['ID'], train_raw['text'])):
    if i % 1000 == 1:
        log('iter = %d' % i)
    if not isinstance(text, str):
        # A missing text is read as NaN; such a case simply has no amounts.
        continue
    amt_list.extend([case_id, float(a)] for a in yuan_pattern.findall(text))
    amt_list.extend([case_id, float(a) * 10000] for a in wan_yuan_pattern.findall(text))

amt_feat = pd.DataFrame(amt_list, columns=['ID', 'amount'])

# Named aggregation keeps the column names produced before (sum, min, max,
# ptp, mean, std) without passing builtins / NumPy callables to `agg`.
amt_feat = (
    amt_feat.groupby('ID')['amount']
    .agg(
        sum='sum',
        min='min',
        max='max',
        ptp=lambda s: s.max() - s.min(),
        mean='mean',
        std='std',
    )
    .reset_index()
)

# Left-merge so that the output has one row per row of train_raw, in the same
# order; cases without any amount get NaN features.
# NOTE(review): every train_raw column other than ID / text is kept and gets
# the `amt_` prefix too -- confirm that no label column ends up in amt.csv.
amt_feat = pd.merge(train_raw, amt_feat, how='left', on='ID').drop(['ID', 'text'], axis=1)
amt_feat.columns = ['amt_' + col for col in amt_feat.columns]

amt_feat.to_csv('./data/amt.csv', index=None)

2020-09-21 22:35:37 iter = 1
2020-09-21 22:35:37 iter = 1001
2020-09-21 22:35:37 iter = 2001
2020-09-21 22:35:38 iter = 3001
2020-09-21 22:35:38 iter = 4001
2020-09-21 22:35:38 iter = 5001
2020-09-21 22:35:39 iter = 6001
2020-09-21 22:35:39 iter = 7001
2020-09-21 22:35:39 iter = 8001
2020-09-21 22:35:39 iter = 9001
2020-09-21 22:35:40 iter = 10001
2020-09-21 22:35:40 iter = 11001
2020-09-21 22:35:40 iter = 12001
2020-09-21 22:35:41 iter = 13001
2020-09-21 22:35:41 iter = 14001
2020-09-21 22:35:41 iter = 15001
2020-09-21 22:35:42 iter = 16001
2020-09-21 22:35:42 iter = 17001
2020-09-21 22:35:42 iter = 18001
2020-09-21 22:35:43 iter = 19001
2020-09-21 22:35:43 iter = 20001
2020-09-21 22:35:43 iter = 21001
2020-09-21 22:35:44 iter = 22001
2020-09-21 22:35:44 iter = 23001
2020-09-21 22:35:44 iter = 24001
2020-09-21 22:35:45 iter = 25001
2020-09-21 22:35:45 iter = 26001
2020-09-21 22:35:45 iter = 27001
2020-09-21 22:35:45 iter = 28001
2020-09-21 22:35:46 iter = 29001
2020-09-21 22:35:46 ite