In [None]:
import preprocess
import pandas as pd
import numpy as np
from pandarallel import pandarallel

# Initialise pandarallel (with progress bars) so `.parallel_apply` can be used on pandas objects below.
pandarallel.initialize(progress_bar=True)

In [None]:
train_path = './data/train_splited.csv'
test_path = './data/test_splited.csv'
import os
nrows = None  # set to an int to work with only the first `nrows` rows of each split
if not os.path.exists(train_path) or not os.path.exists(test_path):
    # Cache miss: build both splits from the raw data.
    # The full data is always loaded here so that the cached files are complete.
    df_tr, df_te = preprocess.load_data(nrows=None)

    # Shift the penalty labels down by one (plain vectorised subtraction).
    df_tr['penalty'] = df_tr['penalty'] - 1
    df_te['penalty'] = df_te['penalty'] - 1

    # Write to the same paths that are checked and read, instead of repeating the literals.
    df_tr.to_csv(train_path, sep='\t', index=False)
    df_te.to_csv(test_path, sep='\t', index=False)

    # Honour `nrows` on this branch too, so both branches yield the same number of rows.
    if nrows is not None:
        df_tr = df_tr.head(nrows)
        df_te = df_te.head(nrows)
else:
    df_tr = pd.read_csv(train_path, sep='\t', nrows=nrows)
    df_te = pd.read_csv(test_path, sep='\t', nrows=nrows)

In [None]:
# Inspect the column names of the training frame
df_tr.columns

In [None]:
# (rows, columns) of the training frame
df_tr.shape

In [None]:
# Per-column summary of the training frame
df_tr.info()

In [None]:
# Combine the training and test sets into a single frame.
# NOTE(review): in the visible notebook df_all is only referenced by a commented-out line.
df_all = pd.concat([df_tr, df_te])

In [None]:
# Evaluation metric for the penalty-class predictions
from sklearn.metrics import f1_score
def micro_avg_f1(y_true, y_pred):
    """Return the micro-averaged F1 score of `y_pred` against `y_true`."""
    score = f1_score(y_true, y_pred, average='micro')
    return score

In [None]:
# TF-IDF vectoriser for the case texts
from sklearn.feature_extraction.text import TfidfVectorizer
# As floats, max_df / min_df drop terms that occur in more than / fewer than that share of the
# documents; as positive integers they are absolute document counts instead.
tfv = TfidfVectorizer(#analyzer='word',
                      # With analyzer='word' the default pattern only matches tokens of two or more
                      # characters and drops punctuation, which splits amounts such as 1,000 and
                      # decimals such as 1.5.
                      # \w+ also keeps single-character tokens but still cannot match decimals;
                      # to be improved later, the goal for now is a running baseline.
                      token_pattern=r'(?u)\b\w+\b',             
                      ngram_range=(1,3),
                      min_df=3, max_df=0.95)

In [None]:
# Fitting TF-IDF on all data (train + test) is not appropriate; the transformer is fitted on the
# training texts only.
# tfv.fit(df_all.text.values.tolist())
tfv.fit(df_tr.text.values.tolist())

In [None]:
train_x = tfv.transform(df_tr.text.values.tolist())      # convert the input texts to their TF-IDF representation
train_y = df_tr.penalty.values
# penalty_classes = len(df_tr.penalty.unique())            # number of distinct penalty classes

In [None]:
import time

def log(stri):
    """Print `stri` prefixed with the current local time as 'YYYY-MM-DD HH:MM:SS'."""
    # strftime formats the current local time when no time tuple is passed
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print('{} {}'.format(timestamp, stri))

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def model_cv(model, skf, train_x, train_y):
    """Collect out-of-fold class probabilities for `model` (stacking features).

    Parameters
    ----------
    model : estimator exposing fit / predict / predict_proba and a `classes_` attribute
    skf : splitter exposing split(X, y) and get_n_splits()
    train_x : row-indexable feature matrix
    train_y : array-like of class labels, one per row of `train_x`

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_classes); column k holds the probability of the
    k-th smallest label, filled in for each row by the fold that held that row out.
    """
    train_y = np.asarray(train_y)
    labels = np.unique(train_y)
    # Ask the splitter for the fold count instead of relying on a global `n_folds`.
    n_splits = skf.get_n_splits()
    stack = np.zeros((train_x.shape[0], len(labels)))
    score_va = 0

    for i, (tr, va) in enumerate(skf.split(train_x, train_y)):
        log('stack: %d/%d' % ((i + 1), n_splits))
        model.fit(train_x[tr], train_y[tr])
        predict_va = model.predict_proba(train_x[va])         # class probabilities for the held-out rows
        # Place each probability column by its label, so a fold that is missing a class
        # (or labels that are not 0..K-1) cannot misalign the columns.
        cols = np.searchsorted(labels, model.classes_)
        stack[np.ix_(va, cols)] += predict_va
        # Predict once and reuse the score for both the log line and the running average.
        fold_score = micro_avg_f1(train_y[va], model.predict(train_x[va]))
        log('va acc:%f' % fold_score)
        score_va += fold_score

    score_va /= n_splits
    log('va avg acc:%f' % score_va)

    return stack

# Stratified 5-fold splitter reused by the stacking cells below
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds)

In [None]:
# Out-of-fold class probabilities from a one-vs-rest logistic regression on the TF-IDF features
lr = LogisticRegression(C=2, n_jobs=-1, solver='sag', multi_class='ovr')
stack_lr = model_cv(lr, skf, train_x, train_y)

# One column per class: tfidf_lr_0, tfidf_lr_1, ...
lr_columns = ['tfidf_lr_{}'.format(k) for k in range(stack_lr.shape[1])]
df_stack = pd.DataFrame(stack_lr, columns=lr_columns)

df_stack.to_csv('./data/lr_prob.csv', index=None, encoding='utf8')

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Out-of-fold class probabilities from Bernoulli naive Bayes on the TF-IDF features
bnb = BernoulliNB()
stack_bnb = model_cv(bnb, skf, train_x, train_y)
df_stack = pd.DataFrame(index=range(len(stack_bnb)))
for i in range(stack_bnb.shape[1]):
    # Use the "bnb" prefix: the copy-pasted "tfidf_svc_" prefix collided with the LinearSVC
    # feature names once all feature files are concatenated.
    df_stack['tfidf_bnb_{}'.format(i)] = stack_bnb[:, i]

df_stack.to_csv('./data/bnb_prob.csv', index=None, encoding='utf8')

In [None]:
# Shape of the most recently built stacking frame
df_stack.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Out-of-fold class probabilities from multinomial naive Bayes on the TF-IDF features
mnb = MultinomialNB()
stack_mnb = model_cv(mnb, skf, train_x, train_y)

# One column per class: tfidf_mnb_0, tfidf_mnb_1, ...
mnb_columns = ['tfidf_mnb_{}'.format(k) for k in range(stack_mnb.shape[1])]
df_stack = pd.DataFrame(stack_mnb, columns=mnb_columns)

df_stack.to_csv('./data/mnb_prob.csv', index=None, encoding='utf8')

In [None]:
from sklearn import svm

# Out-of-fold decision scores from a linear SVM on the TF-IDF features.
svc = svm.LinearSVC(loss='hinge', tol=0.000001, C=0.5, verbose=1, random_state=2020, max_iter=5000)
classes = len(set(train_y))       # number of penalty classes; also read by later cells
n_splits = skf.get_n_splits()
stack = np.zeros((train_x.shape[0], classes))
score_va = 0

for i, (tr, va) in enumerate(skf.split(train_x, train_y)):
    log('stack: %d/%d' % ((i + 1), n_splits))
    svc.fit(train_x[tr], train_y[tr])
    # These are per-class decision scores (signed margins), not probabilities.
    predict_va = svc.decision_function(train_x[va])
    # Predict once and reuse the score for both the log line and the running average.
    fold_score = micro_avg_f1(train_y[va], svc.predict(train_x[va]))
    log('va acc:%f' % fold_score)
    score_va += fold_score
    stack[va] += predict_va

score_va /= n_splits
log('va avg acc:%f' % score_va)
stack_svc = stack
df_stack = pd.DataFrame(index=range(len(stack_svc)))
for i in range(stack_svc.shape[1]):
    df_stack['tfidf_svc_{}'.format(i)] = stack_svc[:, i]

df_stack.to_csv('./data/svc_prob.csv', index=None, encoding='utf8')

In [None]:
# Statistical features from the raw text: use regular expressions to pull every monetary amount
# mentioned in a case text, then aggregate the amounts per case:
# sum, min, max, range (max - min), mean and standard deviation.
# TODO: number of tokens in the case text (length); keywords such as drunk driving or drugs;
#       date and location keywords

import re
import numpy as np

train_raw = pd.read_csv('./data/train.csv', sep='\t')
train_raw = train_raw.reset_index(drop=True)

# Raw strings keep `\d` / `\.` from being read as (invalid) string escapes; the patterns are
# unchanged. The first matches amounts written as "<number>元", the second "<number>万元"
# (scaled by 10,000 below).
yuan_re = re.compile(r'(\d*\.?\d+)元')
wan_yuan_re = re.compile(r'(\d*\.?\d+)万元')


def ptp(amounts):
    """Range (max - min) of one case's amounts; the function name becomes the column name."""
    return amounts.max() - amounts.min()


amt_list = []
for i, (case_id, text) in enumerate(zip(train_raw['ID'], train_raw['text'])):
    if i % 1000 == 1:
        log('iter = %d' % i)
    for a in yuan_re.findall(text):
        amt_list.append([case_id, float(a)])
    for a in wan_yuan_re.findall(text):
        amt_list.append([case_id, float(a) * 10000])
amt_feat = pd.DataFrame(amt_list, columns=['ID', 'amount'])
# String aliases select pandas' own reductions; passing the builtins / numpy callables is deprecated.
amt_feat = amt_feat.groupby('ID')['amount'].agg(['sum', 'min', 'max', ptp, 'mean', 'std']).reset_index()
# Left join so that cases without any matched amount keep a row (their statistics are NaN).
# NOTE(review): every train_raw column other than ID/text survives the drop and is exported with
# an "amt_" prefix - confirm that no label column is among them.
amt_feat = pd.merge(train_raw, amt_feat, how='left', on='ID').drop(['ID', 'text'], axis=1)
amt_feat.columns = ['amt_' + col for col in amt_feat.columns]

amt_feat.to_csv('./data/amt.csv', index=None)

In [None]:
import codecs

############################ Prepare data ############################
# Write one document per line as "_*<row position> <text>".
# The with-block closes (and flushes) the file even if a write fails part-way.
with open('./data/d2v.txt', 'w', encoding='utf8') as doc_f:
    for i, contents in enumerate(df_tr['text']):
        if i % 10000 == 0:
            log('iter = %d' % i)
        doc_f.write(u'_*{} {}\n'.format(i, contents))

class Doc_list(object):
    """Iterable over the tagged documents stored in a "_*<tag> <word> <word> ..." text file.

    Every iteration re-opens the file, so an instance can be iterated more than once.
    Each item is a SentimentDocument(words, tags) with `tags` a one-element list holding
    the integer that follows the "_*" prefix.
    """
    def __init__(self, f):
        # f: path of the text file, one document per line
        self.f = f

    def __iter__(self):
        # The built-in open() treats only \n, \r and \r\n as line ends, and the with-block
        # closes the handle. codecs.open() also split lines on characters such as U+2028 or
        # form feed that can occur inside a document, and its handle was never closed.
        with open(self.f, encoding='utf-8') as fh:
            for line in fh:
                words = line.strip().split(' ')
                tags = [int(words[0][2:])]      # strip the "_*" prefix from the first token
                yield SentimentDocument(words[1:], tags)

In [None]:
### Doc2vec
from collections import namedtuple
from gensim.models import Doc2Vec
from sklearn.model_selection import cross_val_score

SentimentDocument = namedtuple('SentimentDocument', 'words tags')
########################## PV-DBOW Doc2Vec #########################
# Initialise the Doc2Vec model.
# @vector_size: dimensionality of the feature vectors.
# @window: maximum distance between the current and predicted word within a sentence.
# @alpha: the initial learning rate.
# @dm: {1,0}. With dm=1 'distributed memory' (PV-DM) is used,
#      otherwise 'distributed bag of words' (PV-DBOW).
# Each document becomes a 300-dimensional vector.
d2v = Doc2Vec(
    dm=0, vector_size=300, window=30, min_count=3,
    negative=5, hs=0, sample=1e-5,
    alpha=0.025, min_alpha=0.025,
    workers=4,
)
# Build the vocabulary from a stream over the corpus file
doc_list = Doc_list('./data/d2v.txt')
d2v.build_vocab(doc_list)
# Document labels
df_lb = df_tr['penalty']

n_docs = df_tr.shape[0]
for pass_no in range(5):
    log('pass: %d' % pass_no)
    # A fresh stream over the corpus file for every training pass
    doc_list = Doc_list('./data/d2v.txt')
    d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.epochs)
    # Score the current document vectors with a 5-fold cross-validated logistic regression
    X_d2v = np.array([d2v.docvecs[doc_id] for doc_id in range(n_docs)])
    clf = LogisticRegression(C=2, n_jobs=-1, solver='sag', multi_class='ovr')
    scores = cross_val_score(clf, X_d2v, df_lb, cv=5)
    log('d2v-dbow: %s %s' % (scores, np.mean(scores)))
d2v.save('./data/d2v-dbow.model')

In [None]:
########################## PV-DM Doc2Vec #########################

d2v_dm = Doc2Vec(dm=1, vector_size=300, negative=5, hs=0, min_count=3, window=30,
              sample=1e-5, workers=4, alpha=0.025,
              min_alpha=0.025
             )
# Build the vocabulary from a stream over the corpus file
doc_list = Doc_list('./data/d2v.txt')
d2v_dm.build_vocab(doc_list)
# Document labels
df_lb = df_tr['penalty']

for i in range(5):
    log('pass: ' + str(i))
    doc_list = Doc_list('./data/d2v.txt')
    # One more training pass over the corpus
    d2v_dm.train(doc_list, total_examples=d2v_dm.corpus_count, epochs=d2v_dm.epochs)
    # Evaluate the PV-DM vectors themselves; the original read `d2v.docvecs`, i.e. the
    # PV-DBOW model, so the logged scores never reflected this model.
    X_d2v = np.array([d2v_dm.docvecs[doc_id] for doc_id in range(df_tr.shape[0])])
    scores = cross_val_score(LogisticRegression(C=2, n_jobs=-1, solver='sag', multi_class='ovr'),
                              X_d2v, df_lb,
                              cv=5
                             )
    log('doc2vec-dm: ' + str(scores) + ' ' + str(np.mean(scores)))
d2v_dm.save('./data/d2v-dm.model')

使用doc2vec 生成文档的向量特征 直接对向量特征使用逻辑回归拟合  平均得分为0.38

###  使用神经网络对文本的Doc2Vec表示进行拟合

In [None]:
# 定义一个模型
from gensim.models import Doc2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

class Model():
    """Factory for the small feed-forward classifier trained on 300-dimensional document vectors."""
    def __init__(self):
        pass

    def get_model(self):
        """Build and compile a fresh network.

        Architecture: Dense(300, tanh) -> Dropout(0.1) -> Dense(classes, softmax).
        `classes` is read from the notebook's global namespace. The model is compiled with
        categorical cross-entropy, the 'adadelta' optimizer and the 'acc' metric.
        """
        layers = [
            Dense(300, input_shape=(300,), activation='tanh'),
            Dropout(0.1),
            Dense(classes, activation='softmax'),
        ]
        network = Sequential(layers)
        network.compile(optimizer='adadelta',
                        loss='categorical_crossentropy',
                        metrics=['acc'])
        return network

In [None]:
# Build a throwaway model only to print its layer summary
tmp = Model().get_model()
tmp.summary()

In [None]:
from tensorflow.python.client import device_lib
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

def check_device():
    """Print the local devices (CPUs and GPUs) that TensorFlow reports."""
    # Request minimal TensorFlow C++ logging before listing the devices
    os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "99"})
    local_devices = device_lib.list_local_devices()
    print(local_devices)


def assign_device():
    """Expose only GPU 0 to this process and open an InteractiveSession with allow_growth enabled."""
    gpu_env = (
        ("CUDA_DEVICE_ORDER", "PCI_BUS_ID"),
        ("CUDA_VISIBLE_DEVICES", "0"),      # index of the GPU to use
    )
    for key, value in gpu_env:
        os.environ[key] = value
    gpu_config = ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    session = InteractiveSession(config=gpu_config)

# Select the GPU first: CUDA_VISIBLE_DEVICES and the session's GPU options only take effect if
# they are set before TensorFlow enumerates the devices, which check_device() triggers.
assign_device()
check_device()

In [None]:
# 定义keras的数据生成器
from tensorflow import keras
from tensorflow.keras.utils import to_categorical


class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that serves (document vector, label) batches for a subset of rows.

    Parameters
    ----------
    d2v_vectors : object indexable by row id, returning that document's vector
    indexes : sequence of the row ids this generator serves
    y : array indexable by row id, returning that row's target
    batch_size : number of rows per batch (the final batch may be smaller)
    """
    def __init__(self, d2v_vectors, indexes, y, batch_size=32):
        super().__init__()
        self.d2v_vectors = d2v_vectors
        self.indexes = indexes
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        '''Number of batches per epoch.'''
        # ceil rather than floor: the trailing partial batch must be served too, otherwise
        # predicting over a validation generator returns fewer rows than the fold contains.
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __getitem__(self, index):
        '''Return batch number `index` as a pair of arrays (x, y).'''
        start = index * self.batch_size
        # Slicing clamps at the end of the list, so the last batch is simply shorter.
        batch_ids = self.indexes[start:start + self.batch_size]

        # Look up the vector and the target of every row in the batch
        x = np.array([self.d2v_vectors[doc_id] for doc_id in batch_ids])
        y = np.array([self.y[doc_id] for doc_id in batch_ids])

        return x, y

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras

def d2v_nn(docvecs, y, feat, num_class, skf, batch_size, epochs, df_tr):
    """Train the dense network on document vectors fold by fold and return out-of-fold predictions.

    Parameters
    ----------
    docvecs : object indexable by row position, returning that document's vector
    y : one-hot target array of shape (n_samples, n_classes)
    feat : prefix for the output column names and stem of the checkpoint file name
    num_class : kept for interface compatibility; the network width comes from Model.get_model()
    skf : splitter exposing split(X, y) and get_n_splits()
    batch_size, epochs : training settings
    df_tr : training frame providing the `text` and `penalty` columns

    Returns
    -------
    pandas.DataFrame with one column per class, named '<feat>_<class index>', holding the
    prediction made for each row by the fold that held that row out.
    """
    n_splits = skf.get_n_splits()          # instead of the notebook-global `n`
    labels = np.asarray(df_tr.penalty)     # positional access, independent of df_tr's index
    stack_d2v = np.zeros(y.shape)
    score_va = 0

    for i, (tr, va) in enumerate(skf.split(df_tr.text, df_tr.penalty)):
        log('stack %s: %d/%d' % (feat, i+1, n_splits))

        train_gen = DataGenerator(docvecs, tr, y, batch_size=batch_size)
        va_gen = DataGenerator(docvecs, va, y, batch_size=batch_size)

        model = Model().get_model()
        # NOTE(review): the checkpoint path is the same for every fold, and the predictions
        # below come from the final-epoch weights rather than the checkpointed best ones.
        callbacks_list = [
            keras.callbacks.EarlyStopping(
                monitor='val_acc',
                patience=2,
                mode='max'
            ),
            keras.callbacks.ModelCheckpoint(
                filepath= feat + '.h5',
                monitor='val_acc',
                save_best_only=True,
            )]
        # The generators already define the batch size; tf.keras rejects an explicit
        # `batch_size` argument when the input is a Sequence.
        model.fit(train_gen,
                  epochs=epochs,
                  verbose=2,
                  validation_data=va_gen,
                  callbacks=callbacks_list
                 )

        # Predict on every held-out row directly, once, so the result always has len(va) rows
        # regardless of how the generator rounds its batch count.
        x_va = np.array([docvecs[doc_id] for doc_id in va])
        y_pred_va = model.predict(x_va, batch_size=batch_size)
        print(y_pred_va.shape)
        fold_score = micro_avg_f1(labels[va], np.argmax(y_pred_va, axis=-1))
        log('va acc: %f' % fold_score)
        score_va += fold_score
        stack_d2v[va] += y_pred_va

    score_va /= n_splits
    log('va avg acc: %f' % score_va)

    # Build a fresh frame instead of appending to the notebook-global `df_stack`, which still
    # holds another model's columns.
    columns = ['{}_{}'.format(feat, l) for l in range(stack_d2v.shape[1])]
    df_stack = pd.DataFrame(stack_d2v, columns=columns)

    return df_stack

In [None]:
batch_size = 200
epochs = 1000      # upper bound; d2v_nn stops early on val_acc
n = 5

feat = 'd2v_dbow'

# Load the trained Doc2Vec PV-DBOW model
d2v_dbow = Doc2Vec.load('./data/d2v-dbow.model')
dbow_docvecs = d2v_dbow.docvecs
y = to_categorical(df_tr.penalty, classes)

stack_d2v_dbow = d2v_nn(dbow_docvecs, y, feat, classes, skf, batch_size, epochs, df_tr)
# index=False: otherwise the row index is written as a column and read back as a feature
# named "Unnamed: 0".
stack_d2v_dbow.to_csv('./data/d2v-dbow.csv', sep='\t', index=False)

In [None]:
# Load the trained Doc2Vec PV-DM model.
# Use its own feature prefix: with the stale 'd2v_dbow' prefix the PV-DM columns and checkpoint
# file reused the PV-DBOW names.
feat = 'd2v_dm'
d2v_dm_model = Doc2Vec.load('./data/d2v-dm.model')
dm_docvecs = d2v_dm_model.docvecs
y = to_categorical(df_tr.penalty, classes)

stack_d2v_dm = d2v_nn(dm_docvecs, y, feat, classes, skf, batch_size, epochs, df_tr)
# index=False: otherwise the row index is written as a column and read back as a feature
# named "Unnamed: 0".
stack_d2v_dm.to_csv('./data/d2v-dm.csv', sep='\t', index=False)

In [None]:
# Shape of the Doc2Vec stacking frame
stack_d2v_dbow.shape

## Word2Vec 模型

In [None]:
# 训练word2vec词向量
from gensim.models import Word2Vec
from collections import defaultdict

# Tokenise every document on single spaces
documents = df_tr.text.values
texts = [document.split(' ') for document in documents]

# Corpus-wide token counts
frequency = defaultdict(int)
for tokens in texts:
    for token in tokens:
        frequency[token] += 1

# Keep only tokens that occur at least 5 times in the corpus
texts = [
    [token for token in tokens if frequency[token] >= 5]
    for tokens in texts
]

log('Train Model...')
w2v = Word2Vec(texts, size=100, window=5, iter=15, seed=2020)
log('Done')
w2v.save('./data/w2v.model')

#### 对一篇文档中词的Word2vec向量求和再求均值

In [None]:
# Per document: the sum of its token vectors (w2v_feat) and their mean (w2v_feat_avg).
dim = w2v.wv.vector_size        # read the width from the model instead of repeating 100
w2v_feat = np.zeros((len(texts), dim))
w2v_feat_avg = np.zeros((len(texts), dim))
for i, text in enumerate(texts, start=1):
    row = i - 1
    for token in text:
        # Look vectors up through `.wv`; indexing the model object directly is deprecated.
        w2v_feat[row, :] += w2v.wv[token]
    # A document left with no tokens keeps an all-zero mean instead of 0/0 = NaN.
    if text:
        w2v_feat_avg[row, :] = w2v_feat[row, :] / len(text)
    if i % 1000 == 0:
        log('Vectorizing document with W2V %d' % i)

df_w2v = pd.DataFrame(w2v_feat)
df_w2v.columns = ['w2v_' + str(col) for col in df_w2v.columns]
df_w2v.to_csv('./data/w2v_feat.csv', encoding='utf8', index=None)
df_w2v_avg = pd.DataFrame(w2v_feat_avg)
df_w2v_avg.columns = ['w2v_avg_' + str(col) for col in df_w2v_avg.columns]
df_w2v_avg.to_csv('./data/w2v_avg_feat.csv', encoding='utf8', index=None)

log('Save w2v and w2v_avg feat done!')

In [None]:
# Load every feature file written above and join them column-wise into one matrix.
tfidf_lr = pd.read_csv('./data/lr_prob.csv')
tfidf_bnb = pd.read_csv('./data/bnb_prob.csv')
tfidf_mnb = pd.read_csv('./data/mnb_prob.csv')
tfidf_svc = pd.read_csv('./data/svc_prob.csv')
amt = pd.read_csv('./data/amt.csv')
amt = amt.drop(['amt_items'], axis=1)
dbow_nn = pd.read_csv('./data/d2v-dbow.csv', sep='\t')
dm_nn = pd.read_csv('./data/d2v-dm.csv', sep='\t')
# Files written with their row index come back with an "Unnamed: 0" column; it is not a feature.
dbow_nn = dbow_nn.loc[:, ~dbow_nn.columns.str.startswith('Unnamed')]
dm_nn = dm_nn.loc[:, ~dm_nn.columns.str.startswith('Unnamed')]
w2v_sum = pd.read_csv('./data/w2v_feat.csv')
w2v_avg = pd.read_csv('./data/w2v_avg_feat.csv')

# Each block appears exactly once (tfidf_svc used to be listed twice, duplicating its columns).
df = pd.concat([tfidf_lr, tfidf_bnb, tfidf_mnb,
                tfidf_svc, amt,
                dbow_nn, dm_nn, w2v_sum,
                w2v_avg], axis=1
              )
df.shape

In [None]:
import xgboost as xgb
from sklearn.metrics import f1_score

In [None]:
max_depth = 7
min_child_weight = 1
subsample = 0.8
colsample_bytree = 0.8
gamma = 1
lam = 0

n_trees = 10000     # upper bound on boosting rounds
esr = 200           # early-stopping patience, in rounds
evals = 20          # print the evaluation every `evals` rounds

# These two names were used in `params` without ever being defined at notebook level.
num_class = int(df_tr.penalty.nunique())
seed = 2020         # same value as the random_state used for LinearSVC / Word2Vec above

params = {
    'objective': 'multi:softmax',
    'booster': 'gbtree',
    # NOTE(review): 'stratified' looks like an xgb.cv argument rather than a booster
    # parameter - confirm it has any effect here.
    'stratified': True,
    'num_class': num_class,
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
#     'gamma': gamma,
#     'lambda': lam,

    'eta': 0.02,
    'silent': 1,
    'seed': seed,
}

In [None]:
from sklearn.model_selection import train_test_split

# Integer penalty labels for the final model (replaces the one-hot `y` assigned earlier)
y = df_tr.penalty.values

In [None]:
# Fixed seed so the hold-out split (and every score reported below) is reproducible.
# NOTE: this rebinds train_x / train_y, which earlier cells use for the TF-IDF matrix and labels.
train_x, test_x, train_y, test_y = train_test_split(df, y, random_state=2020)

In [None]:
# Train the final XGBoost model on the stacked features and score it on the hold-out split.

import xgboost as xgb


def xgb_micro_f1(preds, dmatrix):
    """Adapter giving micro_avg_f1 the signature xgboost expects from a custom metric.

    xgboost calls the metric as f(predictions, DMatrix) and expects a (name, value) pair;
    micro_avg_f1 itself takes (y_true, y_pred) and returns a bare float.
    """
    labels = dmatrix.get_label().astype(int)
    preds = np.asarray(preds)
    if preds.ndim > 1:
        # per-class scores, one row per sample
        pred_labels = preds.argmax(axis=1)
    elif preds.size != labels.size:
        # per-class scores flattened to one dimension
        pred_labels = preds.reshape(labels.size, -1).argmax(axis=1)
    else:
        # already class ids
        pred_labels = preds.astype(int)
    return 'micro_f1', micro_avg_f1(labels, pred_labels)


dtrain = xgb.DMatrix(train_x, train_y)
dvalid = xgb.DMatrix(test_x, test_y)
watchlist = [(dtrain, 'train'), (dvalid, 'test')]
bst = xgb.train(params, dtrain, n_trees, evals=watchlist, feval=xgb_micro_f1, maximize=True,
                early_stopping_rounds=esr, verbose_eval=evals)
y_pred = bst.predict(dvalid).astype(int)

print(micro_avg_f1(test_y, y_pred))