In [1]:
'''
feature_engineering.py
'''
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, chi2
import warnings
warnings.filterwarnings('ignore')

# Class display names; `experiments` abbreviates them for the classification report.
# NOTE(review): position i presumably corresponds to the label column named str(i)
# in the CSV files -- confirm against the data preparation step before reordering.
labels = [
    '交通肇事',
    '信用卡诈骗',
    '危险驾驶',
    '受贿',
    '合同诈骗',
    '妨害公务',
    '容留他人吸毒',
    '寻衅滋事',
    '开设赌场',
    '引诱、容留、介绍卖淫',
    '抢劫',
    '抢夺',
    '掩饰、隐瞒犯罪所得、犯罪所得收益',
    '故意伤害',
    '故意杀人',
    '故意毁坏财物',
    '敲诈勒索',
    '滥伐林木',
    '生产、销售假药',
    '盗窃',
    '组织、强迫、引诱、容留、介绍卖淫',
    '职务侵占',
    '诈骗',
    '贪污',
    '赌博',
    '走私、贩卖、运输、制造毒品',
    '过失致人死亡',
    '非法拘禁',
    '非法持有、私藏枪支、弹药',
    '非法持有毒品',
]

def vsm_tfidf(max_features=5000, max_df=0.8, analyzer='word', ngram_range=(1,1), k=500):
    """Build chi2-selected TF-IDF features for the train and test CSV files.

    Reads '../data/train.csv' and '../data/test.csv', fits a TfidfVectorizer on
    the train 'words' column, then keeps the `k` columns with the highest chi2
    score against the train 'accusation' column.

    Parameters
    ----------
    max_features, max_df, analyzer, ngram_range
        Passed straight through to TfidfVectorizer.
    k : int
        Number of features SelectKBest keeps. It is clamped to the vocabulary
        size so a small `max_features` does not make the selector raise.

    Returns
    -------
    tuple
        (train, test, train_feature, test_feature, feature_names, labels) where
        train/test are the loaded DataFrames, the two feature matrices are dense
        arrays restricted to the selected columns, feature_names is the list of
        ALL vocabulary terms (before selection) and labels is the module-level
        list of class names.
    """
    print('>>> reading...')
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    print('train size:', train.shape)
    print('test size:', test.shape)

    print('>>> tf idf featuring...')
    if analyzer == 'char':
        # The 'words' column is space separated; drop the spaces so character
        # n-grams do not contain them.
        train['words'] = train['words'].apply(lambda x: x.replace(' ', ''))
        test['words'] = test['words'].apply(lambda x: x.replace(' ', ''))
    tfidfer = TfidfVectorizer(max_features=max_features, max_df=max_df, analyzer=analyzer, ngram_range=ngram_range)
    # Keep the matrices sparse until after selection: only the k surviving
    # columns ever need to be materialised as a dense array.
    train_tfidf = tfidfer.fit_transform(train['words'])
    test_tfidf = tfidfer.transform(test['words'])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(tfidfer, 'get_feature_names_out'):
        feature_names = tfidfer.get_feature_names_out().tolist()
    else:
        feature_names = tfidfer.get_feature_names()
    print('tfidfer feature:', feature_names[:20])
    print('train feature shape:', train_tfidf.shape)
    print('test feature shape:', test_tfidf.shape)

    print('>>> feature selecting...')
    selector = SelectKBest(chi2, k=min(k, train_tfidf.shape[1]))
    train_feature = selector.fit_transform(train_tfidf, train['accusation']).toarray()
    # Apply the fitted selector to the test matrix directly instead of
    # reverse-engineering the kept columns from inverse_transform(); that
    # guarantees train and test always end up with the same columns.
    test_feature = selector.transform(test_tfidf).toarray()
    selected_columns = selector.get_support(indices=True)
    print('selected feature:', [feature_names[i] for i in selected_columns])
    print('train feature selected shape:', train_feature.shape)
    print('test feature selected shape:', test_feature.shape)

    return train, test, train_feature, test_feature, feature_names, labels

In [2]:
'''
classifiers.py
'''
import functools, time
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb

def logtime(func):
    """Decorator that prints the wall-clock duration of every call to `func`.

    The wrapped callable forwards all positional and keyword arguments, prints
    `<func name> time cost: <seconds rounded to 4 places> s` after the call
    returns, and hands back the call's return value unchanged.
    """
    @functools.wraps(func)
    def timed(*args, **kw):
        started = time.time()
        outcome = func(*args, **kw)
        elapsed = time.time() - started
        print(func.__name__, 'time cost:', round(elapsed, 4), 's')
        return outcome
    return timed

class Classifier:
    """Thin wrapper exposing timed `train` / `predict` around `self.model`.

    The base class leaves `self.model` as None; the subclasses in this file
    assign an estimator to it in their own `__init__`.
    """

    def __init__(self):
        # Placeholder only -- train()/predict() need a real estimator here.
        self.model = None

    @logtime
    def train(self, X, y):
        """Fit the wrapped model on (X, y); returns None."""
        self.model.fit(X, y)

    @logtime
    def predict(self, X):
        """Return the wrapped model's `predict_proba(X)` (probabilities, not labels)."""
        probabilities = self.model.predict_proba(X)
        return probabilities

class LR(Classifier):
    """Logistic regression wrapped in a one-vs-rest meta-classifier."""
    name = 'Logistic Regression'

    def __init__(self, C=1.0, penalty='l2', solver='saga'):
        # C, penalty and solver are forwarded unchanged to LogisticRegression.
        base_estimator = LogisticRegression(solver=solver, C=C, penalty=penalty, n_jobs=-1)
        self.model = OneVsRestClassifier(base_estimator)
        
class SVM(Classifier):
    """Support vector classifier wrapped in a one-vs-rest meta-classifier."""
    name = 'SVM'

    def __init__(self, C=1.0, kernel='rbf', probability=True):
        # Classifier.predict() calls predict_proba, hence probability defaults to True.
        base_estimator = SVC(C=C, kernel=kernel, probability=probability)
        self.model = OneVsRestClassifier(base_estimator)
        
class KNN(Classifier):
    """k-nearest-neighbours classifier wrapped in a one-vs-rest meta-classifier."""
    name = 'KNN'

    def __init__(self, n_neighbors=20, metric='euclidean', algorithm='ball_tree'):
        # All three arguments are forwarded unchanged to KNeighborsClassifier.
        base_estimator = KNeighborsClassifier(
            n_neighbors=n_neighbors,
            metric=metric,
            algorithm=algorithm,
            n_jobs=-1,
        )
        self.model = OneVsRestClassifier(base_estimator)
class DT(Classifier):
    """Depth-limited decision tree wrapped in a one-vs-rest meta-classifier."""
    name = 'Decision Tree'

    def __init__(self):
        # Fixed hyper-parameters: depth at most 3, at least 5 samples per leaf.
        shallow_tree = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
        self.model = OneVsRestClassifier(shallow_tree)
        
class NB(Classifier):
    """Gaussian naive Bayes wrapped in a one-vs-rest meta-classifier."""
    # Display name used in the experiment log header. It previously read 'NN',
    # which mislabelled the naive Bayes results.
    name = 'NB'

    def __init__(self):
        self.model = OneVsRestClassifier(GaussianNB())
        
class LGB(Classifier):
    """LightGBM binary classifier wrapped in a one-vs-rest meta-classifier."""
    name = 'LGB'

    def __init__(self):
        # NOTE(review): LightGBM documents that `subsample` only takes effect
        # together with a positive `subsample_freq`; confirm whether row
        # bagging was actually intended here.
        booster = lgb.LGBMClassifier(
            num_leaves=2**5,
            reg_alpha=0.25,
            reg_lambda=0.25,
            max_depth=-1,
            learning_rate=0.05,
            # Was misspelled `min_child_sample`, which LightGBM does not
            # recognise, so the library default was used instead of 5.
            min_child_samples=5,
            n_estimators=2000,
            subsample=0.9,
            colsample_bytree=0.7,
            objective='binary',
            # Replaces `silent=200`: `silent` was a boolean flag (200 is merely
            # truthy) and no longer exists in LightGBM 4; verbose=-1 keeps
            # training quiet on both old and new releases.
            verbose=-1,
        )
        self.model = OneVsRestClassifier(booster)

In [3]:
'''
ml_methods_exp.py
'''
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
# from classifiers import *
# from feature_engineering import *
import warnings
warnings.filterwarnings('ignore')

def avgf1_scorer(x, y):
    """Return the mean of the macro- and micro-averaged F1 scores.

    `x` is passed as the first argument of `f1_score` and `y` as the second.
    """
    macro_f1 = f1_score(x, y, average='macro')
    micro_f1 = f1_score(x, y, average='micro')
    return (macro_f1 + micro_f1) / 2

def experiments(analyzer='word', ngram_range=(1,1), big_threshold=0.32, small_threshold=0.18):
    """Train every classifier on TF-IDF features and print its evaluation report.

    Parameters
    ----------
    analyzer, ngram_range
        Forwarded to `vsm_tfidf` to build the feature matrices.
    big_threshold : float
        Probability cut-off for the frequent classes (those whose train-set
        count lies above the 75th percentile of all class counts).
    small_threshold : float
        Probability cut-off for every other class.

    Returns
    -------
    None. Progress, the per-class classification report and the averaged
    macro/micro F1 score are printed for each classifier.
    """
    print('==================================')
    print('analyzer=%s, ngram_range=%s' %(analyzer, str(ngram_range)))
    print('==================================')

    train, test, train_feature, test_feature, feature_names, labels = vsm_tfidf(analyzer=analyzer, ngram_range=ngram_range)

    # The indicator columns are named '0', '1', ... one per entry of `labels`.
    label_columns = [str(i) for i in range(len(labels))]
    train_y = train[label_columns].values
    test_y = test[label_columns].values

    # Decide which classes count as "big" from the TRAIN label counts. Deriving
    # this from the test labels (as before) leaks ground truth into the
    # decision rule being evaluated.
    class_counts = train_y.sum(axis=0)
    is_big_class = class_counts > np.percentile(class_counts, q=75)
    # One threshold per class; broadcast against the (n_samples, n_classes)
    # probability matrix below.
    thresholds = np.where(is_big_class, big_threshold, small_threshold)

    target_names = [x[:3]+'..\t' for x in labels]

    classifiers = [LR(), SVM(), KNN(), NB(), DT(), LGB()]
    for model in classifiers:
        print('-----------------------%s-----------------------' % model.name)
        print('>>> training...')
        model.train(train_feature, train_y)

        print('>>> predicting...')
        pred_y_prob = model.predict(test_feature)
        pred_y = (pred_y_prob >= thresholds).astype(int)

        print('>>> evaluating...')
        report = classification_report(test_y, pred_y, target_names=target_names)
        print(report)
        score = avgf1_scorer(test_y, pred_y)
        print('avg f1 score:', score)

if __name__ == '__main__':
    # Run the full comparison once with word unigrams, then with character bigrams.
    for run_analyzer, run_ngram_range in (('word', (1, 1)), ('char', (2, 2))):
        experiments(analyzer=run_analyzer, ngram_range=run_ngram_range)

analyzer=word, ngram_range=(1, 1)
>>> reading...
train size: (32171, 36)
test size: (3575, 36)
>>> tf idf featuring...
tfidfer feature: ['一万元', '一个月', '一串', '一事', '一人', '一件', '一份', '一伙', '一刀', '一副', '一包', '一区', '一双', '一只', '一台', '一号', '一同', '一名', '一块', '一处']
train feature shape: (32171, 5000)
test feature shape: (3575, 5000)
>>> feature selecting...
selected feature: ['一包', '一小包', '一张', '一支', '一级', '一部', '万元', '万州区', '万某', '三个', '上报', '不备', '不锈钢', '业务员', '两支', '中信银行', '中国农业银行', '中国工商银行', '中国建设银行', '中国银行', '丹阳市', '为名', '主任', '主管部门', '之机', '乙醇', '书记', '争吵', '事故', '事故现场', '二七区', '二级', '二轮', '交往', '交易', '交通事故', '交通警察', '交通银行', '产品', '亲属', '人体', '人员', '人民币', '人民政府', '人身自由', '介绍', '代某', '以能', '价值', '价格', '价钱', '任意', '任职', '伟哥', '传销', '伤情', '伪造', '位于', '低于', '使用暴力', '侵占', '侵吞', '便利', '俗称', '保健品', '信任', '信函', '信用', '信用卡', '修路', '借款', '债务', '假药', '偿还', '催收', '催缴', '党支部', '公司', '公款', '六合彩', '关照', '内容', '冒用', '农用车', '农药', '冰毒', '净重', '击发', '分行', '分许', '利息', '利用', '利益', '制式', '剥夺', '办理', '动力', '劫取

train time cost: 31.7358 s
>>> predicting...
predict time cost: 2.6512 s
>>> evaluating...
             precision    recall  f1-score   support

     交通肇..	       0.30      0.97      0.46       367
     信用卡..	       0.12      0.90      0.21        59
     危险驾..	       0.51      0.99      0.67       768
      受贿..	       0.20      0.86      0.33        28
     合同诈..	       0.03      0.67      0.06        15
     妨害公..	       0.02      0.93      0.04        41
     容留他..	       0.36      0.99      0.52       154
     寻衅滋..	       0.07      0.89      0.13        88
     开设赌..	       0.12      0.92      0.21        26
     引诱、..	       0.20      0.81      0.31        21
      抢劫..	       0.05      0.94      0.10        65
      抢夺..	       0.06      0.83      0.12        24
     掩饰、..	       0.02      0.88      0.04        24
     故意伤..	       0.53      0.98      0.69       467
     故意杀..	       0.04      0.75      0.07        20
     故意毁..	       0.03      0.76      0.06        17
     敲诈

tfidfer feature: ['.万', '.上', '.元', '.克', '.公', '.属', '.年', '.毫', '.立', '.系', '.被', '.醉', '一个', '一事', '一人', '一件', '一份', '一包', '一医', '一只']
train feature shape: (32171, 5000)
test feature shape: (3575, 5000)
>>> feature selecting...
selected feature: ['.万', '.元', '.克', '.立', '一包', '一小', '一张', '一支', '一部', '万元', '万某', '不备', '丙胺', '业务', '业局', '业银', '中乙', '中信', '中国', '中容', '中酒', '丹阳', '为己', '主任', '之机', '乙醇', '书记', '争吵', '争执', '事故', '二级', '二轮', '亡交', '交易', '交警', '交通', '产品', '亲属', '人代', '人吸', '人员', '人死', '人轻', '人近', '介绍', '代某', '价值', '价格', '份有', '伐林', '伐许', '众赌', '伤二', '伤力', '伤害', '伤情', '伤死', '伤程', '伤轻', '伤鉴', '伪造', '似毒', '似物', '体损', '体致', '供赌', '便利', '俗称', '保健', '信任', '信用', '信银', '借款', '债务', '值人', '值元', '假药', '偿还', '催收', '元价', '克检', '克毒', '克甲', '公务', '公司', '六合', '内容', '冰毒', '净重', '出乙', '出所', '出甲', '分行', '利人', '利息', '利用', '利益', '利目', '办理', '务便', '动力', '动车', '勤民', '包净', '包毒', '医鉴', '卖吸', '卖毒', '卖淫', '博抽', '博机', '占为', '卡一', '卡卡', '卡号', '卡透', '卡银', '原油', '厮打', '参赌', '发卡', '发射', '发生', '发还', '取利', 

train time cost: 34.412 s
>>> predicting...
predict time cost: 2.6151 s
>>> evaluating...
             precision    recall  f1-score   support

     交通肇..	       0.35      0.98      0.52       367
     信用卡..	       0.16      0.92      0.28        59
     危险驾..	       0.57      0.99      0.73       768
      受贿..	       0.17      0.82      0.28        28
     合同诈..	       0.03      0.80      0.06        15
     妨害公..	       0.03      0.88      0.05        41
     容留他..	       0.38      0.98      0.55       154
     寻衅滋..	       0.07      0.88      0.13        88
     开设赌..	       0.16      0.92      0.28        26
     引诱、..	       0.24      0.90      0.38        21
      抢劫..	       0.05      0.95      0.10        65
      抢夺..	       0.06      0.83      0.11        24
     掩饰、..	       0.03      0.96      0.06        24
     故意伤..	       0.64      0.99      0.77       467
     故意杀..	       0.04      0.75      0.07        20
     故意毁..	       0.03      0.71      0.05        17
     敲诈勒