In [None]:
import torch as t
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Parameter
import numpy as np

class NEG_loss(nn.Module):
    """
    Skip-gram negative-sampling loss that owns its input and output embeddings.

    Loss defined in Mikolov et al., "Distributed Representations of Words and
    Phrases and their Compositionality":
    papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf
    """

    def __init__(self, num_classes, embed_size, weights=None):
        """
        :param num_classes: An int. The number of possible classes.
        :param embed_size: An int. Embedding lookup size.
        :param weights: A sequence or numpy array of non negative floats, one
            per class, used as (unnormalised) sampling weights for the noise
            classes. None if using uniform sampling. The weights are
            calculated prior to estimation and can be of any form,
            e.g. equation (5) in the paper above.
        """
        super(NEG_loss, self).__init__()

        self.num_classes = num_classes
        self.embed_size = embed_size

        self.out_embed = nn.Embedding(self.num_classes, self.embed_size, sparse=True)
        self.out_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1))

        self.in_embed = nn.Embedding(self.num_classes, self.embed_size, sparse=True)
        self.in_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1))

        self.weights = weights
        if self.weights is not None:
            assert min(self.weights) >= 0, "Each weight should be >= 0"

            # np.asarray lets callers pass a plain list as well as an ndarray.
            self.weights = t.from_numpy(np.asarray(weights)).float()

    def sample(self, num_sample):
        """
        Draws `num_sample` class ids (with replacement) according to
        `self.weights`. Only valid when weights were given to the constructor.
        """
        return t.multinomial(self.weights, num_sample, True)

    def forward(self, input_labes, out_labels, num_sampled):
        """
        :param input_labes: Tensor with shape of [batch_size] (or [batch_size, 1]) of Long type
        :param out_labels: Tensor with shape of [batch_size, window_size] of Long type
        :param num_sampled: An int. The number of sampled from noise examples
        :return: Scalar loss tensor: the negated sum of the per-pair
            log-likelihoods divided by batch_size.
        """

        use_cuda = self.out_embed.weight.is_cuda
        [batch_size, window_size] = out_labels.size()
        pair_count = batch_size * window_size

        # Repeat every centre label `window_size` times *in place*
        # (l0, l0, ..., l1, l1, ...) so that row i lines up with the row-major
        # flattening of out_labels. A bare repeat(1, window_size) on a 1-D
        # tensor tiles the whole batch instead and pairs centre words with the
        # context words of other rows.
        center_ids = input_labes.contiguous().view(-1, 1).repeat(1, window_size).view(-1)
        center = self.in_embed(center_ids)
        context = self.out_embed(out_labels.contiguous().view(-1))

        if self.weights is not None:
            noise = self.sample(pair_count * num_sampled).view(pair_count, num_sampled)
        else:
            # randint's upper bound is exclusive, so every class id in
            # [0, num_classes) can be drawn as a negative example.
            noise = t.randint(0, self.num_classes, (pair_count, num_sampled))

        if use_cuda:
            noise = noise.cuda()
        noise = self.out_embed(noise).neg()

        # logsigmoid is the numerically stable form of sigmoid().log().
        log_target = nn.functional.logsigmoid((center * context).sum(1))

        # [pairs, num_sampled, embed_size] @ [pairs, embed_size, 1]
        #   -> [pairs, num_sampled, 1] -> sum over samples -> [pairs]
        sum_log_sampled = nn.functional.logsigmoid(
            t.bmm(noise, center.unsqueeze(2))).sum(1).squeeze(1)

        loss = log_target + sum_log_sampled
        return -loss.sum() / batch_size

    def input_embeddings(self):
        """Returns the input embedding matrix as a numpy array on the CPU."""
        return self.in_embed.weight.data.cpu().numpy()

In [None]:
class NegativeSamplingLoss(nn.Module):
    """Negative-sampling loss computed from already looked-up embedding vectors."""

    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        """
        :param input_vectors: [batch_size, embed_size] embeddings of the input words
        :param output_vectors: [batch_size, embed_size] embeddings of the true context words
        :param noise_vectors: [batch_size, n_samples, embed_size] embeddings of the noise words
        :return: scalar tensor, the mean over the batch of
            -(log sigmoid(out . in) + sum_k log sigmoid(-noise_k . in))
        """
        batch_size, embed_size = input_vectors.shape
        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # bmm = batch matrix multiplication
        # log-sigmoid term for the correct (observed) context word;
        # logsigmoid is the numerically stable form of sigmoid().log()
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors))
        # [batch_size, 1, 1] -> [batch_size]; an explicit view keeps the batch
        # dimension even when batch_size == 1
        out_loss = out_loss.view(batch_size)

        # log-sigmoid term for the noise (negative) words
        noise_loss = nn.functional.logsigmoid(torch.bmm(noise_vectors.neg(), input_vectors))
        # [batch_size, n_samples, 1] -> [batch_size]; squeeze only the trailing
        # dimension so batch_size == 1 or n_samples == 1 cannot drop the wrong axis
        noise_loss = noise_loss.squeeze(2).sum(1)

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return -(out_loss + noise_loss).mean()

In [3]:
import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import copy
# from graphviz import Digraph
from torch.nn import init
from torch.autograd import Variable
import torch.nn.functional as F
import time
import random
from sklearn.metrics import f1_score
from collections import defaultdict
from ClfTemplates import get_all_template_path, sample_template, sample_template_path
from ClfUse import showClfResult, get_samples, labeledDataFromJson
from Ner_model import get_X_Y, lstm_crf, predict, preprocess
from Ner_data_make import create_fake_label, fake2real
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, SpatialDropout1D,concatenate, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras.utils import to_categorical
import keras
from collections import Iterable 
import pandas as pd
import numpy as np
import random
import re
import warnings
warnings.filterwarnings("ignore")


%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [4]:
import pandas as pd
import random

def gen_new(ori_str):
    """Return `ori_str` with every digit character swapped for a random digit 0-9.

    Non-digit characters are copied through unchanged, so the result has the
    same length and layout as the input.
    """
    pieces = []
    for ch in ori_str:
        if ch.isdigit():
            pieces.append(str(random.randint(0, 9)))
        else:
            pieces.append(ch)
    return ''.join(pieces)

def create_fake_label(DF, fake_times):
    """Add `fake_times` synthetic token columns to a copy of `DF`.

    Column `new_tokens_<i>` holds, for every row, either the original token
    (rows whose `lable` is 'other') or the token with its digits re-drawn by
    `gen_new` (all other rows). `DF` itself is left untouched.
    """
    def _randomise_if_labelled(row):
        if row.lable != 'other':
            return gen_new(row.token)
        return row.token

    augmented = DF.copy()
    for round_idx in range(fake_times):
        fake_tokens = augmented.apply(_randomise_if_labelled, axis=1)
        augmented['new_tokens_{}'.format(round_idx)] = fake_tokens
    return augmented

def generate_sent_id(sent_id_list, origin_sent_id):
    """Map each original sentence id to a fresh 'send_id_<n>' string.

    One number is consumed from the END of `sent_id_list` for every element of
    `origin_sent_id` (the list is shortened in place); when an original id
    repeats, the first mapping is kept but a number is still consumed.

    Returns the (now shorter) `sent_id_list` and the mapping dict.
    """
    mapping = {}
    for original in origin_sent_id:
        fresh = 'send_id_' + str(sent_id_list[-1])
        del sent_id_list[-1]
        if original not in mapping:
            mapping[original] = fresh
    return sent_id_list, mapping

def _relabel_fake_copies(df, fake_times, kept_columns):
    """Stack the `fake_times` synthetic token columns of `df` into one long frame.

    For every i in range(fake_times) the column `new_tokens_<i>` is taken
    together with `kept_columns` (which must include 'sent_id'), renamed to
    'token', and its sentence ids are replaced by fresh ids from
    `generate_sent_id`, so each synthetic copy of a sentence gets its own id.
    """
    origin_sent_id = df.sent_id.unique()
    sent_id_list = list(range(len(origin_sent_id) * fake_times))
    copies = []
    for i in range(fake_times):
        # Explicit copy: the frame is modified below and must not be treated
        # as a view of `df` (avoids pandas' SettingWithCopy ambiguity).
        tmp_df = df[['new_tokens_{}'.format(i)] + kept_columns].copy()
        tmp_df.columns = ['token'] + kept_columns
        sent_id_list, dic = generate_sent_id(sent_id_list, origin_sent_id)
        tmp_df['sent_id'] = tmp_df['sent_id'].map(dic)
        copies.append(tmp_df)
    return pd.concat(copies)


def fake2real(df, fake_times):
    '''
    Turn the synthetic token columns into real rows, keeping the token label.

    Returns a frame with columns ['token', 'lable', 'sent_id'].
    The column name 'lable' is a misspelling of 'label' that is kept on purpose.
    '''
    return _relabel_fake_copies(df, fake_times, ['lable', 'sent_id'])


def fake2real_v2(df, fake_times):
    '''
    Turn the synthetic token columns into real rows, keeping the sentence class.

    Returns a frame with columns ['token', 'sent_id', 'cls'].
    '''
    return _relabel_fake_copies(df, fake_times, ['sent_id', 'cls'])

### 1. sms 文件

In [3]:
import pickle
import os
from IPython.display import display
import itertools
# /home/qibo/sms_nlp
res_root = '/home/qibo/sms_nlp/annotation_results/round_2_results_bank/'
# res_root = '/home/qibo/work_notebooks/sms_nlp/annotation_results/round_2_results_loan/'

# ALL collects one record per fully annotated regex group:
#   [sms text, field substrings, field types, {field substring: field type}, sms type]
ALL = []
for idx,fn in enumerate(os.listdir(res_root)):
    print(fn)
    despatcher = fn.split('.')[0]
    print(despatcher)
    # SECURITY: pickle.load can execute arbitrary code; only point res_root
    # at annotation files from a trusted source.
    with open(os.path.join(res_root, fn), 'rb') as fin:
        tasks = pickle.load(fin)

    # NOTE(review): itertools.groupby only merges *consecutive* items, so this
    # assumes tasks sharing a regex id are adjacent in the pickle - confirm.
    for key, group in itertools.groupby(tasks, key=lambda x:x.output[0]):
        regex_id = key
        to_display = None
        sms_type = None
        fields = []
        field_types = []
        # The loop variable is deliberately not named `t`: the first cell binds
        # `t` to torch (`import torch as t`) and NEG_loss resolves that global
        # at call time, so rebinding it here would break the loss.
        for task in group:
            output = task.output
            val = task.value
            regex_id = output[0]
            group_id = output[1]
            gened_regex = output[2]
            box_markups = output[3]
            if group_id == 0:
                # Task 0 of a group carries the SMS type and the marked-up text.
                sms_type = val
                if sms_type:
                    print(regex_id, gened_regex)
                    print('短信类型: ', sms_type)
                to_display = box_markups[0][0]
                text = box_markups[0][0].text
                spans = box_markups[0][0].spans
                for s in spans:
                    fields.append(text[s.start : s.stop])
            else:
                # Every other task carries the type of one marked field.
                field_types.append(val)
        print(field_types)
        # Keep the group only when every field was given a type.
        if None not in field_types:
            tem = {}
            for i , j in zip(fields, field_types):
                tem[i] = j
            # NOTE(review): `text` is not reset per group, so a group without a
            # group_id == 0 task reuses the previous group's text - confirm
            # this cannot happen in the data.
            # NOTE(review): new_text is built but not used in this cell.
            new_text = 'Reg_id_' + str(regex_id) + ' ' + text
            ALL.append([text, fields ,field_types, tem, sms_type])
        print('------------------------------------------------------')

indusb.pkl
indusb
3764 ^Dear Customer Txn on IndusInd Bank Credit Card no .* for INR .* on .* at .* is .* call 18602677777 for query\.Click on http\:\/\/bit\.ly\/2B8nSNl to update your Aadhaar Number\, ignore if already done\.$
短信类型:  交易流水＿转账
['银行卡号＿自己', '金额＿转出', '日期＿交易时间', '机构＿交易平台', 'other']
------------------------------------------------------
3765 ^Your IndusInd Bank A\/C .* has been debited for .* towards your Debit Card purchase\.The available Balance is .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '金额＿余额']
------------------------------------------------------
3766 ^Your a\/c no\. .* is credited by .* on .* by a\/c linked to mobile .* \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
3767 ^Your IndusInd Bank A\/C .* has been debited for .* towards an ATM Cash Withdrawal Transaction\. Available Balance is .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '金额＿余额']
------------------------------------------

['金额＿转出', '机构＿交易平台', '银行卡号＿自己', 'other']
------------------------------------------------------
9354 ^Dear Customer\, your Account .* has been credited with INR .* on .* Info\: .* The Available Balance is INR .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', 'other', '金额＿余额']
------------------------------------------------------
[None, None, None, None]
------------------------------------------------------
9356 ^Dear Customer\, acct .* has been debited for .* on .* towards linked .* UPI Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '机构＿交易平台', 'other']
------------------------------------------------------
['账户账号＿自己', '金额＿转出', None, None]
------------------------------------------------------
9358 ^Dear Customer\, acct .* is credited with .* on .* from .* UPI Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', 'other', 'other']
------------------------------------------------------
9359 ^Dear Customer\, txn of INR .* using Credit Card .* done at .* on .* Avbl Cr lmt

------------------------------------------------------
5151 ^Dear Customer\,Please make Payment of Rs\. .* due in your A\/c .* with Union Bank at Branch .* immediately\.Kindly Ignore .*$
短信类型:  信用卡＿逾期警告
['金额＿应还金额', '账户账号＿自己', 'other', 'other']
------------------------------------------------------
5153 ^Bank is replacing your existing magstripe debit card with EMV card\. Please collect your new debit card from the branch .* as old card will be blocked on .*$
短信类型:  账号异常＿卡号冻结
['other', '日期＿交易时间']
------------------------------------------------------
5154 ^Your a\/c no\. .* is debited for .* on .* and a\/c .* credited \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
5156 ^Your .* A\/c .* in .* Clg\. Credited Rs\. .* on .* by Clearing\. Bal is Rs\. .* Subject To Chq .* CODE FNP15UBI FOR FERNSNPETALS$
短信类型:  交易流水＿转账
['other', '账户账号＿自己', '机构＿交易平台', '金额＿转入', '日期＿交易时间', '金额＿余额', 'other']
--------------

9947 ^Hello\! You have initiated to add .* to your account .* Please keep the txn id .* for future reference$
短信类型:  交易流水＿转账
['金额＿转入', '账户账号＿自己', 'other']
------------------------------------------------------
[None, None, None, None]
------------------------------------------------------
9949 ^Your a\/c no\. .* is debited for .* on .* and credited to .* \(UPI Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
9953 ^SUCCESSFUL transfer of Rs .* to .* using Airtel Payments Bank transfer by \- .* ID\: .* Charges \- Max .*$
短信类型:  交易流水＿转账
['金额＿转出', '机构＿交易平台', '账户账号＿他人', 'other', 'other']
------------------------------------------------------
9957 ^Hello\! You have added .* in your Airtel Payments Bank A\/C\. Avl Bal .* Txn ID\. .*$
短信类型:  交易流水＿转账
['金额＿转入', '金额＿余额', 'other']
------------------------------------------------------
9960 ^You have paid .* using your Online Card at .* IND\. Avl Bal .* Txn ID .* Co

[None, None, None, None, None, None]
------------------------------------------------------
[None, None, None, None, None, None]
------------------------------------------------------
[None, None, None, None, None, None]
------------------------------------------------------
1806 ^Your a\/c no\. .* is debited for .* on .* and a\/c .* credited \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
1807 ^Cash deposit of INR .* on .* made in A\/c .* at PMC Bank\. Clr Bal\:INR .* Eff Avail Bal\:INR .*$
短信类型:  交易流水＿转账
['金额＿转入', '日期＿交易时间', '账户账号＿自己', '金额＿信用额度', '金额＿余额']
------------------------------------------------------
1808 ^Your a\/c no\. .* is credited for .* on .* by a\/c linked to mobile .* \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
1811 ^Your a\/c no\. .* is debited for .* on .* and credited to VPA .* \

6733 ^Avbl Bal in your A\/C\. .* on .* is INR .* Tot\. avbl bal \(including linked deposits and Limit\) is INR .*$
短信类型:  交易流水＿余额
['账户账号＿自己', '日期＿交易时间', '金额＿余额', '金额＿信用额度']
------------------------------------------------------
6736 ^OTP for txn of INR .* at .* on your YES BANK Debit Card ending .* is .* and valid for 5 mins\. Do not share the OTP with anyone\.$
短信类型:  交易流水＿转账
['金额＿转出', '机构＿交易平台', '银行卡号＿自己', 'other']
------------------------------------------------------
6737 ^Your a\/c no\. .* is debited for .* on .* and a\/c .* credited \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
[None, None, None, None, None]
------------------------------------------------------
6739 ^Hi .* INR .* added to your YES PAY Wallet on .* through a BANK Account \(Txn Ref \- .* Balance\: INR .*$
短信类型:  交易流水＿转账
['other', '金额＿转入', '日期＿交易时间', 'other', '金额＿余额']
------------------------------------------------------


12532 ^Your a\/c .* is debited on .* by INR .* towards .* Avl .* INR .* For more details login to m\.sc\.com\/in \- StanChart$
短信类型:  交易流水＿转账
['账户账号＿自己', '日期＿交易时间', '金额＿转出', 'other', 'other', '金额＿余额']
------------------------------------------------------
12534 ^Your a\/c no\. .* is debited for Rs\. .* on .* and credited to a\/c no\. .* \(UPI Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
12535 ^Your a\/c no\. .* is credited by INR .* on .* by a\/c linked to mobile .* \(IMPS Ref no .* Avl\. Bal\: INR .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '账户账号＿他人', 'other', '金额＿余额']
------------------------------------------------------
12537 ^Your a\/c no\. .* is debited for INR .* on .* and a\/c .* credited \(IMPS Ref no .* \- StanChart$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
12538 ^Your account .* has been debited on .* 

[None, None, None, None, None, None]
------------------------------------------------------
[None, None, None, None]
------------------------------------------------------
10912 ^Your A\/c .* is debited with INR .* on .* A\/c Bal is INR .* Info\: .* Call 18605005555 \(if in India\) if you have not done this transaction\.$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '金额＿余额', 'other']
------------------------------------------------------
10913 ^Hello\! Your A\/c no\. .* has been credited with Rs\. .* on .* The A\/c balance is Rs\. .* Info\: .* Call 18605005555 \(if in India\) if you have not done this transaction\.$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '金额＿余额', 'other']
------------------------------------------------------
10917 ^Your a\/c no\. .* is debited for .* on .* and a\/c of .* has been credited \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', 'other', 'other']
------------------------------------------------------
10918 ^Balance in savings a\/c .* 

[None, None, None, None, None]
------------------------------------------------------
12926 ^You have made a purchase for Rs\. .* on .* on .* using IDFC Debit Card\. New balance is .* For raising a dispute\, Please call 1800 419 4332\.$
短信类型:  交易流水＿转账
['金额＿转出', '日期＿交易时间', '机构＿交易平台', '金额＿余额']
------------------------------------------------------
12927 ^Your a\/c no\. .* is debited for .* on .* and a\/c .* credited \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
12928 ^Your a\/c no\. .* is credited by .* on .* by a\/c linked to mobile .* \(IMPS Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
[None, None, None]
------------------------------------------------------
12934 ^Beneficiary account no\. .* has been successfully credited with Rs .* Kindly note unique reference number .* for future references$
短信类型:  交易流水＿

[None, None, None, None, None]
------------------------------------------------------
3912 ^Your A\/C .* Credited INR .* on .* \-Deposit of Cash at .* ATM\. A\/c Balance INR .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', 'other', '金额＿余额']
------------------------------------------------------
3913 ^Your A\/C .* Credited INR .* on .* \-Deposit by transfer from .* A\/C Bal INR .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', 'other', '金额＿余额']
------------------------------------------------------
3914 ^Your A\/C .* has a credit by .* of Rs .* on .* Avl Bal Rs .* Download YONO \@ www\.yonosbi\.com$
短信类型:  交易流水＿转账
['账户账号＿自己', 'other', '金额＿转入', '日期＿交易时间', '金额＿余额']
------------------------------------------------------
[None, None, None, None]
------------------------------------------------------
3916 ^Your A\/C .* Debited INR .* on .* \-Transferred to INVESTMENT INTERMEDI\. A\/C Balance INR .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '金额＿余额']
----------------------------------

5174 ^A charge of Rs\. .* has been initiated on your RBL Credit Card ending with .* at .* on .* at .* IST\. Your available limit is Rs\. .*$
短信类型:  交易流水＿转账
['金额＿转出', '银行卡号＿自己', '机构＿交易平台', '日期＿交易时间', 'other', '金额＿信用额度']
------------------------------------------------------
5176 ^INR .* debited from a\/c .* on .* by use of Card no ending .* at .* Avail Bal .* For assistance call 18001238040$
短信类型:  交易流水＿转账
['金额＿转出', '账户账号＿自己', '日期＿还款日期', '银行卡号＿自己', '机构＿交易平台', '金额＿余额']
------------------------------------------------------
5177 ^Dear .* Transfer of .* to .* has been successful with REF .*$
短信类型:  交易流水＿转账
['other', '金额＿转出', '机构＿交易平台', 'other']
------------------------------------------------------
5180 ^Your a\/c no\. .* is debited for .* on .* and credited to a\/c no\. .* \(UPI Ref no .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
5182 ^Dear Cardmember\, payment of .* has been received towards your RBL Bank Cr

3224 ^Rs .* debited from a\/c .* on .* to VPA .* Ref No .* Not you\? Call on .* to report$
短信类型:  交易流水＿转账
['金额＿转出', '账户账号＿自己', '日期＿交易时间', 'other', 'other', 'other']
------------------------------------------------------
3226 ^Rs\. .* credited to a\/c .* on .* by a\/c linked to VPA .* \(UPI Ref No .*$
短信类型:  交易流水＿转账
['金额＿转入', '账户账号＿自己', '日期＿交易时间', 'other', 'other']
------------------------------------------------------
3229 ^Rs\. .* debited from a\/c .* on .* to a\/c .* \(UPI Ref No\. .* Not you\? Call on .* to report$
短信类型:  交易流水＿转账
['金额＿转出', '账户账号＿自己', '日期＿交易时间', '账户账号＿他人', 'other', 'other']
------------------------------------------------------
3231 ^.* was withdrawn using your HDFC Bank Card ending .* on .* at .* Avl bal\: .*$
短信类型:  交易流水＿转账
['金额＿转出', '银行卡号＿自己', '日期＿交易时间', '机构＿交易平台', '金额＿余额']
------------------------------------------------------
3232 ^.* was spent on ur HDFCBank CREDIT Card ending .* on .* at .* bal \- .* curr o\/s \- .*$
短信类型:  交易流水＿转账
['金额＿转出', '银行卡号＿自己', '日期＿交易时

143 ^.* is your SECRET One Time Password \(OTP\) for payment of Rs\. .* to .* via NetBanking\. Do not share it with anyone\.$
短信类型:  交易流水＿转账
['other', '金额＿转出', '机构＿交易平台']
------------------------------------------------------
144 ^.* is your SECRET One Time Password \(OTP\) to transfer Rs\. .* from A\/c ending in .* to .* Ref .* Do not share it with anyone\.$
短信类型:  交易流水＿转账
['other', '金额＿转出', '银行卡号＿自己', 'other', 'other']
------------------------------------------------------
145 ^.* was spent on ur HDFCBank CREDIT Card ending .* on .* at .* bal \- .* curr o\/s \- .*$
短信类型:  交易流水＿转账
['金额＿转出', '银行卡号＿自己', '日期＿交易时间', 'other', '金额＿余额', 'other']
------------------------------------------------------
147 ^Rs\. .* debited from a\/c .* on .* to a\/c .* \(UPI Ref No\. .* Not you\? Call on 18002586161 to report$
短信类型:  交易流水＿转账
['金额＿转出', '账户账号＿自己', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
148 ^.* was withdrawn using your HDFC Bank Card ending .* on .* a

['账户账号＿自己', None, None, None, None, None]
------------------------------------------------------
[None, None, None, None, None]
------------------------------------------------------
4026 ^Your A\/c No .* has been debited by Rs\. .* on .* via .* A\/c No .* Bal is Rs\. .* CR and AVL Bal is Rs\. .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', 'other', 'other', '金额＿余额', '金额＿信用额度']
------------------------------------------------------
4028 ^Your A\/c\. .* is credited by Rs\. .* on .* by A\/c linked to mobile .* \( IMPS Ref no\. .* \)\.$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', '账户账号＿他人', 'other']
------------------------------------------------------
4030 ^Your A\/c No .* has been credited by .* on .* by IMPS .* from .* A\/c Bal is Rs\. .* CR and AVL Bal is Rs\. .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转入', '日期＿交易时间', 'other', '机构＿交易平台', '金额＿余额', '金额＿信用额度']
------------------------------------------------------
4032 ^Your A\/C .* Balance is\: .* CR\;Available Balance is\: .*$
短信类型:  交易

[None, None, None, None, None]
------------------------------------------------------
[None, None, None, None, None, None, None]
------------------------------------------------------
5801 ^Your a\/c no .* is debited for .* on .* and a\/c .* of .* is credited \(IMPS Ref no .* \)\.$
短信类型:  交易流水＿转账
['账户账号＿自己', '金额＿转出', '日期＿交易时间', '账户账号＿他人', 'other', 'other']
------------------------------------------------------
5803 ^Dear Customer\, Your request for IMPS payment of Rs\. .* for the beneficiary IFSC .* a\/c .* has been .* From CorpBank$
短信类型:  交易流水＿转账
['金额＿转出', 'other', '账户账号＿自己', 'other']
------------------------------------------------------
5805 ^Your A\/c .* in .* credited INR .* on .* by CASH\. Avl Bal is .*$
短信类型:  交易流水＿转账
['账户账号＿自己', '机构＿交易平台', '金额＿转入', '日期＿交易时间', '金额＿余额']
------------------------------------------------------
[None, None, None, None, None]
------------------------------------------------------
5814 ^Your A\/c .* in .* debited INR .* on .* by CASH\. Avl Bal is .*$


In [4]:
# Build one token-level frame per annotated SMS and concatenate them once.
# (DataFrame.append inside the loop copied the whole frame on every iteration
# and no longer exists in pandas 2.x.)
sentence_frames = []
for tt in ALL:
    tokens = tt[0].split(' ')
    DF_tmp = pd.DataFrame({'token': tokens})
    # tt[3] maps a field substring to its field type; tokens that are not a
    # field stay NaN here and become 'other' below.
    DF_tmp['lable'] = DF_tmp.token.apply(lambda x: tt[3][x] if x in tt[3] else np.nan)
    DF_tmp['cls'] = [tt[4]] * len(tokens)
    sentence_frames.append(DF_tmp)

if sentence_frames:
    DF = pd.concat(sentence_frames)
else:
    DF = pd.DataFrame(columns=['token', 'lable', 'cls'])

# Drop sentences without an SMS type, then label every non-field token 'other'.
DF = DF[~DF.cls.isnull()]
DF = DF.fillna(value='other')

# Each per-sentence frame keeps its own 0..n-1 index, so index 0 marks the
# first token of a sentence. A sentence id is 'sent_<row position of that
# first token>'; the running maximum carries it forward to the other tokens.
positions = np.arange(len(DF))
sentence_start = np.maximum.accumulate(np.where(DF.index == 0, positions, 0))
DF['sent_id'] = ['sent_{}'.format(p) for p in sentence_start]

DF = DF[['token', 'lable', 'sent_id', 'cls']]
DF = DF[DF.cls != False]
DF.shape

(29300, 4)

### 唯一测试集；

In [5]:
# Token-level test set: the first CSV column is dropped, the rest are named.
Test_DF = pd.read_csv('gsmTst100.csv', header=None).iloc[:, 1:]
Test_DF.columns = ['token', 'lable', 'sent_id', 'cls']

# Collapse tokens to one row per sentence: lower-cased text plus the class of
# the sentence's first token.
test_sentences = Test_DF.groupby('sent_id')
sms_te = test_sentences.apply(lambda grp: ' '.join(grp.token.tolist()).lower())
cls_te = test_sentences.apply(lambda grp: grp.cls.tolist()[0])

df3_te = pd.DataFrame(columns=['sms', 'cls'])
df3_te['sms'] = sms_te
df3_te['cls'] = cls_te

df3_te.shape


(100, 2)

In [6]:
# Remove from the training frame every row whose sent_id also occurs in the
# test set, so no test sentence is used for training.
# NOTE(review): this rebinds DF, so the unfiltered frame is gone after the cell
# runs; it also relies on the sent_id values in gsmTst100.csv matching the ids
# generated in this session - confirm.
DF = DF[~DF.sent_id.isin(Test_DF.sent_id)]
DF.shape

(26883, 4)

In [7]:
## Fabricate `dupli` synthetic copies of every training sentence
dupli = 10
# Drop rows whose sentence class is False before augmenting.
DF = DF[DF.cls != False]
# df: DF plus one new_tokens_<i> column per copy; df2: those columns stacked
# into rows with fresh sentence ids.
df = create_fake_label(DF, dupli)
df2 = fake2real_v2(df, dupli)

In [8]:
# 过采样欠采样；2bDone

In [9]:
def get_cos_similarity(sms, templates):
    '''Return the cosine similarity between one SMS vector and every template vector.

    The result is a list with one value per element of `templates`, in order.
    '''
    similarities = []
    for template in templates:
        cosine = dot(template, sms) / (norm(template) * norm(sms))
        similarities.append(cosine)
    return similarities


def tfIdfVector(corpus):
    '''
    Turn a list of sentences into a dense TF-IDF matrix.

    corpus is a list of sentences:
    ['This is an example', 'hello world', ...]

    Returns a 2-D array with one row per sentence, obtained by counting terms
    with CountVectorizer and re-weighting the counts with TfidfTransformer
    (both with their default settings).
    '''
    term_counts = CountVectorizer().fit_transform(corpus)
    weighted = TfidfTransformer().fit_transform(term_counts)
    return weighted.toarray()

# def newAndTemplate(template_df, new_sms, thresh=None):
#     '''
#     new_sms: test samples
#     '''
#     corpus, labels = template_df.sms.tolist(), template_df.label.tolist()
#     num_sms = len(new_sms)
#     for single in new_sms:
#         corpus.append(single.lower())
#     all_tfidf = tfIdfVector(corpus)
#     template_tfidf = all_tfidf[:-num_sms]
#     instances_tfidf = all_tfidf[-num_sms:]
#     adj_mat = cosine_similarity(all_tfidf)
#     print('adj mat shape:{}'.format(adj_mat.shape))
#     if thresh:
#         print('thresh:{}'.format(thresh))
#         adj_mat-=np.identity(len(adj_mat))
#         G2 = nx.from_numpy_matrix((adj_mat > thresh)*1)
#     else:
#         G2 = nx.from_numpy_matrix(adj_mat)
#     print(G2.number_of_edges())
#     for i in range(len(G2.nodes())):
#         G2.node[i]['vec'] =  all_tfidf[i]
#         G2.node[i]['text'] = corpus[i]
#         temp_label = labels[i] if i < len(labels) else 0
#         G2.node[i]['label'] = temp_label
#     return G2

def GraphBuild(template_df, new_sms, K=10, thresh=None):
    '''
    Build a graph with one node per sentence and its K nearest neighbours
    stored as a node attribute.

    Earlier versions overlooked that neighbour sampling needed a traversal of
    the whole graph, so the K nearest neighbours are now attached to every
    node while the graph is built.

    template_df: DataFrame with columns `sms` and `cls` (training sentences).
    new_sms: test samples, DataFrame with columns `sms` and `cls`; they are
             lower-cased and become the nodes after the training sentences.
    K: number of nearest neighbours (by TF-IDF cosine similarity) per node.
    thresh: unused; kept so existing calls keep working.

    Node attributes: 'vec' (TF-IDF row), 'text', 'label', 'neighs_k' (set of
    node ids). No edges are added.
    '''
    def findNeighsK(arr, K):
        # A node is normally its own best match and is removed afterwards, so
        # ask for one extra; never ask for more entries than exist.
        K = min(K + 1, len(arr))
        return np.argpartition(arr, -K)[-K:]

    corpus, labels = template_df.sms.tolist(), template_df.cls.tolist()
    corpus.extend(single.lower() for single in new_sms.sms.tolist())
    labels.extend(new_sms.cls.tolist())

    all_tfidf = tfIdfVector(corpus)
    adj_mat = cosine_similarity(all_tfidf)
    print('adj mat shape:{}'.format(adj_mat.shape))

    G = nx.Graph()
    for i in range(len(adj_mat)):
        neighs = set(int(j) for j in findNeighsK(adj_mat[i], K)) - {i}
        # add_node with keyword attributes works across networkx versions,
        # unlike the `G.node[...]` mapping that was removed in networkx 2.4.
        G.add_node(i, vec=all_tfidf[i], text=corpus[i], label=labels[i], neighs_k=neighs)
    return G

In [10]:
# df2 = DF

In [11]:
# Collapse the augmented tokens to one row per sentence: lower-cased text plus
# the class of the sentence's first token.
train_sentences = df2.groupby('sent_id')
sms = train_sentences.apply(lambda x: ' '.join(x.token.tolist()).lower())
# Take the first row by position (as the test-set cell does) rather than by
# index label 0, which only works while every group holds label 0 exactly once.
cls = train_sentences.apply(lambda x: x.cls.tolist()[0])
df3 = pd.DataFrame(columns=['sms', 'cls'])
df3['sms'] = sms
df3['cls'] = cls

In [12]:
# Build the sentence graph: training sentences (df3) become the first nodes,
# followed by the test sentences (df3_te); K keeps its default of 10.
g4 = GraphBuild(df3, df3_te)

adj mat shape:(11050, 11050)


In [2]:
"""
Set of modules for aggregating embeddings of neighbors.
"""
class MeanAggregator_QBv2(nn.Module):
    """
    Aggregates a node's neighbours by averaging their feature vectors.

    adjMat@embMat: [bs, uniqueNode]@[uniqueNode, F]=[bs, F]

    TODO: rework - earlier versions traversed the whole graph on every
    sampling step. Unless really necessary, do not let an instance hold on to
    the graph G itself (about 20 GB of memory).

    NOTE: the `cuda` attribute stores a bool and therefore hides
    nn.Module.cuda() on instances; it is kept because Encoder_QBv2 assigns it
    from outside.
    """
    def __init__(self, features, cuda=False, gcn=False):
        """
        Initializes the aggregator for a specific graph.
        features -- function mapping LongTensor of node ids to FloatTensor of feature values.
        cuda -- whether to use GPU;
        gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style
        """
        super(MeanAggregator_QBv2, self).__init__()
        self.features = features
        self.cuda = cuda
        self.gcn = gcn

    def forward(self, nodes, to_neighs, num_sample=10):
        """
        nodes --- list of nodes in a batch, [bs]
        to_neighs --- list of sets,
                      each set is the set of neighbors for node in batch
        num_sample --- number of neighbors to sample; None keeps every neighbor.

        Returns a [bs, F] tensor with the mean feature vector of each node's
        (sampled) neighbors; a node without neighbors gets an all-zero row.
        """
        if num_sample is not None:
            # random.sample rejects sets on Python 3.11+; older versions
            # converted the set to a tuple internally, so doing it here keeps
            # the same draws while working everywhere.
            samp_neighs = [set(random.sample(tuple(to_neigh), num_sample))
                           if len(to_neigh) >= num_sample else to_neigh
                           for to_neigh in to_neighs]
        else:
            samp_neighs = to_neighs

        # GCN-style: include the target node itself among its neighbors.
        if self.gcn:
            samp_neighs = [set.union(samp_neigh, set([nodes[i]])) for i, samp_neigh in enumerate(samp_neighs)]

        # Row i of `mask` marks which of the unique neighbor nodes belong to
        # batch node i.
        unique_nodes_list = list(set.union(*samp_neighs))
        unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}
        mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes)))
        column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
        row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))]
        mask[row_indices, column_indices] = 1
        if self.cuda:
            mask = mask.cuda()

        # Row-normalise to turn the sum into a mean. Clamping the count to at
        # least 1 keeps an empty neighbor set from producing 0/0 = NaN.
        num_neigh = mask.sum(1, keepdim=True).clamp(min=1)
        mask = mask.div(num_neigh)

        if self.cuda:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
        else:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
        to_feats = mask.mm(embed_matrix)
        return to_feats

    
class Encoder_QBv2(nn.Module):
    """
    Encodes a batch of nodes with the 'convolutional' GraphSage approach.

    forward() returns relu(W @ combined.T) with shape [embed_dim, bs], so a
    classifier on top computes ([cls, embed_dim] @ [embed_dim, bs]).T = [bs, cls].
    """
    def __init__(self, features, feature_dim,
            embed_dim, adj_lists, aggregator,
            num_sample=10,
            base_model=None, gcn=False, cuda=False,
            feature_transform=False):
        """
        features          -- callable mapping a LongTensor of node ids to their feature rows
        feature_dim       -- width of one feature row
        embed_dim         -- width of the embedding produced by this layer
        adj_lists         -- neighbour collections, indexed by int(node id)
        aggregator        -- object whose forward(nodes, to_neighs, num_sample)
                             returns the aggregated neighbour features
        num_sample        -- number of neighbours passed to the aggregator for sampling
        base_model        -- optional model kept as self.base_model (only set when given)
        gcn               -- True: use the aggregator output alone;
                             False: concatenate the nodes' own features with it
        cuda              -- whether node-id tensors are moved to the GPU
        feature_transform -- accepted for interface compatibility; not used here
        """
        super(Encoder_QBv2, self).__init__()

        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        if base_model is not None:
            self.base_model = base_model

        self.gcn = gcn
        self.embed_dim = embed_dim
        # NOTE: this boolean attribute shadows nn.Module.cuda() on the instance;
        # the name is kept because the aggregator flag below uses the same one.
        self.cuda = cuda
        self.aggregator.cuda = cuda
        # GCN mode sees only the aggregated features; otherwise the input is
        # the concatenation of self and neighbour features (twice as wide).
        in_dim = self.feat_dim if self.gcn else 2 * self.feat_dim
        self.weight = nn.Parameter(torch.FloatTensor(embed_dim, in_dim))
        # In-place initialiser; the un-suffixed xavier_uniform is deprecated.
        nn.init.xavier_uniform_(self.weight)


    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.
        nodes     -- list of nodes; [bs]
        operations: self_feats: [bs, F]
                    neigh_feats (aggregated): [bs, F]
                    concat([self_feats, neigh_feats]): [bs, 2F]  (skipped when gcn=True)
                    W: [F2, 2F]
                    relu(W @ [bs, 2F].T): [F2, bs]
        """
        neigh_feats = self.aggregator.forward(
            nodes, [self.adj_lists[int(node)] for node in nodes], self.num_sample)
        if self.gcn:
            combined = neigh_feats
        else:
            node_ids = torch.LongTensor(nodes)
            if self.cuda:
                node_ids = node_ids.cuda()
            self_feats = self.features(node_ids)
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        combined = F.relu(self.weight.mm(combined.t()))
        return combined



class SupervisedGraphSage_QB(nn.Module):
    """Linear classifier on top of a GraphSage encoder, with a maskable NLL loss."""

    def __init__(self, num_classes, enc):
        """
        num_classes -- number of output classes
        enc         -- encoder; must expose `embed_dim` and, when called with a
                       batch of nodes, return embeddings of shape [embed_dim, bs]
        """
        super(SupervisedGraphSage_QB, self).__init__()
        self.enc = enc
        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        # In-place initialiser; the un-suffixed xavier_uniform is deprecated.
        nn.init.xavier_uniform_(self.weight)
        # Per-sample losses are needed so that a mask can be applied afterwards.
        self.loss = nn.NLLLoss(reduce=False)
        self.logsoftmax = nn.LogSoftmax(dim=-1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, nodes, softmax=False):
        """
        Class scores for a batch of nodes, shape [bs, num_classes].
        With softmax=True the scores are normalised to probabilities per row.
        """
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds).t()
        if softmax:
            return self.softmax(scores)
        return scores

    def loss_softmax_mask(self, nodes, labels, mask=None):
        '''
        Masked mean of the per-node negative log-likelihood.

        nodes shape: [N];
        labels shape: [N] (extra singleton dimensions are flattened);
        mask shape: [N]; weights per node, None means every node counts equally.
        Returns a 0-dim tensor: sum(loss * mask) / sum(mask).
        '''
        logits = self.forward(nodes)
        log_probs = self.logsoftmax(logits)
        # reshape(-1) rather than squeeze(): squeeze() would turn the labels of
        # a one-node batch into a 0-dim tensor, which no longer matches the
        # [1, num_classes] input of the loss.
        per_node = self.loss(log_probs, labels.reshape(-1))
        if mask is None:
            # Created from per_node so it lives on the same device.
            mask = torch.ones_like(per_node)
        else:
            mask = mask.type(torch.float).to(per_node.device)
        # Tensor reductions instead of Python's builtin sum(), which iterates
        # over the elements one by one.
        return (per_node * mask).sum() / mask.sum()
    
    
def result(G, output, label_idx, idx2label):
    """
    Tabulate predictions next to the true labels.

    G         -- graph whose node attributes hold 'text' and 'label'
    output    -- score tensor with one row per entry of label_idx
    label_idx -- node ids, in the same order as the rows of output
    idx2label -- mapping from class index to label

    Returns a DataFrame with columns sms, score (highest score of the row),
    pred (label of the arg-max class) and label (true label from G).
    """
    score, pred = output.max(dim=-1)
    df = pd.DataFrame()
    # G.nodes[...] replaces G.node[...], which NetworkX removed in 2.4.
    df['sms'] = [G.nodes[i]['text'] for i in label_idx]
    df['score'] = score.detach().cpu().numpy()
    df['pred'] = [idx2label[i] for i in pred.tolist()]
    # Read the labels from the graph that was passed in, not a notebook global.
    df['label'] = [G.nodes[i]['label'] for i in label_idx]
    return df

NameError: name 'nn' is not defined

In [42]:
def load_india_sms(G):
    """
    Pull the per-node data out of graph G in node-iteration order.

    Returns three parallel lists: the 'vec' feature vectors, the 'label'
    values and the 'neighs_k' neighbour collections of every node.
    NOTE(review): the lists are positional, so code that indexes them by node
    id presumably relies on the ids being 0..n-1 in iteration order - confirm.
    """
    feat_data, labels, adj_lists = [], [], []
    for node_id in G.nodes:
        # G.nodes[...] replaces G.node[...], which NetworkX removed in 2.4.
        attrs = G.nodes[node_id]
        feat_data.append(attrs['vec'])
        labels.append(attrs['label'])
        adj_lists.append(attrs['neighs_k'])
    return feat_data, labels, adj_lists

In [43]:
# Number of per-node feature vectors.
# NOTE(review): feat_data is only assigned in the next cell, so this cell
# relies on that cell having been run first (out-of-order execution).
len(feat_data)

11050

In [44]:
# Seed every RNG involved: numpy, Python's random (neighbour sampling) and
# torch (weight initialisation), so that a re-run reproduces the same model.
np.random.seed(1)
random.seed(1)
torch.manual_seed(1)

feat_data, labels, adj_lists = load_india_sms(g4)

# Map the raw label values to contiguous class indices (and back).
idx2label = dict(enumerate(np.unique(labels)))
label2idx = {label: idx for idx, label in idx2label.items()}
labels = np.array([label2idx[label] for label in labels])
num_cls = len(idx2label)

# Frozen embedding table holding the precomputed node feature vectors.
features = nn.Embedding(len(feat_data), len(feat_data[0]))
features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)


def enc1_features(nodes):
    """First-layer embeddings as rows ([bs, embed_dim]); the input features of layer 2."""
    return enc1(nodes).t()


# Two stacked GraphSage layers: enc2 aggregates over the outputs of enc1.
agg1 = MeanAggregator_QBv2(features, cuda=False, gcn=True)
enc1 = Encoder_QBv2(features, node_F, 128, adj_lists, agg1, num_sample=5, gcn=True, cuda=False)
agg2 = MeanAggregator_QBv2(enc1_features, cuda=False, gcn=True)
enc2 = Encoder_QBv2(enc1_features,
               enc1.embed_dim, node_F, adj_lists, agg2, num_sample=5, gcn=True, cuda=False)

In [63]:
graphsage = SupervisedGraphSage_QB(num_cls, enc2)

# Split by position: the first 50 nodes validate, the last df3_te.shape[0]
# nodes test, everything in between trains.
nodes = list(g4.nodes)
n_test = df3_te.shape[0]
# Explicit split index: a `[-n_test:]` slice would select *every* node when
# n_test is 0.
split_at = max(len(nodes) - n_test, 0)
val = nodes[:50]
test_nodes = nodes[split_at:]
train = nodes[50:split_at]

optimizer = torch.optim.SGD(filter(lambda p : p.requires_grad, graphsage.parameters()), lr = 0.05)
times = []
Loss = []
print('GO')
for batch in range(4000):
    # Slicing already copies, so the batch is unaffected by the shuffle below,
    # which prepares the order for the next iteration.
    batch_nodes = train[:100]
    random.shuffle(train)
    start_time = time.time()
    optimizer.zero_grad()
    loss = graphsage.loss_softmax_mask(batch_nodes,
            torch.tensor(labels[np.array(batch_nodes)]))
    loss.backward()
    optimizer.step()
    end_time = time.time()
    times.append(end_time - start_time)
    if batch%100 == 0:
        # Monitoring only: skip building autograd graphs for these passes.
        with torch.no_grad():
            train_loss = graphsage.loss_softmax_mask(train,
                torch.tensor(labels[np.array(train)]))
            val_loss = graphsage.loss_softmax_mask(val,
                torch.tensor(labels[np.array(val)]))
        print('batch:{}; train_loss:{}; val_loss:{}'.format(batch, train_loss.data, val_loss.data))
    Loss.append(loss.data)

GO
batch:0; train_loss:2.8286921977996826; val_loss:2.8306429386138916
batch:100; train_loss:2.155996561050415; val_loss:2.4689207077026367
batch:200; train_loss:1.787720799446106; val_loss:2.303647518157959
batch:300; train_loss:1.6322472095489502; val_loss:2.2604198455810547
batch:400; train_loss:1.5698705911636353; val_loss:2.2569632530212402
batch:500; train_loss:1.537320852279663; val_loss:2.2695255279541016
batch:600; train_loss:1.5167006254196167; val_loss:2.2765228748321533
batch:700; train_loss:1.5000903606414795; val_loss:2.258310556411743
batch:800; train_loss:1.4885119199752808; val_loss:2.243403434753418
batch:900; train_loss:1.4737707376480103; val_loss:2.2367095947265625
batch:1000; train_loss:1.4626046419143677; val_loss:2.2238497734069824
batch:1100; train_loss:1.4515964984893799; val_loss:2.19577693939209
batch:1200; train_loss:1.440972924232483; val_loss:2.2087111473083496
batch:1300; train_loss:1.4311469793319702; val_loss:2.183877944946289
batch:1400; train_loss:1.

In [64]:
# Score the validation nodes; inference needs no autograd graph.
with torch.no_grad():
    val_output = graphsage.forward(val)

result(g4, val_output, val, idx2label)

Unnamed: 0,sms,score,pred,label
0,"dear customer,in order to serve you better, ax...",3.77726,交易流水＿转账,交易流水＿余额
1,you have done a transaction through iob intern...,3.523083,交易流水＿转账,交易流水＿转账
2,"due, rs.0643 towards your loan a/c x..x6323 in...",2.223659,交易流水＿转账,贷后提醒＿到期提醒
3,a charge of rs. 340.22 has been initiated on y...,4.740955,交易流水＿转账,交易流水＿转账
4,"dear customer, you have made a payment of rs. ...",4.337186,交易流水＿转账,交易流水＿转账
5,"rs. 183.02 spent on card xx2186 on 00-40-9641,...",4.178057,交易流水＿转账,交易流水＿转账
6,namaskar! your transaction has been declined d...,3.257231,交易流水＿转账,账号异常＿余额不足
7,rs. 533.69 was spent on your debit card 7073xx...,3.464867,交易流水＿转账,交易流水＿转账
8,your a/c no. xxxxxxxxxx7910 is debited for rs....,4.715461,交易流水＿转账,交易流水＿转账
9,"rs. 367.30 spent on card xx0956 on 34-89-9890,...",3.652866,交易流水＿转账,交易流水＿转账


In [65]:
# Score the held-out test nodes; inference needs no autograd graph.
with torch.no_grad():
    te_output = graphsage.forward(test_nodes)
result(g4, te_output, test_nodes, idx2label)

Unnamed: 0,sms,score,pred,label
0,thank you for payment of inr.6000.00 on your c...,4.708610,交易流水＿转账,交易流水＿转账
1,"dear customer,your transaction at atm 1cn04401...",3.009799,交易流水＿转账,账号异常＿余额不足
2,"your a/c no. xxxxxxx7494 is debited for inr 2,...",3.527857,交易流水＿转账,交易流水＿转账
3,your a/c xxxxxxx5273 has been debited with inr...,3.611550,交易流水＿转账,交易流水＿转账
4,your a/c no. ***********5885 is credited for r...,4.313889,交易流水＿转账,交易流水＿转账
5,"inr 9,400.00 credited to your a/c no xxxxxxx49...",3.559350,交易流水＿转账,交易流水＿转账
6,"dear boi customer, a/c xxxxxxxxxxx6116 charged...",3.883627,交易流水＿转账,交易流水＿转账
7,rs.30 has been debited from your kotak account...,4.075966,交易流水＿转账,交易流水＿转账
8,a/c nn4214 debited for inr 24; atm wdl. a/c ba...,4.320962,交易流水＿转账,交易流水＿转账
9,"dear customer, refund of rs .82 from hpcl 0.75...",2.716831,交易流水＿转账,交易流水＿转账


In [66]:
# Score the training nodes (in the order left by the last shuffle of the
# training loop); inference needs no autograd graph.
with torch.no_grad():
    tra_output = graphsage.forward(train)
result(g4, tra_output, train, idx2label)

Unnamed: 0,sms,score,pred,label
0,it's done! we've transfered rs.2 from your acc...,2.704044,交易流水＿转账,交易流水＿转账
1,payment overdue on your credit card xx4129 is ...,1.810211,信用卡＿还款提醒,信用卡＿还款提醒
2,txn of rs.966.19 made on sbi card xx5453 at am...,1.876180,交易流水＿转账,账号异常＿扣款失败
3,"dear customer, your transaction decline for de...",2.855015,交易流水＿转账,账号异常＿扣款失败
4,your a/c no. xxxx8393 is debited for rs.1.83 o...,4.074343,交易流水＿转账,交易流水＿转账
5,"dear hdfc bank card member, rs 6.39 has been c...",3.999855,交易流水＿转账,交易流水＿转账
6,"dear customer, total amount of inr 18120.80 an...",1.551628,信用卡＿还款提醒,信用卡＿还款提醒
7,"top up of rs.18,503.83 is successfully receive...",5.671163,交易流水＿转账,交易流水＿转账
8,"dear customer, transaction of rs.282.2 from an...",5.043914,交易流水＿转账,交易流水＿转账
9,your a/c xxxx8264 has been credited by rs. 9.5...,4.644256,交易流水＿转账,交易流水＿转账


In [None]:
# Validation set: one row of precision / recall / F1 / support per true class.
true, pred = get_clf_prediction(model, X_word_val, X_char_val, y_val)
Res = pd.DataFrame({'true': true, 'pred': pred})

entitys = Res.true.unique()
records = [evaluate(Res, cls_name) for cls_name in entitys]
record = (
    pd.DataFrame.from_records(records, columns=['cls','精确率','召回率', 'F1', 'support'])
    .set_index('cls')
    .sort_index()
)
record

In [None]:
# Rebuild the test sentences as lists of (lower-cased token, tag) pairs,
# one list per sent_id.
grouped_te = Test_DF.groupby('sent_id').apply(
    lambda sent: list(zip((tok.lower() for tok in sent.token.tolist()),
                          sent.lable.tolist())))
sentences_te = list(grouped_te)

# Word ids (unknown words map to 'UNK'), padded / truncated at the end to max_len.
X_word_te = pad_sequences(
    sequences=[[word2idx.get(token, word2idx['UNK']) for token, _tag in sentence]
               for sentence in sentences_te],
    maxlen=max_len, value=word2idx['PAD'], padding='post', truncating='post')

X_char_te = get_X_char(sentences_te)

# Tag ids, padded with the 'other' tag in the same way.
y_te = pad_sequences(
    sequences=[[tag2idx[tag] for _token, tag in sentence] for sentence in sentences_te],
    maxlen=max_len, value=tag2idx['other'], padding='post', truncating='post')


In [None]:
# Test set: one row of precision / recall / F1 / support per true class.
true, pred = get_prediction(model, X_word_te, X_char_te, y_te)
Res = pd.DataFrame({'true': true, 'pred': pred})

entitys = Res.true.unique()
records = [evaluate(Res, cls_name) for cls_name in entitys]
record = (
    pd.DataFrame.from_records(records, columns=['cls','精确率','召回率', 'F1','support'])
    .set_index('cls')
    .sort_index()
)
record

In [None]:
# Rebuild the test sentences as lists of (lower-cased token, tag) pairs,
# one list per sent_id.
grouped_te = Test_DF.groupby('sent_id').apply(
    lambda sent: list(zip((tok.lower() for tok in sent.token.tolist()),
                          sent.lable.tolist())))
sentences_te = list(grouped_te)

# Word ids (unknown words map to 'UNK'), padded / truncated at the end to max_len.
X_word_te = pad_sequences(
    sequences=[[word2idx.get(token, word2idx['UNK']) for token, _tag in sentence]
               for sentence in sentences_te],
    maxlen=max_len, value=word2idx['PAD'], padding='post', truncating='post')

X_char_te = get_X_char(sentences_te)

# Tag ids, padded with the 'other' tag in the same way.
y_te = pad_sequences(
    sequences=[[tag2idx[tag] for _token, tag in sentence] for sentence in sentences_te],
    maxlen=max_len, value=tag2idx['other'], padding='post', truncating='post')


In [None]:
# Load the GSM template table and keep only the rows that carry a label.
gsm_path = 'gsm_templates_df.csv'
gsm_templates_df = pd.read_csv(gsm_path).loc[lambda frame: frame.label.notnull()]
print(gsm_templates_df.shape)

In [None]:
# Hold out 100 random templates and remove them from the working table.
# The explicit random_state makes the held-out set independent of whatever
# state the global numpy RNG happens to be in when this cell runs.
# NOTE: the cell is not idempotent - every re-run removes another 100 rows.
test_100example = gsm_templates_df.sample(n=100, random_state=1)
gsm_templates_df = gsm_templates_df[~gsm_templates_df.index.isin(test_100example.index)]
print(gsm_templates_df.shape)

In [24]:
def result(G, output, label_idx, idx2label):
    """
    Tabulate predictions next to the true labels.

    G         -- graph whose node attributes hold 'text' and 'label'
    output    -- score tensor with one row per entry of label_idx
    label_idx -- node ids, in the same order as the rows of output
    idx2label -- mapping from class index to label

    Returns a DataFrame with columns sms, score (highest score of the row),
    pred (label of the arg-max class) and label (true label from G).
    """
    score, pred = output.max(dim=-1)
    df = pd.DataFrame()
    # G.nodes[...] replaces G.node[...], which NetworkX removed in 2.4.
    df['sms'] = [G.nodes[i]['text'] for i in label_idx]
    df['score'] = score.detach().cpu().numpy()
    df['pred'] = [idx2label[i] for i in pred.tolist()]
    # Read the labels from the graph that was passed in, not a notebook global.
    df['label'] = [G.nodes[i]['label'] for i in label_idx]
    return df

In [None]:
# Tabulate the validation predictions next to the true labels.
result(g4, val_output, val, idx2label)

In [None]:
# Predicted label for the single node with id 3: arg-max over its class
# scores, mapped back to a label through idx2label.
idx2label[np.argmax(graphsage.forward([3]).squeeze(0).data.numpy())]

In [None]:
def get_clf_prediction(model, X_word, X_char, y):
    """
    Run `model` on the word/char inputs and decode both the predictions and
    the ground truth `y` into labels.

    Relies on the notebook globals max_len, max_len_char and idx2label.
    Returns (true, pred), two lists of labels.
    """
    print('X shape:{}, y_shape:{}'.format(len(X_word), len(y)))
    n_samples = len(X_char)
    char_input = np.array(X_char).reshape((n_samples, max_len, max_len_char))
    class_scores = model.predict([X_word, char_input])
    predicted_ids = np.argmax(class_scores, axis=-1)
    pred = [idx2label[class_id] for class_id in predicted_ids]
    true = [idx2label[class_id] for class_id in y]
    return true, pred

def get_pr(Res, cls):
    """
    Precision of class `cls`: rows correctly predicted as `cls` divided by
    all rows predicted as `cls`.

    Res -- DataFrame with the columns 'true' and 'pred'.
    Returns 0.0 when `cls` is never predicted (precision undefined) instead
    of NaN or a ZeroDivisionError.
    """
    predicted_as_cls = Res[Res.pred == cls]
    n_predicted = predicted_as_cls.shape[0]
    if n_predicted == 0:
        return 0.0
    n_correct = (predicted_as_cls.true == cls).sum()
    return n_correct / n_predicted

def get_rc(Res, cls):
    """
    Recall of class `cls`: rows correctly predicted as `cls` divided by all
    rows whose true class is `cls`.

    Res -- DataFrame with the columns 'true' and 'pred'.
    Returns (recall, support), where support is the number of rows whose true
    class is `cls`; (0.0, 0) when the class does not occur at all.
    """
    actual_cls = Res[Res.true == cls]
    support = actual_cls.shape[0]
    if support == 0:
        return 0.0, 0
    n_correct = (actual_cls.pred == cls).sum()
    return n_correct / support, support

def get_f1(pr, rc):
    """
    F1 score: the harmonic mean of precision and recall,
    f1 = (2*pr*rc)/(pr+rc); 0.0 when both are zero.
    """
    denominator = pr + rc
    if denominator == 0:
        return 0.0
    return (2*pr*rc)/denominator

def evaluate(Res, cls):
    """
    Collect the metrics of one class as a record:
    [cls, precision, recall, f1, support].
    """
    precision = get_pr(Res, cls)
    recall, n_support = get_rc(Res, cls)
    return [cls, precision, recall, get_f1(precision, recall), n_support]
