In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import copy
# from graphviz import Digraph
from torch.nn import init
from torch.autograd import Variable
import torch.nn.functional as F

import time
import random
from sklearn.metrics import f1_score
from collections import defaultdict
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### 1. sms 文件

In [3]:
gsm_path = 'gsm_templates_df.csv'
gsm_templates_df = pd.read_csv(gsm_path)
gsm_templates_df = gsm_templates_df[~gsm_templates_df.label.isnull()]
print(gsm_templates_df.shape)

(1195, 2)


In [4]:
test_100example = gsm_templates_df.sample(n=100)
gsm_templates_df = gsm_templates_df[~gsm_templates_df.index.isin(test_100example.index)]
print(gsm_templates_df.shape)

(1095, 2)


In [5]:
def get_cos_similarity(sms, templates):
    '''计算一条新sms 与 每个template 相似度;'''
    def cos_sim(a, b):
        return dot(a, b) / (norm(a) * norm(b))
    return [cos_sim(i, sms) for i in templates]


def tfIdfVector(corpus):
    '''
    corpus is a list of sentences:
    ['This is an example', 'hello world', ...]
    '''
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    x = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(x)
    return tfidf.toarray()

def newAndTemplate(template_df, new_sms, thresh=None):
    '''
    new_sms: test samples
    '''
    corpus, labels = template_df.sms.tolist(), template_df.label.tolist()
    num_sms = len(new_sms)
    for single in new_sms:
        corpus.append(single.lower())
    all_tfidf = tfIdfVector(corpus)
    template_tfidf = all_tfidf[:-num_sms]
    instances_tfidf = all_tfidf[-num_sms:]
    adj_mat = cosine_similarity(all_tfidf)
    print('adj mat shape:{}'.format(adj_mat.shape))
    if thresh:
        print('thresh:{}'.format(thresh))
        adj_mat-=np.identity(len(adj_mat))
        G2 = nx.from_numpy_matrix((adj_mat > thresh)*1)
    else:
        G2 = nx.from_numpy_matrix(adj_mat)
    print(G2.number_of_edges())
    for i in range(len(G2.nodes())):
        G2.node[i]['vec'] =  all_tfidf[i]
        G2.node[i]['text'] = corpus[i]
#         temp_label = labels[i] if i < len(labels) else 0
        temp_label = labels[i]
        G2.node[i]['label'] = temp_label
    return G2


In [6]:
def sampleNeighs4Node_i(G, i, num):
    '''
    G: graph
    i: node_idx.
    '''
    z = [nbrdict for n, nbrdict in G.adjacency()]
    sorted_weight = sorted(z[i].items(), key=lambda x: x[1]['weight'],reverse=True)
    return sorted_weight[1:1+num]

def getBatchAdjMat1(G, batchNodeIdx,num):
    ''' 
    原文中为了保证同一个batch shape 一致性;
    同时为了 A@emb 同时纳入(结构特征 与 点特征);
    把列映射为batch level的unique nodes;
    这样让A得以成为adj 矩阵；
    我这里如果同样处理方式，则有可能将adj[i][i] 带进来；无法避免这种可能性；
    
    可以解决； mask 元素相乘；
    batchNodeIdx: each batch like:[1,2,3,4,5,...]
    G:graph
    '''
    sample = [i[0] for nodeid in batchNodeIdx
                for i in sampleNeighs4Node_i(G, nodeid, num)]
    cols = np.unique(sample)
    mask = np.zeros((len(batchNodeIdx), len(cols)))
    print('mask shape:{}'.format(mask.shape))
    for idx, i in enumerate(batchNodeIdx):
        for jdx, j in enumerate(cols):
            mask[idx][jdx] = G.adj[i][j]['weight']
    normalized = np.array(mask)/np.array(mask).sum(axis=-1)[:,np.newaxis]
    return mask, normalized

def getBatchAdjMat2(G, batchNodeIdx, num):
    ''' 
    可以保证我的batch shape 永远一致，我不受采样约束，
    因为是按weight rank 采样;
    batchNodeIdx: each batch like:[1,2,3,4,5,...]
    G: graph;
    '''
    sample = [[i[1]['weight'] for i in sampleNeighs4Node_i(G, nodeid, num)]
              for nodeid in batchNodeIdx]
    normalized = np.array(sample)/np.array(sample).sum(axis=-1)[:,np.newaxis]
    return np.array(sample), normalized


def mask_adj(G, batchNodeIdx, num):
    # IO: G, batchNodeIdx,num
    sample2 = [[i[0] for i in sampleNeighs4Node_i(G, nodeid, num)]
              for nodeid in batchNodeIdx]
    sample = [i[0] for nodeid in batchNodeIdx
                for i in sampleNeighs4Node_i(G, nodeid, num)]
    unique_nodes = np.unique(sample)
    node2idx ={i:j for j,i in enumerate(unique_nodes)}
    mask = np.zeros((len(batchNodeIdx), len(unique_nodes)))
    # IO: sample2, idx2node, mask
    rows_ = [ii for ii in range(len(sample2)) for j in range(len(sample2[ii]))]
    cols_ = [node2idx[j] for i in sample2 for j in i]
    mask[rows_,cols_] = 1
    return mask, unique_nodes


In [7]:
g4 = newAndTemplate(template_df=gsm_templates_df, new_sms=test_100example.sms.tolist())
test_nodes = [i for i in g4.nodes if g4.node[i]['label']==0]


adj mat shape:(1195, 1195)
701206


IndexError: list index out of range

In [8]:
mask_adj(g4, [1,2,3,4], 5)

(array([[0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
         1., 0., 0.],
        [1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0.,
         0., 1., 0.]]),
 array([   1,    3,    5,   11,   15,   16,   20,   33,   42,   45,  298,
         410,  598,  956,  974,  986, 1086, 1105, 1154]))

In [9]:
# 可以解决，用 mask_adj 套路  mask矩阵与该矩阵元素相乘；
getBatchAdjMat1(g4, [1,2,3], 5)

mask shape:(3, 14)


(array([[1.        , 0.30501109, 0.25382457, 0.11051124, 0.27134478,
         0.09259574, 0.29047607, 0.35535139, 0.05127082, 0.05315911,
         0.04570996, 0.04570996, 0.05392012, 0.17213111],
        [0.05389052, 0.1090468 , 0.05044519, 0.0563651 , 0.1193469 ,
         0.02486161, 0.047289  , 0.10650981, 0.4019629 , 0.3875225 ,
         0.39612425, 0.39612425, 0.3930702 , 0.15175857],
        [0.30501109, 1.        , 0.1093219 , 0.31804936, 0.2633035 ,
         0.27412254, 0.23409717, 0.29313681, 0.05566361, 0.14305689,
         0.05037265, 0.05037265, 0.0585399 , 0.53240805]]),
 array([[0.32247496, 0.09835844, 0.08185207, 0.03563711, 0.0875019 ,
         0.02985981, 0.09367126, 0.11459193, 0.01653355, 0.01714248,
         0.01474032, 0.01474032, 0.01738789, 0.05550797],
        [0.02000155, 0.04047288, 0.01872281, 0.02091999, 0.04429578,
         0.00922742, 0.01755138, 0.03953128, 0.14918913, 0.14382955,
         0.1470221 , 0.1470221 , 0.14588859, 0.05632542],
        [0.0827158

In [10]:
getBatchAdjMat2(g4, [1,2], 5)

(array([[0.35535139, 0.30501109, 0.29047607, 0.27134478, 0.25382457],
        [0.4019629 , 0.39612425, 0.39612425, 0.3930702 , 0.3875225 ]]),
 array([[0.24075169, 0.20664597, 0.19679845, 0.18383694, 0.17196695],
        [0.20354571, 0.20058914, 0.20058914, 0.19904263, 0.19623339]]))

In [63]:
"""
Set of modules for aggregating embeddings of neighbors.
"""
class MeanAggregator_QB(nn.Module):
    """
    # 要改， 采样的时候，按照 weight batch 采样 top_N； 
    # mask = [N, Unique_batch_nodes] entrys are weight again.
    # 然后归一化mask@emb_mat = 【N, F】
    # 然后【N,F】@[F,num_cls] = [N,num_cls]
    """
    def __init__(self, G, features, cuda=False, gcn=False): 
        """
        Initializes the aggregator for a specific graph.

        features -- function mapping LongTensor of node ids to FloatTensor of feature values.
        cuda -- whether to use GPU
        gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style
        """

        super(MeanAggregator_QB, self).__init__()
        self.features = features
        self.cuda = cuda
        self.gcn = gcn
        self.G = G
        
    def forward(self, nodes, num_sample=10):
        """
        nodes --- list of nodes in a batch, [bs]
        num_sample --- number of neighbors to sample. 
        """
        # 三种mask_adj;
        # 第一种是做unique col map, 会部分引入sim(i,i)=1, 
        # 没有关系原文也会隐式引入;
        ############################################
#         _, norm_adj = getBatchAdjMat2(self.G, nodes, num_sample)
#         mask = torch.from_numpy(norm_adj).type(torch.float)
        
        # 第二种是类比原文，adj@emb = [bs, F];
        ############################################
        mask, unique_nodes_list = mask_adj(self.G, nodes, num_sample)
        mask = torch.from_numpy(mask).type(torch.float)
        
        # 第三种是不做unique映射；能做 adj@emb 吗？
        # 不能，因为adj 的特定含义（点，点关系）无法表达。
        # unique 映射 是必须的。
        ##########################################
#         _, norm_adj = getBatchAdjMat1(self.G, nodes, num_sample)
#         mask = torch.from_numpy(norm_adj).type(torch.float)
        
        if self.cuda:
            mask = mask.cuda()
        if self.cuda:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
        else:
            embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
            
        to_feats = mask.mm(embed_matrix) #[bs, unique]@[unique, F]
        return to_feats # [bs, F]


In [64]:

class Encoder_QB(nn.Module):
    """
    Encodes a node's using 'convolutional' GraphSage approach
    """
    def __init__(self, features, feature_dim,
            embed_dim, aggregator, num_sample,
            gcn=False, cuda=False): 
        super(Encoder_QB, self).__init__()
        self.features = features
        self.aggregator = aggregator
        self.num_sample = num_sample
        self.feat_dim = feature_dim
        self.gcn = gcn
        self.embed_dim = embed_dim
        self.cuda = cuda
        self.aggregator.cuda = cuda
        self.weight = nn.Parameter(
                torch.FloatTensor(embed_dim, self.feat_dim if self.gcn else 2 * self.feat_dim))
        init.xavier_uniform(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.
        nodes     -- list of nodes; [bs]
        operations: 已知self_feat: [bs, F]
                    再agg_feat: [bs, F]
                    然后concat([agg_feat, self_feat]): [bs, 2F]
                    然后W:[F2, 2F] 
                    relu(W @[bs,2F].T) : [F2, bs] 
        """
        # [bs, F]
        neigh_feats = self.aggregator.forward(nodes, self.num_sample)
        if not self.gcn:
            if self.cuda:
                self_feats = self.features(torch.LongTensor(nodes).cuda())
            else:
                self_feats = self.features(torch.LongTensor(nodes))
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        else:
            combined = neigh_feats
        # [hid, bs] = relu([hid, 2f]@[2f, bs]) 
        combined = F.relu(self.weight.mm(combined.t()))
        return combined


In [102]:
class SupervisedGraphSage_QB(nn.Module):
    def __init__(self, num_classes, enc):
        super(SupervisedGraphSage_QB, self).__init__()
        self.enc = enc
        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        init.xavier_uniform(self.weight)
        self.loss = nn.NLLLoss(reduce=False)
        self.logsoftmax = nn.LogSoftmax()
        self.softmax = nn.Softmax()

    def forward(self, nodes, softmax=False):
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        if softmax:
            return self.softmax(scores)
        return scores.t()

    def loss_softmax_mask(self, nodes, labels, mask=None):
        '''
        nodes shape: [N];
        labels shape: [N];
        mask shape: [N];
        '''
        if mask is None:
            mask = torch.ones(len(labels))
        logits = self.forward(nodes)
        A_soft = self.logsoftmax(logits)
        output = self.loss(A_soft, labels.squeeze())
        loss = output*mask.type(torch.float)
        loss = sum(loss)/sum(mask)
        return loss


In [103]:
np.random.seed(1)
random.seed(1)
num_nodes = len(g4.nodes)
node_F = len(g4.node[0]['vec'])
feat_data = [g4.node[i]['vec'] for i in g4.nodes]
labels = np.array([g4.node[i]['label'] for i in g4.nodes])
idx2label = {i:j for i, j in enumerate(np.unique(labels))}
label2idx = {j:i for i,j in idx2label.items()}
labels = np.array([label2idx[i] for i in labels])
num_cls = len(np.unique(labels))
# feat_data, labels, adj_lists, _ = load_cora()
features = nn.Embedding(num_nodes, node_F)
features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)

agg1 = MeanAggregator_QB(g4, features, cuda=True)
enc1 = Encoder_QB(features, node_F, 128, agg1, num_sample=5, gcn=False, cuda=False)
agg2 = MeanAggregator_QB(g4, lambda nodes : enc1(nodes).t(), cuda=False)
enc2 = Encoder_QB(lambda nodes : enc1(nodes).t(), 
               enc1.embed_dim, node_F, agg2, num_sample=5, gcn=False, cuda=False)


In [104]:
graphsage = SupervisedGraphSage_QB(num_cls, enc2)
#graphsage.cuda()
#rand_indices = np.random.permutation(len(node_list))
nodes = list(g4.nodes)
test_nodes
val = nodes[:50]
train = nodes[50:-100]
optimizer = torch.optim.SGD(filter(lambda p : p.requires_grad, graphsage.parameters()), lr=0.1)
times = []
Loss = []

for batch in range(2):
    batch_nodes2 = train[:40]
    batch_idx = copy.copy(train[:40])
    random.shuffle(train)
    start_time = time.time()
    optimizer.zero_grad()
    loss = graphsage.loss_softmax_mask(batch_nodes2,
            Variable(torch.tensor(labels[np.array(batch_idx)])))
    loss.backward()
    optimizer.step()
    end_time = time.time()
    times.append(end_time - start_time)
    val_output = graphsage.forward(val, softmax=True)
#     print(batch, loss.data, val_output.data)
    Loss.append(loss.data)
    
val_output = graphsage.forward(val)

result(g4, val_output, val, idx2label)

Validation F1: 0.58
Average batch time: 0.32449638843536377


In [141]:
def result(G, output, label_idx, idx2label):
    score, pred = output.max(dim=-1)
    sms = [G.node[i]['text'] for i in label_idx]
    df = pd.DataFrame()
    df['sms']= sms
    df['score']=score.data
    df['pred']=[idx2label[i] for i in pred.numpy()]
    df['label']=[g4.node[i]['label'] for i in label_idx]
    return df

In [142]:
result(g4, val_output, val, idx2label)

Unnamed: 0,sms,score,pred,label
0,your indusind bank a/c no.100***942932 has bee...,0.323853,交易流水＿转账,交易流水＿转账
1,your a/c no. xxxxxxxxxx9359 is credited by rs....,0.542738,交易流水＿转账,交易流水＿转账
2,your indusind bank a/c no.159***479359 has bee...,0.333873,交易流水＿转账,交易流水＿转账
3,"dear customer, thank you for your internet pay...",0.341169,交易流水＿转账,交易流水＿转账
4,"dear customer, statement for your indusind ban...",0.370236,交易流水＿转账,信用卡＿还款提醒
5,your curr loan emi rs.2865 for ahb00485s due o...,0.338417,交易流水＿转账,贷后提醒＿逾期催收
6,your a/c no. xxxxxx2932 is credited for rs. 30...,0.419287,交易流水＿转账,交易流水＿转账
7,"dear customer, as per the last statement gener...",0.428305,交易流水＿转账,信用卡＿逾期警告
8,"dear customer, your account no 159***479359 ha...",0.303896,交易流水＿转账,交易流水＿转账
9,ur curr loan emi rs. 2789.00 for awp04070h due...,0.332015,交易流水＿转账,贷后提醒＿到期提醒


In [30]:
idx2label[np.argmax(graphsage.forward([3]).squeeze(0).data.numpy())]

'交易流水＿转账'

In [None]:
def make_dot(var, params=None):
    """ Produces Graphviz representation of PyTorch autograd graph
    Blue nodes are the Variables that require grad, orange are Tensors
    saved for backward in torch.autograd.Function
    Args:
        var: output Variable
        params: dict of (name, Variable) to add names to node that
            require grad (TODO: make optional)
    """
    if params is not None:
        assert isinstance(params.values()[0], Variable)
        param_map = {id(v): k for k, v in params.items()}
 
    node_attr = dict(style='filled',
                     shape='box',
                     align='left',
                     fontsize='12',
                     ranksep='0.1',
                     height='0.2')
    dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
    seen = set()
 
    def size_to_str(size):
        return '('+(', ').join(['%d' % v for v in size])+')'
 
    def add_nodes(var):
        if var not in seen:
            if torch.is_tensor(var):
                dot.node(str(id(var)), size_to_str(var.size()), fillcolor='orange')
            elif hasattr(var, 'variable'):
                u = var.variable
                name = param_map[id(u)] if params is not None else ''
                node_name = '%s\n %s' % (name, size_to_str(u.size()))
                dot.node(str(id(var)), node_name, fillcolor='lightblue')
            else:
                dot.node(str(id(var)), str(type(var).__name__))
            seen.add(var)
            if hasattr(var, 'next_functions'):
                for u in var.next_functions:
                    if u[0] is not None:
                        dot.edge(str(id(u[0])), str(id(var)))
                        add_nodes(u[0])
            if hasattr(var, 'saved_tensors'):
                for t in var.saved_tensors:
                    dot.edge(str(id(t)), str(id(var)))
                    add_nodes(t)
    add_nodes(var.grad_fn)
    return dot