In [1]:
# 基于baseline 实验，1300多个模版，无法利用倍化方法提升测试集准确率，(倍化只不过让过拟合来的更早一些) 得出必然是一个少样本半监督问题；
# 因此实验 GCN -- 它是一个半监督结构 同时 它可以利用邻居节点信息；

# bug结果： 构图相似度（对称归一拉普拉斯完全忽略邻居） nx.norm_laplacian 有问题；
# 修正： 自定义 norm laplacian fun
# 结果： 测试集 18分类 见下面结果图；

# insights： 得到的对称归一拉普拉斯矩阵 给出自身节点过量权重-暗示局部簇稀疏性； 
# 进一步猜想：
# 以上基于相似度的构图； 其相似度基于 sms 的tf-idf； 

'''
'dear customer txn on indusind bank credit card no xx5007 for 
inr 2741 on 26/08/18 18:20 at rajpath motors is approved.pls call 
    18602677777 for query.click on http://bit.ly/2b8nsnl to update 
        your aadhaar number, ignore if already done.',
           
'your a/c no. xxxxxxxxxx9359 is credited by rs.5,500.00 on 18/09/18 
by a/c linked to mobile 9xxxxxx9359 (imps ref no 826119422613).'
'''

# 这两个例子都是低tf-idf相似度，然而本应该高相似度；
# 而实际上应当基于核心词汇（credit, debited, transaction, balance .. ）
# 我可以通过textRank 构建这样的核心词汇表， 然后重新编码 sms 向量 BOW；
# no no no, 仍用tf-idf 构建文本编码，但是构图相似度用 textRank tf-idf构建；

# bug结果： normlized laplacian 有问题， normadj@X 输出nan ????

"\n'dear customer txn on indusind bank credit card no xx5007 for \ninr 2741 on 26/08/18 18:20 at rajpath motors is approved.pls call \n    18602677777 for query.click on http://bit.ly/2b8nsnl to update \n        your aadhaar number, ignore if already done.',\n           \n'your a/c no. xxxxxxxxxx9359 is credited by rs.5,500.00 on 18/09/18 \nby a/c linked to mobile 9xxxxxx9359 (imps ref no 826119422613).'\n"

In [9]:
import networkx as nx
import community # !pip install python-louvain
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from numpy import dot
from numpy.linalg import norm

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [10]:
def get_cos_similarity(sms, templates):
    '''计算一条新sms 与 每个template 相似度;'''
    def cos_sim(a, b):
        return dot(a, b) / (norm(a) * norm(b))
    return [cos_sim(i, sms) for i in templates]


def tfIdfVector(corpus):
    '''
    corpus is a list of sentences:
    ['This is an example', 'hello world', ...]
    '''
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    x = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(x)
    return tfidf.toarray()

def normalise_adj_matrix(A):
    '''
    normalized laplacian matrix;
    '''
    num_node = A.shape[0]
    A_hat = A + np.eye(num_node) 
    D_isqrtm = np.diag(np.power(A_hat.sum(axis = 1), -0.5))
    return D_isqrtm.dot(A_hat).dot(D_isqrtm)

def get_pr(Res, cls):
    #某类 正确识别数量/该类 总识别数量
    tmp  = Res[Res.true == cls]
    a = sum(tmp.true.values == tmp.pred.values)
    b = Res[Res.pred == cls].shape[0]
    pr = a/(b+1e-7)
    return pr

def get_rc(Res, cls):
    #某类 正确识别数量/该类 总数量
    tmp  = Res[Res.true == cls]
    support = tmp.shape[0]
    a = sum(tmp.true.values == tmp.pred.values)
    b = Res[Res.true == cls].shape[0]
    rc = a/(1e-7+b)
    return rc,support

def get_f1(pr, rc):
    # f1 = (2*pr*rc)/(pr+rc)
    f1 = (2*pr*rc)/(pr+rc+1e-7)
    return f1

def evaluate(Res, cls):
    pr = get_pr(Res, cls)
    rc, support = get_rc(Res, cls)
    f1 = get_f1(pr, rc)
    return [cls, pr, rc, f1, support]

def Final_evalu(z):
    Res = pd.DataFrame(columns=['true', 'pred'])
    true, pred = z.label.tolist(), z.pred.tolist()
    Res['true'] = true
    Res['pred'] = pred
    entitys = Res.true.unique()
    records = []
    for i in entitys:
        tmp = evaluate(Res, i)
        records.append(tmp)
    record = pd.DataFrame.from_records(records)
    record.columns =['cls','精确率','召回率', 'F1', 'support']
    record = record.set_index('cls')
    record = record.sort_index()   
    return record

def softmax(x):
    exp_x = np.exp(x)
    softmax_x = exp_x / np.sum(exp_x)
    return softmax_x 

def evalu(text, Y_hat, label, labels2idx):
    preds = Y_hat.eval().argmax(axis=-1)
    scores = np.array([softmax(i) for i in Y_hat.eval()])
    score = scores.max(axis=-1)
    idx2label = {i:j for j,i in labels2idx.items()}
    pred = [[idx2label[i] for i in preds]]
    df = pd.DataFrame()
    df['text'] = text
    df['pred'] = pred
    df['label'] = label
    df['score'] =score
    return df

def evalu2(text, Y_hat, label, labels2idx):
    scores = np.array([softmax(i) for i in Y_hat.eval()])
    sort_score = scores.argsort(axis=-1)
    idx2label = {i:j for j,i in labels2idx.items()}
    sort_labels = [[idx2label[j] for j in i][:3] for i in sort_score]
    sort_scores = [i[np.argsort(-i)][:3] for i in scores]
    preds = Y_hat.eval().argmax(axis=-1)
    idx2label = {i:j for j,i in labels2idx.items()}
    preds = [idx2label[i] for i in preds]
    df = pd.DataFrame()
    df['text'] = text
    df['pred2'] = sort_labels
    df['label'] = label
    df['score'] = sort_scores
    df['pred'] = preds
    return df


## v1: 纯GCN nx 的归一化拉普拉斯矩阵；


In [13]:
gsm_templates_df

Unnamed: 0,sms,label
0,dear customer txn on indusind bank credit card...,交易流水＿转账
1,your indusind bank a/c no.100***942932 has bee...,交易流水＿转账
2,your a/c no. xxxxxxxxxx9359 is credited by rs....,交易流水＿转账
3,your indusind bank a/c no.159***479359 has bee...,交易流水＿转账
4,your a/c no. xxxxxxxxxx2932 is debited for rs....,交易流水＿转账
5,"dear customer, your account no 100***942932 ha...",交易流水＿转账
6,"dear customer, thank you for your internet pay...",交易流水＿转账
7,"dear customer, statement for your indusind ban...",信用卡＿还款提醒
8,your curr loan emi rs.2865 for ahb00485s due o...,贷后提醒＿逾期催收
9,your a/c no. xxxxxx2932 is credited for rs. 30...,交易流水＿转账


In [12]:
gsm_templates_df = pd.read_csv('gsm_templates_df.csv')
corpus, labels = gsm_templates_df.sms.tolist(), gsm_templates_df.label.tolist()

all_tfidf = tfIdfVector(corpus)
num_sms = 100

from sklearn.metrics.pairwise import cosine_similarity
adj_mat = cosine_similarity(all_tfidf)
# adj_mat = np.where(adj_mat>0.5, 1, 0)
G2 = nx.from_numpy_matrix(adj_mat)
# norm_adj_mat = nx.normalized_laplacian_matrix(G2).toarray()
norm_adj  = normalise_adj_matrix(adj_mat)
for i in range(len(corpus)):
    G2.node[i]['vec'] =  all_tfidf[i]
    G2.node[i]['text'] = corpus[i]
    G2.node[i]['label'] = labels[i]

In [8]:
labels2idx = {j:i for i,j in enumerate(set(labels))}
labels_new = [labels2idx[i] for i in labels]

In [10]:
X = np.array([G2.node[i]['vec'] for i in list(G2.nodes)])
Y = np.array(labels_new)
num_class = len(set(Y))
num_new_sms = 100

sess = tf.InteractiveSession()
X = tf.constant(X)
A = tf.constant(norm_adj_mat, dtype='float64')
Y = tf.constant(Y)
masks = np.ones(len(G2.nodes))
masks[-num_new_sms:] = 0
masks = tf.constant(masks)

Z = tf.layers.dense(tf.matmul(A, X), units=2, use_bias=False, activation=tf.nn.tanh)
Y_hat = tf.layers.dense(tf.matmul(A, Z), units=num_class, use_bias=False, activation=None)
tmp_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=Y_hat, labels=Y)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.


In [11]:
mask_ids = range(len(labels_new))[-num_new_sms:]

loss = tf.reduce_mean(tmp_loss * masks)

opt = tf.train.AdamOptimizer(0.01)

train = opt.minimize(loss)

sess.run(tf.global_variables_initializer())

for I in range(10000):
    _, loss_val = sess.run([train, loss])
    if I % 100 == 0:
        Z_val = Z.eval()
        df  = evalu2(corpus, Y_hat, labels, labels2idx)
        res = Final_evalu(df.iloc[-100:])
        print('####### batch i:{} #######'.format(I))
        print('loss = %f' % loss_val)
        mean_tra_f1 = np.mean([i for i in Final_evalu(df.iloc[:-100]).F1.values if i != 0])
        print('mean f1 of train data:{}'.format(mean_tra_f1))
        
        mean_f1 = np.mean([i for i in Final_evalu(df.iloc[-100:]).F1.values if i != 0])
        print('mean f1 of test 100:{}'.format(mean_f1))

## v1: 纯GCN 结果

In [15]:
Final_evalu(df.iloc[-100:])

Unnamed: 0_level_0,精确率,召回率,F1,support
cls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sms_other,0.0,0.0,0.0,3
交易流水＿余额,0.0,0.0,0.0,3
交易流水＿转账,0.0,0.0,0.0,57
信用卡＿还款提醒,0.5,0.176471,0.26087,17
信用卡＿逾期警告,0.0,0.0,0.0,5
账号异常＿余额不足,0.0,0.0,0.0,1
账号异常＿卡号冻结,0.0,0.0,0.0,1
账号异常＿扣款失败,0.044444,0.5,0.081633,4
贷前申请＿审核拒绝,0.0,0.0,0.0,2
贷前申请＿申请交互,0.0,0.0,0.0,1


## v1: 纯GCN 结果 归一化拉普拉斯自己写

In [12]:
def normalise_adj_matrix(A):
    '''
    N = D^{-1/2} L D^{-1/2}
    '''
    num_node = A.shape[0]
#     A_hat = A + np.eye(num_node) 
    A_hat = A
    D_isqrtm = np.diag(np.power(A_hat.sum(axis=1), -0.5))
    print(D_isqrtm)
    return D_isqrtm.dot(A_hat).dot(D_isqrtm)

In [26]:
gsm_templates_df = pd.read_csv('gsm_templates_df.csv')
corpus, labels = gsm_templates_df.sms.tolist(), gsm_templates_df.label.tolist()
from sklearn.feature_extraction.text import TfidfVectorizer
all_tfidf = tfIdfVector(corpus)
num_sms = 100

from sklearn.metrics.pairwise import cosine_similarity
adj_mat = cosine_similarity(all_tfidf)
# adj_mat = np.where(adj_mat>0.5, 1, 0)
G2 = nx.from_numpy_matrix(adj_mat)
# norm_adj_mat = nx.normalized_laplacian_matrix(G2).toarray()
norm_adj  = normalise_adj_matrix(adj_mat)
for i in range(len(corpus)):
    G2.node[i]['vec'] =  all_tfidf[i]
    G2.node[i]['text'] = corpus[i]
    G2.node[i]['label'] = labels[i]

[[0.11104316 0.         0.         ... 0.         0.         0.        ]
 [0.         0.11254343 0.         ... 0.         0.         0.        ]
 [0.         0.         0.10451165 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.14580781 0.         0.        ]
 [0.         0.         0.         ... 0.         0.11851082 0.        ]
 [0.         0.         0.         ... 0.         0.         0.12281465]]


In [14]:
labels2idx = {j:i for i,j in enumerate(set(labels))}
labels_new = [labels2idx[i] for i in labels]

In [15]:
X = np.array([G2.node[i]['vec'] for i in list(G2.nodes)])
Y = np.array(labels_new)
num_class = len(set(Y))
num_new_sms = 100

sess = tf.InteractiveSession()
X = tf.constant(X)
A = tf.constant(norm_adj, dtype='float64')
Y = tf.constant(Y)
masks = np.ones(len(G2.nodes))
masks[-num_new_sms:] = 0
masks = tf.constant(masks)

Z = tf.layers.dense(tf.matmul(A, X), units=2, use_bias=False, activation=tf.nn.tanh)
Y_hat = tf.layers.dense(tf.matmul(A, Z), units=num_class, use_bias=False, activation=None)
tmp_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=Y_hat, labels=Y)

In [16]:
mask_ids = range(len(labels_new))[-num_new_sms:]

loss = tf.reduce_mean(tmp_loss * masks)

opt = tf.train.AdamOptimizer(0.01)

train = opt.minimize(loss)

sess.run(tf.global_variables_initializer())

for I in range(10000):
    _, loss_val = sess.run([train, loss])
    if I % 100 == 0:
        Z_val = Z.eval()
        df  = evalu2(corpus, Y_hat, labels, labels2idx)
        res = Final_evalu(df.iloc[-100:])
        print('####### batch i:{} #######'.format(I))
        print('loss = %f' % loss_val)
        mean_tra_f1 = np.mean([i for i in Final_evalu(df.iloc[:-100]).F1.values if i != 0])
        print('mean f1 of train data:{}'.format(mean_tra_f1))
        
        mean_f1 = np.mean([i for i in Final_evalu(df.iloc[-100:]).F1.values if i != 0])
        print('mean f1 of test 100:{}'.format(mean_f1))

####### batch i:0 #######
loss = 2.648993
mean f1 of train data:nan
mean f1 of test 100:nan
####### batch i:100 #######
loss = 1.423873
mean f1 of train data:0.7788300359244914
mean f1 of test 100:0.726114602507204


KeyboardInterrupt: 

In [11]:
Final_evalu(df.iloc[-100:])

Unnamed: 0_level_0,精确率,召回率,F1,support
cls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sms_other,0.0,0.0,0.0,3
交易流水＿余额,1.0,0.333333,0.5,3
交易流水＿转账,0.808824,0.964912,0.88,57
信用卡＿还款提醒,0.666667,0.588235,0.625,17
信用卡＿逾期警告,1.0,0.2,0.333333,5
账号异常＿余额不足,0.25,1.0,0.4,1
账号异常＿卡号冻结,0.0,0.0,0.0,1
账号异常＿扣款失败,0.0,0.0,0.0,4
贷前申请＿审核拒绝,0.0,0.0,0.0,2
贷前申请＿申请交互,0.25,1.0,0.4,1


## v2-textRank

In [29]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import networkx as nx
import matplotlib.pyplot as plt

def rollWindPairs(filtered_sents, windowSize=5):
    '''
    filtered_sents: [['token1','token2', ..], [. , . ,], ..]
    '''
    assert isinstance(filtered_sents[0], list)
    token_pairs = []
    for sent in filtered_sents:
        for i, word in enumerate(sent):
            for j in range(i + 1, i + window_size):
                if j >= len(sent):
                    break
                pair = (word, sent[j])
                if pair not in token_pairs:
                    token_pairs.append(pair)
    return token_pairs

def filter_sents(sents, candidate_pos, stop_words):
    '''
    sents:['sms1', 'sms2', ...]
    1. stopword filter ; 
    2. postag filter ; 
    return: [['token1','token2', ..], [. , . ,], ..]
    '''
    sents_new = []
    for i in sents:
        token_lis = i.split()
        token_lis = nltk.pos_tag(token_lis)
        sent = [i[0].lower() for i in token_lis 
                if (i[1] in candidate_pos) and (i[0] not in stop_words)]
        sents_new.append(sent)
    return sents_new


In [30]:
gsm_templates_df = pd.read_csv('gsm_templates_df.csv')
corpus, labels = gsm_templates_df.sms.tolist(), gsm_templates_df.label.tolist()
from sklearn.feature_extraction.text import TfidfVectorizer
all_tfidf = tfIdfVector(corpus)
labels2idx = {j:i for i,j in enumerate(set(labels))}
labels_new = [labels2idx[i] for i in labels]

In [31]:
# step 1; rankText IO:
window_size = 3
sents = gsm_templates_df.sms.tolist()
stop_words = set(stopwords.words('english'))
cand_pos = ['NN', 'VBN', 'VBD']
filtered_sents = filter_sents(sents, cand_pos, stop_words)
print('filter processed.')
pathList = rollWindPairs(filtered_sents, windowSize=4)

G = nx.Graph()
G.add_edges_from(pathList)
pr_value = nx.pagerank(G, alpha=1)
pr_impro_value = nx.pagerank(G, alpha=0.85)

# 老板词过滤文本
BossToken = list(sorted(pr_impro_value.items(), key=lambda x: x[1], reverse=True))[:150]
BossToken = [i[0] for i in BossToken]
gsm_new_template = []
for sms in gsm_templates_df.sms.tolist():
    new_sms = [] 
    for token in sms.split():
        if token in BossToken:
            new_sms.append(token)
    gsm_new_template.append(' '.join(new_sms))

gsm_templates_df['new_sms'] = gsm_new_template

filter processed.


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 过滤后的文本 重新向量化；
text = gsm_templates_df.new_sms.tolist()
vectorizer = TfidfVectorizer()
vec_text = vectorizer.fit_transform(text).toarray()
adj_mat = cosine_similarity(vec_text)
# adj_mat = np.where(adj_mat > 0.5, 1, 0)

def normalise_adj_matrix(A):
    '''
    N = D^{-1/2} L D^{-1/2}
    '''
    num_node = A.shape[0]
#     A_hat = A + np.eye(num_node) 
    A_hat = A
    D_isqrtm = np.diag(np.power(A_hat.sum(axis=1), -0.5))
    print(D_isqrtm)
    return D_isqrtm.dot(A_hat).dot(D_isqrtm)

norm_adj  = normalise_adj_matrix(adj_mat)

[[0.08240205 0.         0.         ... 0.         0.         0.        ]
 [0.         0.08445456 0.         ... 0.         0.         0.        ]
 [0.         0.         0.08508444 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.13062884 0.         0.        ]
 [0.         0.         0.         ... 0.         0.11096296 0.        ]
 [0.         0.         0.         ... 0.         0.         0.13562951]]


In [33]:
norm_adj@all_tfidf ###

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [20]:
X = np.array(all_tfidf)
Y = np.array(labels_new)
num_class = len(set(Y))
num_new_sms = 100

sess = tf.InteractiveSession()
X = tf.constant(X)
A = tf.constant(norm_adj, dtype='float64')
Y = tf.constant(Y)
masks = np.ones(len(all_tfidf))
masks[-num_new_sms:] = 0
masks = tf.constant(masks)

Z = tf.layers.dense(tf.matmul(A, X), units=2, use_bias=False, activation=tf.nn.tanh)
Y_hat = tf.layers.dense(tf.matmul(A, Z), units=num_class, use_bias=False, activation=None)
tmp_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=Y_hat, labels=Y)

In [21]:
mask_ids = range(len(labels_new))[-num_new_sms:]

loss = tf.reduce_mean(tmp_loss * masks)

opt = tf.train.AdamOptimizer(0.01)

train = opt.minimize(loss)

sess.run(tf.global_variables_initializer())

for I in range(10000):
    _, loss_val = sess.run([train, loss])
    if I % 100 == 0:
        Z_val = Z.eval()
        df  = evalu2(corpus, Y_hat, labels, labels2idx)
        res = Final_evalu(df.iloc[-100:])
        print('####### batch i:{} #######'.format(I))
        print('loss = %f' % loss_val)
        mean_tra_f1 = np.mean([i for i in Final_evalu(df.iloc[:-100]).F1.values if i != 0])
        print('mean f1 of train data:{}'.format(mean_tra_f1))
        
        mean_f1 = np.mean([i for i in Final_evalu(df.iloc[-100:]).F1.values if i != 0])
        print('mean f1 of test 100:{}'.format(mean_f1))

####### batch i:0 #######
loss = nan
mean f1 of train data:0.10051992111170938
mean f1 of test 100:0.058252421415779444


KeyboardInterrupt: 

# personalRank: pinsage 中重要性采样技巧；


In [1]:
def getResource(csvpath):
    '''
    获取原始数据
    :param csvpath: csv路径
    :return: frame
    '''
    frame = pd.read_csv(csvpath)
    return frame

In [2]:
def getUserGraph(frame, userID=1):
    '''
    获取目标用户二分图, 不计权重
    :param frame: ratings数据
    :param userID: 目标ID
    :return: 二分图字典
    '''
    print(userID)
    itemList = list(set(frame[frame['UserID']==userID]['MovieID']))
    graphDict = {'i'+str(item): 1 for item in itemList}
    return graphDict

In [3]:
def getItemGraph(frame, itemID=1):
    '''
    获取目标物品二分图, 不计权重
    :param frame: ratings数据
    :param userID: 目标ID
    :return: 二分图字典
    '''
    print(itemID)
    userList = list(set(frame[frame['MovieID']==itemID]['UserID']))
    graphDict = {'u'+str(user): 1 for user in userList}
    return graphDict

In [4]:
def initGraph(frame):
    '''
    初始化二分图
    :param frame: ratings数据集
    :return: 二分图
    '''
    userList = list(set(frame['UserID']))
    itemList = list(set(frame['MovieID']))
    G = {'u'+str(user): getUserGraph(frame, user) for user in userList}
    for item in itemList: 
        G['i'+str(item)] = getItemGraph(frame, item)
    return G

In [5]:
def personalRank(G, alpha, userID, iterCount=20):
    '''
    随机游走迭代
    :param G: 二分图
    :param alpha: 随机游走的概率
    :param userID: 目标用户
    :param iterCount: 迭代次数
    :return: series
    '''
    rank = {g: 0 for g in G.keys()}
    rank['u'+str(userID)] = 1                                       #根节点为起点选择概率为1,其他顶点为0
    for k in range(iterCount):
        tmp = {g: 0 for g in G.keys()}
        for i, ri in G.items():                                     #遍历每一个顶点
            for j, wij in ri.items():                               #遍历每个顶点连接的顶点
                tmp[j] += alpha * rank[i] / len(ri)
        tmp['u' + str(userID)] += 1 - alpha                         #根顶点r=1，加上1-alpha
        rank = tmp
    series = pd.Series(list(rank.values()), index=list(rank.keys()))
    series = series.sort_values(ascending=False)
    return series                                                   #返回排序后的series

In [6]:
def recommend(frame, series, userID, TopN=10):
    '''
    推荐TopN个用户没有评分的物品
    :param frame: ratings数据
    :param series: series
    :param userID: 目标用户
    :param TopN: TopN
    :return: 推荐物品
    '''
    itemList = ['i'+str(i) for i in list(set(frame[frame['UserID']==userID]['MovieID']))]
    recommendList = [{u: series[u]} for u in list(series.index) if u not in itemList and 'u' not in u]
    return recommendList[:TopN]

In [None]:
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

def sample_baseon_cls(df, split_rate = 0.2):
    '''
    train test split based on diff cls;
    df with type: 
        ##......sms.......#......cls......##
        ##'sms example 1.'#  交易流水＿转账 ##
        ##'sms example 2.'#  交易流水＿转账 ##
        ##'sms example 3.'#  交易流水＿余额 ##
        ##'sms example 4.'#  交易流水＿其他 ##
    '''
    test_idx_list = []
    classes = set(df.cls)
    labels = df.cls
    for i in classes:
        sample_all_idxs = np.where(labels==i)[0]
        num_samples = int(split_rate*len(sample_all_idxs))
        if num_samples == 0:
            print('Warning! MAKE SURE U HAVE SUFFICIENT SAMPLES FOR CLASS :{}'.format(i))
        sample_idxs = np.random.choice(sample_all_idxs, size = num_samples)
        print('class_name:{}, num_samples:{}, num_test_sample:{}'.format(i, len(sample_all_idxs), len(sample_idxs)))
        test_idx_list.append(sample_idxs)
    test_idx_list = list(flatten(test_idx_list))
    df.index = range(len(df))
    test = df[df.index.isin(test_idx_list)]
    train = df[~df.index.isin(test_idx_list)]
    return train, test

In [None]:
train, test = sample_baseon_cls(df3)