In [6]:
import json
import os
import re
import collections
import numpy as np
import pandas as pd
import time
import tensorflow as tf
import LAC
from sklearn.feature_selection import RFE
import happybase
from sklearn.cluster import KMeans 
import joblib
import sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
tf.config.set_visible_devices([], 'GPU')

In [2]:
sklearn.__version__

'0.23.2'

In [3]:
import tensorflow as tf

def build_model(
    input_number:int,
    sentence_maxlen:int,
    vocab_size:int,
    tag_size:int,
    embedding_dim:int,
    embedding_matrix=None,
    is_embedding_training:bool=True,
    embedding_dropout_rate:float = 0.0,
    learning_rate = 1e-3,
    ):
    """建立模型
    input:
        sentence_maxlen : 句子的长度
        vocab_size : 词的个数
        tag_size : 分类的个数
        embedding_dim : word2vec训练时设置的向量长度
        embedding_matrix : word2vec词向量矩阵
        is_embedding_training : embedding层是否加入训练
        embedding_dropout_rate : embedding层dropout的比率
    """

    input_list =[ 
        tf.keras.layers.Input(shape=(sentence_maxlen,),name="input{}".format(i))
        for i in range(input_number)
    ]
    #embedding层
    if not (embedding_matrix is None):
        embedding_layer = tf.keras.layers.Embedding(
            input_dim = vocab_size,
            output_dim = embedding_dim,
            embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
            trainable = is_embedding_training,
            input_length = sentence_maxlen,
        )
    else:
        embedding_layer = tf.keras.layers.Embedding(
            input_dim = vocab_size,
            output_dim = embedding_dim,
            trainable = is_embedding_training,
            input_length = sentence_maxlen,
        )
    output_list = [
        embedding_layer(input_list[i]) for i in range(input_number)
    ]

    embedding_dropout_layer = tf.keras.layers.Dropout(embedding_dropout_rate)
    cnn_layer0 = tf.keras.layers.Conv1D(filters=128, kernel_size=10, strides=1, padding='valid',activation="tanh")
    cnn_layer1 = tf.keras.layers.MaxPool1D(2, padding='valid')
    cnn_layer2 = tf.keras.layers.Flatten()
    for layer in [embedding_dropout_layer,cnn_layer0,cnn_layer1,cnn_layer2]:
        output_list = [
            layer(output_list[i]) for i in range(input_number)
        ]
    output = tf.keras.layers.Concatenate()(output_list)
    
    # output = tf.keras.layers.Concatenate()(output_list)
    # output = tf.keras.layers.Dropout(embedding_dropout_rate)(output)
    # output = tf.keras.layers.Conv1D(filters=512, kernel_size=20, strides=1, padding='valid',activation="tanh")(output)
    # output = tf.keras.layers.MaxPool1D(2, padding='valid')(output)
    # output = tf.keras.layers.Flatten()(output)

    output = tf.keras.layers.Dropout(0.5)(output)
    #output = tf.keras.layers.Dense(16, activation='tanh',kernel_regularizer=tf.keras.regularizers.l2())(output)

    if tag_size > 2:
        output = tf.keras.layers.Dense(tag_size, activation='softmax',use_bias=True,kernel_regularizer=tf.keras.regularizers.l2())(output)
        # output = tf.keras.layers.Dense(tag_size, activation='softmax',use_bias=True)(output)
        print("这是一个多分类模型")
    elif tag_size == 2:
        output = tf.keras.layers.Dense(1, activation='sigmoid',use_bias=True,kernel_regularizer=tf.keras.regularizers.l2())(output)
        # output = tf.keras.layers.Dense(1, activation='sigmoid',use_bias=True)(output)
        print("这是一个二分类模型")
    else:
        raise Exception("类别错误")

    model = tf.keras.Model(inputs=input_list, outputs=output, name='multi_textcnn')

    if tag_size > 2:
        metric_list = [
            tf.keras.metrics.SparseCategoricalAccuracy(),
        ]
        model.compile(
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                clipnorm=1.0,
                clipvalue=0.5,
            ),
            metrics = metric_list,
        )
    else:
        metric_list = [
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.AUC(num_thresholds=10000)
        ]
        model.compile(
            loss = tf.keras.losses.BinaryCrossentropy(),
            optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3,
                clipnorm=1.0,
                clipvalue=0.5,
            ),
            metrics = metric_list,
        )
    return model

In [4]:
import json
import gensim
import os
import LAC
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.utils

APP_DIR  = os.path.dirname(os.path.realpath('__file__'))

def trans_gensim_word2vec2tf_embedding(word2vector_file_path:str):
    """把gensim的word2vec结果转化为tf.keras.layers.Embedding需要的结果
    """

    word2vec_model = gensim.models.Word2Vec.load(word2vector_file_path)

    #所有的词
    word_list = [word for word, word_info in word2vec_model.wv.vocab.items()]

    #词到index的映射
    word2index_dict = {"<PADDING>": 0, "<UNK>":1}

    #保存特殊词的padding
    specical_word_count = len(word2index_dict)

    #词到词向量的映射
    word2vector_dict = {}

    #初始化embeddings_matrix

    embeddings_matrix = np.zeros((len(word_list) + specical_word_count, word2vec_model.vector_size))
    #初始化unk为-1,1分布
    embeddings_matrix[word2index_dict["<UNK>"]] = (1 / np.sqrt(len(word_list) + specical_word_count) * (2 * np.random.rand(word2vec_model.vector_size) - 1))

    for i,word in enumerate(word_list):
        #从0开始
        word_index = i + specical_word_count
        word2index_dict[str(word)] = word_index
        word2vector_dict[str(word)] = word2vec_model.wv[word] # 词语：词向量
        embeddings_matrix[word_index] = word2vec_model.wv[word]  # 词向量矩阵

    #写入文件
    with open(os.path.join(APP_DIR,"data","word2index.json"),"w",encoding="utf8") as f:
        json.dump(word2index_dict,f,ensure_ascii=False)

    return embeddings_matrix,word2vector_dict,word2index_dict


def trans2index(word2index_dict,word):
    """转换"""
    if word in word2index_dict:
        return word2index_dict[word]
    else:
        if "<UNK>" in word2index_dict:
            return word2index_dict["<UNK>"]
        else:
            raise ValueError("没有这个值，请检查")


def trans_data2tf_data(data_file_path:str,x_max_length:int=None,word2index_dict=None):
    """把data文件转化为tf.data
    """

    tag2index_dict = {}
    tag_index_count = len(tag2index_dict)
    lac = LAC.LAC(mode="seg")

    df = pd.read_csv(data_file_path)

    x_list = []
    for doc in df["content"]:
        word_list = lac.run(doc)
        x_list.append([trans2index(word2index_dict,word) for word in word_list])
    x_npa = np.array(x_list)

    y_list = []
    for tag in df["tag"]:
        tag = tag.strip()
        if not (tag in tag2index_dict):
            tag2index_dict[tag] = tag_index_count
            tag_index_count += 1
        y_list.append(tag2index_dict[tag])
    y_npa = np.array(y_list,dtype=np.uint8)

    print("x_list[:1]:{}".format(x_list[:1]))
    print("y_list[:1]:{}".format(y_list[:1]))

    #写入文件
    with open(os.path.join(APP_DIR,"data/tag2index.json"),"w",encoding="utf8") as f:
        json.dump(tag2index_dict,f,ensure_ascii=False)

    if not x_max_length:
        x_max_length0 = np.max(np.array([len(v) for v in x_list]))
        x_max_length = int(np.max(np.percentile(np.array([len(v) for v in x_list]),99.7)))
        print("数据集中最长的句子长度为:{},设定的最长的句子长度为:{}".format(x_max_length0,x_max_length))

    x_npa = tf.keras.preprocessing.sequence.pad_sequences(x_npa,maxlen=x_max_length,dtype=np.int32,truncating="post", padding='post',value=0)

    x_npa,y_npa = sklearn.utils.shuffle(x_npa,y_npa,random_state=0)
    print("x_npa[:1]:{}".format(x_npa[:1]))
    print("y_npa[:1]:{}".format(y_npa[:1]))
    print("x_npa.shape = {}".format(x_npa.shape))
    print("y_npa.shape = {}".format(y_npa.shape))

    return x_npa,y_npa,tag2index_dict



def trans_tokenize_data2tf_data(data_file_path:str,x_max_length:int=None,word2index_dict=None):
    """把已经分好词的data文件转化为tf.data
    """

    tag2index_dict = {}
    tag_index_count = len(tag2index_dict)
    lac = LAC.LAC(mode="seg")

    x_list = []
    y_list = []
    with open(data_file_path) as f:
        for line in f:
            temp_dict = json.loads(line.strip())
            word_list = temp_dict["content_tokenize"]
            tag = temp_dict["tag"].strip()
            if not (tag in tag2index_dict):
                tag2index_dict[tag] = tag_index_count
                tag_index_count += 1
            x_list.append([trans2index(word2index_dict,word) for word in word_list])
            y_list.append(tag2index_dict[tag])
    x_npa = np.array(x_list)
    y_npa = np.array(y_list,dtype=np.uint8)

    print("x_list[:1]:{}".format(x_list[:1]))
    print("y_list[:1]:{}".format(y_list[:1]))

    #写入文件
    with open(os.path.join(APP_DIR,"data/tag2index.json"),"w",encoding="utf8") as f:
        json.dump(tag2index_dict,f,ensure_ascii=False)

    if not x_max_length:
        x_max_length0 = np.max(np.array([len(v) for v in x_list]))
        x_max_length = int(np.max(np.percentile(np.array([len(v) for v in x_list]),99.7)))
        print("数据集中最长的句子长度为:{},设定的最长的句子长度为:{}".format(x_max_length0,x_max_length))

    x_npa = tf.keras.preprocessing.sequence.pad_sequences(x_npa,maxlen=x_max_length,dtype=np.int32,truncating="post", padding='post',value=0)

    x_npa,y_npa = sklearn.utils.shuffle(x_npa,y_npa,random_state=0)
    print("x_npa[:1]:{}".format(x_npa[:1]))
    print("y_npa[:1]:{}".format(y_npa[:1]))
    print("x_npa.shape = {}".format(x_npa.shape))
    print("y_npa.shape = {}".format(y_npa.shape))

    return x_npa,y_npa,tag2index_dict


def trans_multi_input_tokenize_data2npa(data_file_path:str,x_max_length:int=None,word2index_dict=None):
    """把已经分好词的data文件转化为tf.data , 多输入版本
    """

    tag2index_dict = {}
    tag_index_count = len(tag2index_dict)

    x_list = []
    y_list = []
    with open(data_file_path) as f:
        for line in f:
            temp_dict = json.loads(line.strip())
            text_tokenize_list = temp_dict["all_content_tokenize"]
            tag = temp_dict["tag"].strip()
            if not (tag in tag2index_dict):
                tag2index_dict[tag] = tag_index_count
                tag_index_count += 1
            x_list.append([[trans2index(word2index_dict,word) for word in word_list] for word_list in text_tokenize_list])
            y_list.append(tag2index_dict[tag])
    y_npa = np.array(y_list,dtype=np.uint8)

    print("x_list[:1]:{}".format(x_list[:1]))
    print("y_list[:1]:{}".format(y_list[:1]))

    #写入文件
    with open(os.path.join(APP_DIR,"data/tag2index.json"),"w",encoding="utf8") as f:
        json.dump(tag2index_dict,f,ensure_ascii=False)

    if not x_max_length:
        x_max_length0 = np.max(np.array([len(v) for v in x_list]))
        x_max_length = int(np.max(np.percentile(np.array([len(v) for v in x_list]),99.7)))
        print("数据集中最长的句子长度为:{},设定的最长的句子长度为:{}".format(x_max_length0,x_max_length))
    
    for i in range(len(x_list)):
        x_list[i] = tf.keras.preprocessing.sequence.pad_sequences(x_list[i],maxlen=x_max_length,dtype=np.int32,truncating="post", padding='post',value=0)
    x_npa = np.array(x_list,dtype=np.int32)

    x_npa,y_npa = sklearn.utils.shuffle(x_npa,y_npa,random_state=0)
    print("x_npa[:1]:{}".format(x_npa[:1]))
    print("y_npa[:1]:{}".format(y_npa[:1]))
    print("x_npa.shape = {}".format(x_npa.shape))
    print("y_npa.shape = {}".format(y_npa.shape))

    return x_npa,y_npa,tag2index_dict

In [5]:
import os
import time
import collections
import sklearn.model_selection
import gensim
import numpy as np
import tensorflow as tf
#from utils import trans_gensim_word2vec2tf_embedding,trans_data2tf_data,trans_tokenize_data2tf_data,trans_multi_input_tokenize_data2npa
#from model_multi_textcnn import build_model

APP_DIR  = os.path.dirname(os.path.realpath('__file__'))
if not os.path.exists(os.path.join(APP_DIR,"data")):
    os.makedirs(os.path.join(APP_DIR,"data"))

def split_train_eval_test_dataset(dataset):
    """区分训练验证测试集
    """
    dataset_size = tf.data.experimental.cardinality(dataset).numpy()
    print("总共有数据{}条".format(dataset_size))
    dataset = dataset.shuffle(dataset_size,seed=1)
    train_size = int(0.6 * dataset_size)
    eval_size = int(0.2 * dataset_size)
    test_size = int(0.2 * dataset_size)

    train_dataset = dataset.take(train_size)
    test_dataset = dataset.skip(train_size)
    eval_dataset = test_dataset.skip(eval_size)
    test_dataset = test_dataset.take(test_size)
    return train_dataset.prefetch(tf.data.experimental.AUTOTUNE), \
        eval_dataset.prefetch(tf.data.experimental.AUTOTUNE), \
        test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

def split_train_eval_test_npa(x_npa,y_npa):
    x_train,x_test,y_train,y_test =  sklearn.model_selection.train_test_split(x_npa, y_npa, test_size=0.2, random_state=24)
    return x_train,y_train,x_test,y_test

def build_model_callback():
    callback_path = os.path.join(APP_DIR,"model_callback","weights.{epoch:02d}-{val_loss:.2f}.hdf5")
    if not os.path.exists(os.path.dirname(callback_path)):
        os.makedirs(os.path.dirname(callback_path))

    model_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=callback_path,
        monitor='val_loss',
        verbose=1,
        save_best_only=False,
        save_weights_only=True,
        mode='auto',
        save_freq='epoch',
    )

    tf_board_dir = os.path.join(APP_DIR,"model_tensorboard")
    if not os.path.exists(os.path.dirname(tf_board_dir)):
         os.makedirs(os.path.dirname(tf_board_dir))

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tf_board_dir,histogram_freq=1,update_freq='batch')
    return model_callback,tensorboard_callback


if __name__ == "__main__":

    input_number = 10
    #句子的最大长度
    sentence_maxlen = 512
    #训练次数
    epochs = 7
    #批大小
    batch_size = 64
    #学习率
    learning_rate = 1e-3
    # learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    #     1e-3, decay_steps=100, decay_rate=0.1, staircase=False
    # )

    #embedding layer是否参加训练
    is_embedding_training = False
    # is_embedding_training = True
    #embedding的dropout比率
    #embedding_dropout_rate = 0.35
    embedding_dropout_rate = 0.2


    #数据文件
    #data_csv = os.path.join(APP_DIR,"data","最终数据集.csv")
    #data_jsonl = os.path.join(APP_DIR,"data","最终数据集tokenize.jsonl")
    #data_jsonl = "/mnt1/zhaodachuan/data/predict_user_attribute20200911/raw_data/user_profile/age/最终数据集tokenize.jsonl"
    data_jsonl = r"/mnt/d/zourui/predict_user_attribute20201214/user_attribute20201231/user_profile/gender/最终数据集tokenize.jsonl"
    #word2vec路径
    #word2vector_file_path = os.path.join(APP_DIR,"data","word2vector.bin")
    word2vector_file_path = r"/home/zourui/data/dim256/word2vector.bin"
    #模型保存路径
    model_path = os.path.join(APP_DIR,"model_output2")
    ###以上是需要修改的部分

    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))

    #导入gensim的word2vector
    embedding_matrix,word2vector_dict,word2index_dict = trans_gensim_word2vec2tf_embedding(word2vector_file_path)
    vocab_size,embedding_dim = embedding_matrix.shape

    #划分训练集+验证集，测试集
    #x_npa,y_npa,tag2index_dict = trans_data2tf_data(data_csv,sentence_maxlen,word2index_dict)
    x_npa,y_npa,tag2index_dict = trans_multi_input_tokenize_data2npa(data_jsonl,sentence_maxlen,word2index_dict)
    class_weight_dict = {tag:np.sqrt(len(y_npa)/number) for tag,number in enumerate(np.bincount(y_npa))}
    x_train_eval,y_train_eval,x_test,y_test = split_train_eval_test_npa(x_npa,y_npa)
    print(x_train_eval[:3])
    print(y_train_eval[:3])
    print(class_weight_dict)
    print(collections.Counter(y_train_eval))
    print(collections.Counter(y_test))
    tag_size = len(tag2index_dict)  

x_list[:1]:[[[1696, 18263, 2351, 433, 11, 19592, 12277, 689, 1711, 129, 0, 1, 5936, 5432, 9802, 112, 3533, 112, 1, 112, 3533, 112, 28247, 44095, 427, 18, 1, 11, 137, 12487, 485, 4979, 483, 485, 13348, 19592, 12277, 6124, 5013, 149, 491, 493, 8, 162, 495, 4256, 432, 11, 22112, 19592, 12277, 22112, 112, 850, 6498, 7913, 244, 1, 5831, 72257, 18, 483, 2705, 495, 895, 9837, 52992, 11, 234, 19592, 41044, 8, 6405, 283, 9786, 1552, 81, 27893, 11, 1732, 7727, 1538, 237, 483, 485, 13348, 19592, 12277, 6124, 5013, 149, 491, 493, 8, 162, 495, 67, 55, 741, 4355, 12, 18394, 81, 4488, 8, 944, 1400, 186, 2292, 57, 8, 162, 18, 4326, 7143, 2, 1, 47559, 475, 2388, 11, 19592, 12277, 1696, 283, 9786, 1552, 81, 27893, 433, 11, 1087, 3191, 1688, 18394, 8, 283, 1696, 223, 1083, 20144, 11, 32, 6, 1566, 19592, 12277, 750, 8, 810, 25507, 18, 1696, 18263, 1699, 179, 27893, 2582, 54, 1675, 325, 650, 1, 1696, 7968, 9786, 1281, 1322, 1494, 179, 1, 1, 1, 8, 483, 1, 495, 109, 53455, 162, 11, 46654, 1696, 81, 28746, 11

In [1]:
import json
import gensim
import os
import LAC
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.utils
import time
import collections
import sklearn.model_selection

APP_DIR  = os.path.dirname(os.path.realpath('__file__'))
class Number_Transform(object):
    
    def trans_gensim_word2vec2tf_embedding(self,word2vector_file_path):
        """把gensim的word2vec结果转化为tf.keras.layers.Embedding需要的结果
        """
        word2vec_model = gensim.models.Word2Vec.load(word2vector_file_path)
        #所有的词
        word_list = [word for word, word_info in word2vec_model.wv.vocab.items()]
        #词到index的映射
        word2index_dict = {"<PADDING>": 0, "<UNK>":1}
        #保存特殊词的padding
        specical_word_count = len(word2index_dict)
        #词到词向量的映射
        word2vector_dict = {}
        #初始化embeddings_matrix
        embeddings_matrix = np.zeros((len(word_list) + specical_word_count, word2vec_model.vector_size))
        #初始化unk为-1,1分布
        embeddings_matrix[word2index_dict["<UNK>"]] = (1 / np.sqrt(len(word_list) + specical_word_count) * (2 * np.random.rand(word2vec_model.vector_size) - 1))
        for i,word in enumerate(word_list):
            #从0开始
            word_index = i + specical_word_count
            word2index_dict[str(word)] = word_index
            word2vector_dict[str(word)] = word2vec_model.wv[word] # 词语：词向量
            embeddings_matrix[word_index] = word2vec_model.wv[word]  # 词向量矩阵
        #写入文件
        with open(os.path.join(APP_DIR,"data","word2index.json"),"w",encoding="utf8") as f:
            json.dump(word2index_dict,f,ensure_ascii=False)
        return embeddings_matrix,word2vector_dict,word2index_dict


    def trans2index(self,word2index_dict,word):
        """转换"""
        if word in word2index_dict:
            return word2index_dict[word]
        else:
            if "<UNK>" in word2index_dict:
                return word2index_dict["<UNK>"]
            else:
                raise ValueError("没有这个值，请检查")

    def trans_multi_input_tokenize_data2npa(self,data_file_path,x_max_length,word2index_dict):
        """把已经分好词的data文件转化为tf.data , 多输入版本
        """
        tag2index_dict = {}
        tag_index_count = len(tag2index_dict)
        x_list = []
        y_list = []
        with open(data_file_path) as f:
            for line in f:
                temp_dict = json.loads(line.strip())
                text_tokenize_list = temp_dict["all_content_tokenize"]
                tag = temp_dict["tag"].strip()
                if not (tag in tag2index_dict):
                    tag2index_dict[tag] = tag_index_count
                    tag_index_count += 1
                x_list.append([[Number_Transform.trans2index(self,word2index_dict,word) for word in word_list] for word_list in text_tokenize_list])
                y_list.append(tag2index_dict[tag])
        y_npa = np.array(y_list,dtype=np.uint8)

      #  print("x_list[:1]:{}".format(x_list[:1]))
      #  print("y_list[:1]:{}".format(y_list[:1]))

        #写入文件
        with open(os.path.join(APP_DIR,"data/tag2index.json"),"w",encoding="utf8") as f:
            json.dump(tag2index_dict,f,ensure_ascii=False)

        if not x_max_length:
            x_max_length0 = np.max(np.array([len(v) for v in x_list]))
            x_max_length = int(np.max(np.percentile(np.array([len(v) for v in x_list]),99.7)))
            print("数据集中最长的句子长度为:{},设定的最长的句子长度为:{}".format(x_max_length0,x_max_length))

        for i in range(len(x_list)):
            x_list[i] = tf.keras.preprocessing.sequence.pad_sequences(x_list[i],maxlen=x_max_length,dtype=np.int32,truncating="post", padding='post',value=0)
        x_npa = np.array(x_list,dtype=np.int32)

        x_npa,y_npa = sklearn.utils.shuffle(x_npa,y_npa,random_state=0)
    #     print("x_npa[:1]:{}".format(x_npa[:1]))
    #     print("y_npa[:1]:{}".format(y_npa[:1]))
    #     print("x_npa.shape = {}".format(x_npa.shape))
    #     print("y_npa.shape = {}".format(y_npa.shape))

        return x_npa,y_npa,tag2index_dict

    def out_x_y(self,input_number,sentence_maxlen,data_jsonl,word2vector_file_path):

        input_number = input_number
        #句子的最大长度
        sentence_maxlen = sentence_maxlen

       # data_jsonl = r"/mnt/d/zourui/predict_user_attribute20201214/age/raw_data/user_profile/position/最终数据集tokenize8.jsonl"
        #word2vec路径
        #word2vector_file_path = os.path.join(APP_DIR,"data","word2vector.bin")
        word2vector_file_path = word2vector_file_path

        #导入gensim的word2vector
        embedding_matrix,word2vector_dict,word2index_dict = Number_Transform.trans_gensim_word2vec2tf_embedding(self,word2vector_file_path)
        vocab_size,embedding_dim = embedding_matrix.shape

        #x_npa,y_npa,tag2index_dict = trans_data2tf_data(data_csv,sentence_maxlen,word2index_dict)
        x_npa,y_npa,tag2index_dict = Number_Transform.trans_multi_input_tokenize_data2npa(self,data_jsonl,sentence_maxlen,word2index_dict)
        class_weight_dict = {tag:np.sqrt(len(y_npa)/number) for tag,number in enumerate(np.bincount(y_npa))}
        tag_size = len(tag2index_dict)
        print('转换完毕！')
        return x_npa,y_npa

In [2]:
data_jsonl = "/mnt/d/zourui/predict_user_attribute20201214/user_attribute20201231/user_profile/gender/最终数据集tokenize.jsonl"
word2vector_file_path = r"/home/zourui/data/dim256/word2vector.bin"
input_number = 10
sentence_maxlen = 512
nt = Number_Transform()
x_npa,y_npa = nt.out_x_y(input_number,sentence_maxlen,data_jsonl,word2vector_file_path)

转换完毕！


In [3]:
import json
import os
import re
import collections
import numpy as np
import pandas as pd
import time
import tensorflow as tf
import LAC
from sklearn.feature_selection import RFE
import happybase
from sklearn.cluster import KMeans 
import joblib
import sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

class Gender_Model(object):
    def train(self,x_npa,y_npa):
        y_npa = pd.DataFrame(y_npa)
        y_npa.columns = ['gender']
        x_npa = x_npa.reshape(len(y_npa),5120)
        x_npa = pd.DataFrame(x_npa)
        x_npa.columns = [*["c{}".format(v) for v in range(5120)]]
        gender_data = pd.merge(y_npa,x_npa,on = y_npa.index)
        gender_data = gender_data.drop('key_0',axis = 1)
        temp_0 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==0)
        temp_1 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==1)
        temp_0 = temp_0.drop('gender',axis = 1)
        temp_1 = temp_1.drop('gender',axis = 1)
        #聚类
        model = KMeans(n_clusters=2)
        model.fit(temp_0)
        labels = model.predict(temp_0)
        labels = pd.DataFrame(labels)
        labels.columns = ['b']
        temp_00 = pd.merge(labels,temp_0,on = labels.index)
        temp_00 = temp_00.drop('key_0',axis=1)
        temp_000 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==0)
        temp_000 = temp_000['gender']
        temp_end_0 = pd.merge(temp_000,temp_00,on = temp_00.index)
        temp_end_0 = temp_end_0.drop('key_0',axis = 1)
        a = dict(collections.Counter(temp_end_0['b']))
        b = max(a,key = a.get)
        temp_end_0 = temp_end_0.groupby(temp_end_0.index).filter(lambda x:float(x['b'])==b)
        model = KMeans(n_clusters=2)
        model.fit(temp_1)
        labels = model.predict(temp_1)
        labels = pd.DataFrame(labels)
        labels.columns = ['b']
        temp_11 = pd.merge(labels,temp_1,on = labels.index)
        temp_11 = temp_11.drop('key_0',axis=1)
        temp_111 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==1)
        temp_111 = temp_111['gender']
        temp_end_1 = pd.merge(temp_111,temp_11,on = temp_11.index)
        temp_end_1 = temp_end_1.drop('key_0',axis = 1)
        a = dict(collections.Counter(temp_end_1['b']))
        b = max(a,key = a.get)
        temp_end_1 = temp_end_1.groupby(temp_end_1.index).filter(lambda x:float(x['b'])==b)
        x_npa1 = pd.concat([temp_end_0, temp_end_1], axis=0, ignore_index=True)
        x_npa1 = x_npa1.drop('b',axis = 1)
        x_data = x_npa1.drop('gender',axis = 1)
        y_data = x_npa1['gender']
        ss = StandardScaler()
        x_data = ss.fit_transform(x_data)
        x_data = pd.DataFrame(x_data)
        estimator = SVC(kernel = "linear")
        selector = RFE(estimator,n_features_to_select = 200,step = 64)
        selector = selector.fit(x_data,y_data)
        rank = selector.ranking_
        rank = pd.DataFrame(rank)
        rank.columns = ['rank']
        rank.to_csv(gender_rank,index = False)
        x_T = x_data.T
        rank_data = pd.merge(rank,x_T,on = rank.index)
        rank_data = rank_data.drop('key_0',axis = 1)
        rank_data_200 = rank_data.groupby(rank_data.index).filter(lambda x : float(x['rank'])==1)
        rank_data_200 = rank_data_200.drop('rank',axis = 1)
        rank_data_200 = rank_data_200.T
        all_data = pd.merge(y_data,rank_data_200,on = y_data.index)
        all_data = all_data.drop('key_0',axis = 1)
        data_x = all_data.drop('gender',axis = 1)
        data_y = all_data['gender']
        x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(data_x, data_y, random_state=155)
        svm = SVC(C = 1,kernel = 'linear')
        svm.fit(x_train_data, y_train_data) 
        #lr.fit(x_train,y_train)
        preds = svm.predict(x_test_data)
        print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))
        return svm
    
    def save(self,svm):
        joblib.dump(svm, gender_model)

In [4]:
gender_rank = '/home/zourui/data/predict_user_attribute20200911/raw_data/age/rank/gender_rank.csv'
gender_model = '/home/zourui/data/predict_user_attribute20200911/raw_data/gender/model_svm/gender_model'
model = Gender_Model()
model.train(x_npa,y_npa)

NameError: name 'train_test_split' is not defined

In [7]:
y_npa = pd.DataFrame(y_npa)
y_npa.columns = ['gender']
x_npa = x_npa.reshape(len(y_npa),5120)
x_npa = pd.DataFrame(x_npa)
x_npa.columns = [*["c{}".format(v) for v in range(5120)]]
x_npa

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
0,47,1,62,3374,401,21794,68756,1032,313,14987,...,18888,1022,244,10013,29704,11829,5064,576,6529,4554
1,3907,81,5256,61238,325,0,55,871,8,8315,...,21145,48850,98,11,1304,126,6,541,2467,567
2,88,2357,10781,179,2012,10781,11,34,6663,14281,...,34,8,172,2006,243,12629,15218,82,73020,514
3,1101,1379,55,59914,57,0,7677,27113,908,18090,...,0,0,0,0,0,0,0,0,0,0
4,14408,1583,2327,47199,6870,1046,284,325,0,31305,...,11,13030,8,86045,2003,54,1340,8,22152,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5668,12279,535,30032,19181,129,0,13097,4563,27162,14256,...,19659,2732,8,10133,11,8315,98,54,1,2223
5669,34,19863,2970,11,8801,906,362,906,325,0,...,5213,11,15220,612,5751,781,1011,129,17366,18
5670,1,8626,483,109374,495,35053,129,10605,70896,11,...,0,0,0,0,0,0,0,0,0,0
5671,1,8626,483,109374,495,35053,129,10605,70896,11,...,112,1,98,11,281,40,37223,2476,55,4166


In [8]:
gender_data = pd.merge(y_npa,x_npa,on = y_npa.index)
gender_data = gender_data.drop('key_0',axis = 1)
gender_data

Unnamed: 0,gender,c0,c1,c2,c3,c4,c5,c6,c7,c8,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
0,0,47,1,62,3374,401,21794,68756,1032,313,...,18888,1022,244,10013,29704,11829,5064,576,6529,4554
1,1,3907,81,5256,61238,325,0,55,871,8,...,21145,48850,98,11,1304,126,6,541,2467,567
2,0,88,2357,10781,179,2012,10781,11,34,6663,...,34,8,172,2006,243,12629,15218,82,73020,514
3,1,1101,1379,55,59914,57,0,7677,27113,908,...,0,0,0,0,0,0,0,0,0,0
4,0,14408,1583,2327,47199,6870,1046,284,325,0,...,11,13030,8,86045,2003,54,1340,8,22152,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5668,1,12279,535,30032,19181,129,0,13097,4563,27162,...,19659,2732,8,10133,11,8315,98,54,1,2223
5669,1,34,19863,2970,11,8801,906,362,906,325,...,5213,11,15220,612,5751,781,1011,129,17366,18
5670,0,1,8626,483,109374,495,35053,129,10605,70896,...,0,0,0,0,0,0,0,0,0,0
5671,0,1,8626,483,109374,495,35053,129,10605,70896,...,112,1,98,11,281,40,37223,2476,55,4166


In [9]:
temp_0 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==0)
temp_1 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==1)

In [10]:
temp_0 = temp_0.drop('gender',axis = 1)
temp_1 = temp_1.drop('gender',axis = 1)

In [15]:
#聚类
model = KMeans(n_clusters=2)
model.fit(temp_0)
labels = model.predict(temp_0)
labels = pd.DataFrame(labels)
labels.columns = ['b']
temp_00 = pd.merge(labels,temp_0,on = labels.index)
temp_00 = temp_00.drop('key_0',axis=1)
temp_000 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==0)
temp_000 = temp_000['gender']
temp_end_0 = pd.merge(temp_000,temp_00,on = temp_00.index)
temp_end_0 = temp_end_0.drop('key_0',axis = 1)
a = dict(collections.Counter(temp_end_0['b']))
b = max(a,key = a.get)
temp_end_0 = temp_end_0.groupby(temp_end_0.index).filter(lambda x:float(x['b'])==b)
temp_end_0

KMeans(n_clusters=2)

Unnamed: 0_level_0,gender,c0,c1,c2,c3,c4,c5,c6,c7,c8,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3959,3959,3959,3959,3959,3959,3959,3959,3959,3959,...,3959,3959,3959,3959,3959,3959,3959,3959,3959,3959
1,620,620,620,620,620,620,620,620,620,620,...,620,620,620,620,620,620,620,620,620,620


Unnamed: 0,gender,b,c0,c1,c2,c3,c4,c5,c6,c7,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
0,0,0,47,1,62,3374,401,21794,68756,1032,...,18888,1022,244,10013,29704,11829,5064,576,6529,4554
1,0,0,88,2357,10781,179,2012,10781,11,34,...,34,8,172,2006,243,12629,15218,82,73020,514
2,0,0,14408,1583,2327,47199,6870,1046,284,325,...,11,13030,8,86045,2003,54,1340,8,22152,11
3,0,0,40009,68250,11,13861,17849,2877,129,274,...,1,82,1,82,103128,82,1,82,1,38888
4,0,0,5689,11033,11,21659,112,44192,0,4719,...,54,4433,54150,650,11,18260,235,1,1,3332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4574,0,0,88,2357,10781,179,2012,10781,11,34,...,2377,1,36754,8,55,5476,57,1326,11,127
4575,0,0,51742,11,888,19350,1148,2842,11,15747,...,8,2611,8509,1999,700,211,129,4983,11,84
4576,0,0,1,8626,483,109374,495,35053,129,10605,...,0,0,0,0,0,0,0,0,0,0
4577,0,0,1,8626,483,109374,495,35053,129,10605,...,112,1,98,11,281,40,37223,2476,55,4166


In [18]:
model = KMeans(n_clusters=2)
model.fit(temp_1)
labels = model.predict(temp_1)
labels = pd.DataFrame(labels)
labels.columns = ['b']
temp_11 = pd.merge(labels,temp_1,on = labels.index)
temp_11 = temp_11.drop('key_0',axis=1)
temp_111 = gender_data.groupby(gender_data.index).filter(lambda x : float(x['gender'])==1)
temp_111 = temp_111['gender']
temp_end_1 = pd.merge(temp_111,temp_11,on = temp_11.index)
temp_end_1 = temp_end_1.drop('key_0',axis = 1)
a = dict(collections.Counter(temp_end_1['b']))
b = max(a,key = a.get)
temp_end_1 = temp_end_1.groupby(temp_end_1.index).filter(lambda x:float(x['b'])==b)
temp_end_1

KMeans(n_clusters=2)

Unnamed: 0,gender,b,c0,c1,c2,c3,c4,c5,c6,c7,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
0,1,0,3907,81,5256,61238,325,0,55,871,...,21145,48850,98,11,1304,126,6,541,2467,567
1,1,0,1101,1379,55,59914,57,0,7677,27113,...,0,0,0,0,0,0,0,0,0,0
2,1,0,29940,11,15190,12726,378,8381,8,14336,...,40,1322,44166,45,3973,11,18839,42401,18,1
3,1,0,1,2,12041,8,5658,1968,0,1041,...,109,1,1674,17142,3413,67,16601,8,55,1
4,1,0,31777,13887,16212,1,11,9459,43435,6,...,24509,84,1239,4839,9261,11,87122,8,272,7795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,1,0,35478,46261,34142,12756,11,127,61256,362,...,1,495,67,8,1,244,507,146,11,11367
1090,1,0,6938,2309,11,244,34,2415,9399,2327,...,25677,2177,892,5375,1,11,40,13543,389,19933
1091,1,0,1,1620,11,26303,2205,54,44662,34846,...,1630,119,112,8909,112,8,38,12621,128,877
1092,1,0,12279,535,30032,19181,129,0,13097,4563,...,19659,2732,8,10133,11,8315,98,54,1,2223


In [20]:
x_npa1 = pd.concat([temp_end_0, temp_end_1], axis=0, ignore_index=True)
x_npa1 = x_npa1.drop('b',axis = 1)
x_npa1

Unnamed: 0,gender,c0,c1,c2,c3,c4,c5,c6,c7,c8,...,c5110,c5111,c5112,c5113,c5114,c5115,c5116,c5117,c5118,c5119
0,0,47,1,62,3374,401,21794,68756,1032,313,...,18888,1022,244,10013,29704,11829,5064,576,6529,4554
1,0,88,2357,10781,179,2012,10781,11,34,6663,...,34,8,172,2006,243,12629,15218,82,73020,514
2,0,14408,1583,2327,47199,6870,1046,284,325,0,...,11,13030,8,86045,2003,54,1340,8,22152,11
3,0,40009,68250,11,13861,17849,2877,129,274,25742,...,1,82,1,82,103128,82,1,82,1,38888
4,0,5689,11033,11,21659,112,44192,0,4719,112,...,54,4433,54150,650,11,18260,235,1,1,3332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5005,1,35478,46261,34142,12756,11,127,61256,362,98,...,1,495,67,8,1,244,507,146,11,11367
5006,1,6938,2309,11,244,34,2415,9399,2327,2224,...,25677,2177,892,5375,1,11,40,13543,389,19933
5007,1,1,1620,11,26303,2205,54,44662,34846,0,...,1630,119,112,8909,112,8,38,12621,128,877
5008,1,12279,535,30032,19181,129,0,13097,4563,27162,...,19659,2732,8,10133,11,8315,98,54,1,2223


In [21]:
x_data = x_npa1.drop('gender',axis = 1)
y_data = x_npa1['gender']

In [22]:
ss = StandardScaler()
x_data = ss.fit_transform(x_data)
x_data = pd.DataFrame(x_data)
x_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5110,5111,5112,5113,5114,5115,5116,5117,5118,5119
0,-0.610516,-0.518549,-0.452059,-0.265185,-0.397303,1.114940,3.666932,-0.400477,-0.562985,0.369841,...,1.092467,-0.351205,-0.393607,0.313630,1.553586,0.290339,-0.092980,-0.385196,0.134261,-0.097083
1,-0.608295,-0.383394,0.307817,-0.462313,-0.302614,0.299457,-0.546535,-0.473610,-0.074286,0.325373,...,-0.423552,-0.414728,-0.397494,-0.275338,-0.400928,0.335066,0.530567,-0.416553,5.362681,-0.377197
2,0.167434,-0.427796,-0.291492,2.438769,-0.017078,-0.421393,-0.529803,-0.452286,-0.587074,1.397641,...,-0.425402,0.401038,-0.406348,5.906297,-0.284166,-0.367994,-0.321668,-0.421250,1.362752,-0.412072
3,1.554265,3.396652,-0.455674,0.381851,0.628229,-0.285813,-0.539303,-0.456023,1.394042,-0.573621,...,-0.426206,-0.410092,-0.406726,-0.416862,6.424712,-0.366428,-0.403894,-0.416553,-0.379059,2.283465
4,-0.304883,0.114317,-0.455674,0.862979,-0.414289,2.773452,-0.547210,-0.130294,-0.578454,-0.424471,...,-0.421944,-0.137523,2.516734,-0.375081,-0.416320,0.649892,-0.389525,-0.421694,-0.379059,-0.181811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5005,1.308816,2.135222,1.963891,0.313674,-0.420226,-0.489443,3.207248,-0.449574,-0.579532,4.059415,...,-0.426206,-0.384219,-0.403163,-0.422305,-0.416983,-0.357371,-0.372821,-0.412490,-0.378272,0.375296
5006,-0.237224,-0.386148,-0.455674,-0.458303,-0.418874,-0.320022,0.028867,-0.305579,-0.415914,-0.189471,...,1.638359,-0.278850,-0.358622,-0.027526,-0.416983,-0.370398,-0.401499,0.437887,-0.348549,0.969220
5007,-0.613008,-0.425673,-0.455674,1.149509,-0.291270,-0.494848,2.190181,2.077407,-0.587074,-0.570661,...,-0.295221,-0.407774,-0.400733,0.232424,-0.409619,-0.370566,-0.401622,0.379363,-0.369072,-0.352028
5008,0.052104,-0.487916,1.672531,0.710089,-0.413290,-0.498847,0.255522,-0.141726,1.503326,0.323799,...,1.154461,-0.244082,-0.406348,0.322457,-0.416320,0.093873,-0.397938,-0.418330,-0.379059,-0.258703


In [23]:
estimator = SVC(kernel = "linear")
selector = RFE(estimator,n_features_to_select = 200,step = 64)
selector = selector.fit(x_data,y_data)
rank = selector.ranking_
rank = pd.DataFrame(rank)

In [24]:
rank.columns = ['rank']
rank.to_csv('/home/zourui/data/predict_user_attribute20200911/raw_data/age/rank/gender_rank.csv',index = False)

In [25]:
rank

Unnamed: 0,rank
0,56
1,58
2,25
3,41
4,39
...,...
5115,14
5116,10
5117,35
5118,68


In [26]:
x_T = x_data.T
rank_data = pd.merge(rank,x_T,on = rank.index)
rank_data = rank_data.drop('key_0',axis = 1)
rank_data

Unnamed: 0,rank,0,1,2,3,4,5,6,7,8,...,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009
0,56,-0.610516,-0.608295,0.167434,1.554265,-0.304883,-0.608295,0.211420,3.379665,-0.613008,...,-0.610083,0.069330,-0.452499,-0.613008,-0.241070,1.308816,-0.237224,-0.613008,0.052104,-0.611220
1,58,-0.518549,-0.383394,-0.427796,3.396652,0.114317,-0.383394,0.112711,-0.124499,-0.518148,...,-0.412995,1.931341,-0.517976,0.192852,-0.517918,2.135222,-0.386148,-0.425673,-0.487916,0.620863
2,25,-0.452059,0.307817,-0.291492,-0.455674,-0.455674,0.307817,-0.447309,-0.382302,-0.142551,...,1.424556,-0.456312,-0.431075,-0.456383,-0.080309,1.963891,-0.455674,-0.455674,1.672531,-0.245909
3,41,-0.265185,-0.462313,2.438769,0.381851,0.862979,-0.462313,-0.444729,-0.270429,-0.473234,...,-0.472678,-0.456575,-0.058494,-0.453305,-0.291962,0.313674,-0.458303,1.149509,0.710089,-0.472678
4,39,-0.397303,-0.302614,-0.017078,0.628229,-0.414289,-0.302614,1.983672,-0.401770,4.710487,...,-0.312723,0.821779,-0.393600,-0.420872,-0.420402,-0.420226,-0.418874,-0.291270,-0.413290,0.096419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5115,14,0.290339,0.335066,-0.367994,-0.366428,0.649892,-0.328186,-0.370006,-0.371013,-0.355135,...,-0.370006,-0.293858,-0.310295,-0.359216,-0.116737,-0.357371,-0.370398,-0.370566,0.093873,-0.327348
5116,10,-0.092980,0.530567,-0.321668,-0.403894,-0.389525,0.674817,0.546042,-0.403956,-0.403464,...,0.010126,-0.388972,-0.387744,-0.396034,-0.119755,-0.372821,-0.401499,-0.401622,-0.397938,-0.341871
5117,35,-0.385196,-0.416553,-0.421250,-0.416553,-0.421694,-0.348825,3.604159,-0.421758,-0.172998,...,2.883779,-0.397193,-0.410713,-0.105461,-0.421060,-0.412490,0.437887,0.379363,-0.418330,-0.413570
5118,68,0.134261,5.362681,1.362752,-0.379059,-0.379059,1.058912,0.287282,-0.379137,-0.099045,...,-0.378508,-0.378272,-0.378272,-0.378272,-0.168478,-0.378272,-0.348549,-0.369072,-0.379059,0.986412


In [27]:
rank_data_200 = rank_data.groupby(rank_data.index).filter(lambda x : float(x['rank'])==1)
rank_data_200 = rank_data_200.drop('rank',axis = 1)
rank_data_200 = rank_data_200.T
rank_data_200

Unnamed: 0,17,23,40,65,66,67,92,94,106,110,...,4923,4925,4926,4979,4990,5005,5047,5065,5066,5104
0,-0.353833,-0.011943,1.669264,0.192065,-0.455865,-0.578191,-0.384625,-0.410590,0.169195,-0.442192,...,-0.387930,-0.376418,-0.362028,-0.435021,-0.375088,-0.165503,-0.440385,-0.395481,1.325651,-0.355551
1,-0.423837,-0.352634,-0.379972,3.192932,-0.459455,0.769191,0.411713,-0.006677,-0.354917,-0.441089,...,0.409749,-0.253760,-0.383963,-0.443817,0.856879,-0.389722,-0.439958,-0.404168,4.656676,-0.365593
2,-0.545760,0.436461,-0.414150,-0.456264,0.177456,-0.421243,-0.389302,-0.371292,-0.233455,-0.398126,...,-0.390593,-0.365236,-0.384180,2.682007,-0.272430,-0.287306,-0.440385,-0.391137,-0.339940,-0.361928
3,-0.554240,-0.102187,-0.413629,-0.425594,-0.366637,-0.526933,-0.388887,-0.354065,-0.382702,-0.450629,...,-0.394027,-0.419820,-0.391059,-0.135868,-0.351779,0.184289,-0.386650,-0.404389,-0.355737,-0.253945
4,-0.501456,-0.374160,-0.414150,-0.020804,-0.459953,-0.572966,-0.385099,0.264963,-0.373969,-0.242433,...,1.377112,-0.271722,-0.391795,-0.444576,-0.395889,-0.395340,-0.379696,-0.395481,4.525393,-0.363404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5005,1.889066,-0.370600,-0.305103,1.347605,-0.257370,1.679064,-0.364558,-0.429315,0.481820,1.416583,...,-0.364029,2.936814,-0.368474,-0.418793,-0.383959,-0.370267,-0.321020,1.113245,6.410515,-0.365736
5006,0.292991,-0.365798,-0.390861,-0.456891,-0.447690,-0.404245,-0.358580,-0.427018,-0.382226,-0.288317,...,-0.391855,-0.416116,4.411126,-0.442073,-0.388242,-0.398628,0.568819,-0.403653,-0.366087,-0.277360
5007,-0.554154,1.157748,-0.337770,-0.014469,1.747277,-0.578389,-0.311402,-0.403150,0.317331,-0.449526,...,1.242893,-0.411573,-0.386992,-0.443817,-0.266312,-0.396504,-0.433310,-0.385983,-0.182431,-0.360786
5008,-0.555105,-0.370352,0.052571,-0.338477,-0.448189,-0.146965,-0.149210,-0.425121,-0.256080,-0.385146,...,-0.397952,0.467021,-0.391362,0.323743,1.597883,3.185463,-0.420257,-0.381051,-0.137295,-0.340417


In [28]:
all_data = pd.merge(y_data,rank_data_200,on = y_data.index)
all_data = all_data.drop('key_0',axis = 1)
all_data

Unnamed: 0,gender,17,23,40,65,66,67,92,94,106,...,4923,4925,4926,4979,4990,5005,5047,5065,5066,5104
0,0,-0.353833,-0.011943,1.669264,0.192065,-0.455865,-0.578191,-0.384625,-0.410590,0.169195,...,-0.387930,-0.376418,-0.362028,-0.435021,-0.375088,-0.165503,-0.440385,-0.395481,1.325651,-0.355551
1,0,-0.423837,-0.352634,-0.379972,3.192932,-0.459455,0.769191,0.411713,-0.006677,-0.354917,...,0.409749,-0.253760,-0.383963,-0.443817,0.856879,-0.389722,-0.439958,-0.404168,4.656676,-0.365593
2,0,-0.545760,0.436461,-0.414150,-0.456264,0.177456,-0.421243,-0.389302,-0.371292,-0.233455,...,-0.390593,-0.365236,-0.384180,2.682007,-0.272430,-0.287306,-0.440385,-0.391137,-0.339940,-0.361928
3,0,-0.554240,-0.102187,-0.413629,-0.425594,-0.366637,-0.526933,-0.388887,-0.354065,-0.382702,...,-0.394027,-0.419820,-0.391059,-0.135868,-0.351779,0.184289,-0.386650,-0.404389,-0.355737,-0.253945
4,0,-0.501456,-0.374160,-0.414150,-0.020804,-0.459953,-0.572966,-0.385099,0.264963,-0.373969,...,1.377112,-0.271722,-0.391795,-0.444576,-0.395889,-0.395340,-0.379696,-0.395481,4.525393,-0.363404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5005,1,1.889066,-0.370600,-0.305103,1.347605,-0.257370,1.679064,-0.364558,-0.429315,0.481820,...,-0.364029,2.936814,-0.368474,-0.418793,-0.383959,-0.370267,-0.321020,1.113245,6.410515,-0.365736
5006,1,0.292991,-0.365798,-0.390861,-0.456891,-0.447690,-0.404245,-0.358580,-0.427018,-0.382226,...,-0.391855,-0.416116,4.411126,-0.442073,-0.388242,-0.398628,0.568819,-0.403653,-0.366087,-0.277360
5007,1,-0.554154,1.157748,-0.337770,-0.014469,1.747277,-0.578389,-0.311402,-0.403150,0.317331,...,1.242893,-0.411573,-0.386992,-0.443817,-0.266312,-0.396504,-0.433310,-0.385983,-0.182431,-0.360786
5008,1,-0.555105,-0.370352,0.052571,-0.338477,-0.448189,-0.146965,-0.149210,-0.425121,-0.256080,...,-0.397952,0.467021,-0.391362,0.323743,1.597883,3.185463,-0.420257,-0.381051,-0.137295,-0.340417


In [29]:
data_x = all_data.drop('gender',axis = 1)
data_y = all_data['gender']

In [30]:
from sklearn.model_selection import train_test_split
x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(data_x, data_y, random_state=155)

In [33]:
svm = SVC(C = 1,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))
joblib.dump(svm, '/home/zourui/data/predict_user_attribute20200911/raw_data/gender/model_svm/gender_model')

SVC(C=1, kernel='linear')

准确率为0.822825


['/home/zourui/data/predict_user_attribute20200911/raw_data/gender/model_svm/gender_model']

In [34]:
preds.tolist()

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [40]:
a = dict(collections.Counter(y_test_data))

In [41]:
a

{0: 994, 1: 259}

In [9]:
y_train_data = y_train_data['gender']
y_test_data = y_test_data['gender']

In [10]:
y_train_data.tolist()

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,


In [15]:
def svm(C):
    val = cross_val_score(
        SVC(C=C
           ),
        x_train_data, y_train_data, scoring='accuracy',cv=10).mean()
    return val
t_start = time.time()
params = {
    'C': (0.01,1)
    }
svm_bo = BayesianOptimization(svm,params)
svm_bo.maximize(init_points = 5,n_iter =5)
params_1 = svm_bo
params = svm_bo.max
params = params["params"]
L = list()
L.append(params)
val = svm(**params)
print(params)
print(val)
t_end = time.time()
print(t_end - t_start)
svm_classifier=SVC(C=L[0]['C'],kernel='linear')

# 拟合模型
svm_classifier.fit(x_train_data, y_train_data)

# 使用模型预测
preds = svm_classifier.predict(x_test_data)

print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))
#joblib.dump(svm_classifier, '../code/model_svm')

|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8089  [0m | [0m 0.945   [0m |
| [0m 2       [0m | [0m 0.8089  [0m | [0m 0.7257  [0m |
| [0m 3       [0m | [0m 0.8089  [0m | [0m 0.4749  [0m |
| [0m 4       [0m | [0m 0.8089  [0m | [0m 0.3064  [0m |
| [0m 5       [0m | [0m 0.8089  [0m | [0m 0.9188  [0m |
| [0m 6       [0m | [0m 0.8089  [0m | [0m 0.02039 [0m |
| [0m 7       [0m | [0m 0.8089  [0m | [0m 0.1749  [0m |
| [0m 8       [0m | [0m 0.8089  [0m | [0m 0.08641 [0m |
| [0m 9       [0m | [0m 0.8089  [0m | [0m 0.4605  [0m |
| [0m 10      [0m | [0m 0.8089  [0m | [0m 0.5159  [0m |
{'C': 0.9449518524012581}
0.8088864954432478
6250.681229829788


SVC(C=0.9449518524012581, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

准确率为0.659619


In [11]:
svm = SVC(C = 500,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))

SVC(C=500, kernel='linear')

准确率为0.659619


In [12]:
svm = SVC(C = 1000,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))

SVC(C=1000, kernel='linear')

准确率为0.659619


In [13]:
svm = SVC(C = 1500,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))

SVC(C=1500, kernel='linear')

准确率为0.659619


In [14]:
svm = SVC(C = 0.01,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))

SVC(C=0.01, kernel='linear')

准确率为0.659619


In [15]:
svm = SVC(C = 100,kernel = 'linear')
svm.fit(x_train_data, y_train_data) 
#lr.fit(x_train,y_train)
preds = svm.predict(x_test_data)
print('准确率为%f' %((preds==y_test_data).sum()/float(y_test_data.shape[0])))

SVC(C=100, kernel='linear')

准确率为0.659619


In [20]:
joblib.dump(svm, '/home/zourui/data/predict_user_attribute20200911/raw_data/gender/model_svm/age_svm_model')

['/home/zourui/data/predict_user_attribute20200911/raw_data/gender/model_svm/age_svm_model']