In [64]:
import functools
import json
import time

import jieba
import numpy as np
import pandas as pd
import sklearn
from scipy import io
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
def logtime(func):
    """
    Decorator that reports how long the wrapped function takes to run.

    After every call it prints the elapsed time in seconds, followed by the
    local wall-clock timestamps at which the call started and finished.
    Nothing is printed if ``func`` raises; the exception propagates unchanged.

    Parameter:
        func - the function to be timed
    Return:
        wrapper - the decorated function; it returns whatever ``func`` returns
    """
    # functools.wraps copies func's __name__/__doc__ onto the wrapper so that
    # introspection (help(), tracebacks, stacked decorators) still sees the
    # original function instead of "wrapper".
    @functools.wraps(func)
    def wrapper(*args,**kwargs):
        started_at = time.time()
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments during the call.
        tick = time.perf_counter()
        result = func(*args,**kwargs)
        elapsed = time.perf_counter() - tick
        finished_at = time.time()
        print("完成函数{name}, 运行时间 {totaltime:.3f}s".format(name=func.__name__,totaltime=elapsed))
        start_text = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(started_at))
        end_text = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(finished_at))
        print("开始时间 : %s \n结束时间 : %s "%(start_text,end_text))
        return result
    return wrapper

In [66]:
def load_data(rawdata,n,random_state=None):
    """
    Load a header-less two-column CSV and return a random sample of its rows.

    Parameter:
        rawdata - path (or anything else pd.read_csv accepts) of the raw CSV;
                  the file is read without a header row and its two columns
                  are named "label" and "content"
        n - number of rows to draw (without replacement)
        random_state - optional seed forwarded to DataFrame.sample; pass a
                       fixed value to get the same sample on every run.
                       The default (None) keeps the sampling unseeded.
    Return:
        data - DataFrame with columns ["label", "content"] holding n rows
    Raises:
        ValueError - if the file does not have exactly two columns, or if
                     n is larger than the number of rows in the file
    """
    alldata = pd.read_csv(rawdata,header=None)
    alldata.columns = ["label","content"]
    return alldata.sample(n,random_state=random_state)

In [67]:
class MessageCountVectorizer(sklearn.feature_extraction.text.CountVectorizer):
    """CountVectorizer variant that splits every document with jieba.

    build_analyzer is overridden without calling the parent implementation,
    so each document is handed directly to jieba.cut.
    """
    def build_analyzer(self):
        """Return a callable that maps one document to the result of jieba.cut."""
        def _segment(text):
            return jieba.cut(text)
        return _segment

In [68]:
@logtime
def vect_data(content,label,path):
    """
    Vectorize message texts and their labels, and save the results to disk.

    Three files are written; their names are ``path`` with a trailing ".csv"
    removed, plus one of these suffixes:
        _content_vector.mtx  - the document-term count matrix (Matrix Market)
        _label_vector.json   - the labels as a JSON list
        _feature_words.json  - the vocabulary as a JSON list
    Parameter:
        content - iterable of message texts to vectorize
        label - labels aligned with content; must provide .tolist()
        path - path of the source CSV, used only to derive output file names
    Return:
        vect_result - vector representation of the messages
        label - the labels as a plain list
        words - the vocabulary as a plain list
    Modify:
        2017-12-22

    """
    # Strip only a trailing ".csv"; splitting on the first ".csv" occurrence
    # would cut the name short if ".csv" also appears earlier in the path.
    prefix = path[:-len('.csv')] if path.endswith('.csv') else path

    # Drop terms found in more than 90% of the documents or in fewer than 2.
    vect = MessageCountVectorizer(max_df=0.9,min_df=2)
    vect_result = vect.fit_transform(content)
    io.mmwrite(prefix+"_content_vector.mtx",vect_result)

    label = label.tolist()
    with open(prefix+'_label_vector.json', 'w') as f:
        json.dump(label, f)

    words = vect.get_feature_names_out().tolist()
    print("使用了%d条短信,词汇表长度:%s"%(len(label),len(words)))
    with open(prefix+'_feature_words.json', 'w') as f:
        json.dump(words, f)

    # The docstring has always promised these three values; return them so
    # callers can use the results without re-reading the files.
    return vect_result, label, words

In [69]:
def gen_vect(rawdata_path):
    """
    Read a CSV file and generate its vector files through vect_data.

    Parameter:
        rawdata_path - path of a CSV whose header row provides the "label"
                       and "content" columns; it is also passed to vect_data,
                       which derives the output file names from it
    """
    df = pd.read_csv(rawdata_path)
    vect_data(df['content'],df['label'],rawdata_path)

In [70]:
def data_sample(rawdata1,rawdata2,rawdata3):
    """
    Draw a random sample from each raw data file and save the samples as CSV.

    The three inputs are sampled with 10000, 15000 and 20000 rows
    respectively and written to ../data/sample1.csv, ../data/sample2.csv
    and ../data/sample3.csv (with a header row, without the index).

    Parameter:
        rawdata1, rawdata2, rawdata3 - paths of the raw CSV files, each
                                       loaded through load_data
    Return:
        list with the paths of the three sample files that were written
    """
    sources = (rawdata1,rawdata2,rawdata3)
    sizes = (10000,15000,20000)
    targets = ['../data/sample1.csv','../data/sample2.csv','../data/sample3.csv']

    # Load everything first so that no output file is touched if a load fails.
    samples = [load_data(src,n=size) for src,size in zip(sources,sizes)]

    # Hand pandas the path rather than a text handle opened with open(..., 'w'):
    # pandas then writes UTF-8 and manages newlines itself, so the files read
    # back correctly with pd.read_csv on every platform.
    for sample,target in zip(samples,targets):
        sample.to_csv(target,index=False)
    return targets

In [71]:
def main():
    """Sample the three raw data files, then generate vector files for each sample."""
    raw_files = ['../rawdata/spam.csv','../rawdata/sms_pub1.csv','../rawdata/sms_pub2.csv']
    for sample_path in data_sample(*raw_files):
        gen_vect(sample_path)

In [72]:
# Entry point: run the whole pipeline when this code is executed directly
# (as a script or a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

  alldata = pd.read_csv(rawdata,header=None)
  alldata = pd.read_csv(rawdata,header=None)
  alldata = pd.read_csv(rawdata,header=None)


使用了10000条短信,词汇表长度:10166
完成函数vect_data, 运行时间 0.997s
开始时间 : 2024-03-20 09:12:37 
结束时间 : 2024-03-20 09:12:38 
使用了15000条短信,词汇表长度:13680
完成函数vect_data, 运行时间 1.376s
开始时间 : 2024-03-20 09:12:38 
结束时间 : 2024-03-20 09:12:39 
使用了20000条短信,词汇表长度:16566
完成函数vect_data, 运行时间 1.828s
开始时间 : 2024-03-20 09:12:39 
结束时间 : 2024-03-20 09:12:41 
