# 基于gensim的doc2vec模型执行情感分类任务
*数据集:* IMDB

In [1]:
import sys
import numpy as np
import gensim
import glob
import os

from gensim.models.doc2vec import Doc2Vec,TaggedDocument, LabeledSentence
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split

# Module-level alias for gensim's LabeledSentence class. This re-binds the
# same name that the `from gensim.models.doc2vec import ...` line above
# already imported.
# NOTE(review): none of the visible cells use LabeledSentence -- confirm
# whether this alias is still needed.
LabeledSentence = gensim.models.doc2vec.LabeledSentence




In [27]:
##读取并预处理数据
def get_dataset(data_dir="D:/sourcecode/ai-dataset/aclImdb/train", max_docs=500):
    """Load IMDB reviews and split the labelled ones into train/test sets.

    Reviews are read from the ``pos``, ``neg`` and ``unsup`` sub-folders of
    ``data_dir``. Every review is tokenised with gensim's
    ``simple_preprocess`` and wrapped in a ``TaggedDocument``.

    Args:
        data_dir: Folder that contains the ``pos``/``neg``/``unsup``
            sub-folders of ``*.txt`` review files.
        max_docs: Maximum number of files read from each sub-folder;
            ``None`` reads every file.

    Returns:
        A tuple ``(x_train, x_test, unsup_reviews, y_train, y_test)``.
        ``x_*`` are lists of ``TaggedDocument``; ``y_*`` are float arrays
        where 1 marks a positive review and 0 a negative one.
    """
    def read_dataset(subset):
        docs = []
        # glob returns files in an OS-dependent order; sorting makes the
        # ``max_docs`` subset reproducible between runs.
        files = sorted(glob.glob(os.path.join(data_dir, subset, '*.txt')))
        for fn in files[:max_docs]:
            with open(fn, 'r', encoding='utf-8') as fin:
                doc = fin.read()
            # File names look like "<id>_<score>.txt".
            justfn = os.path.split(fn)[-1].split('.')[0]
            doc_id, _score = justfn.split('_')
            # ``tags`` must be a list: a bare string is iterated character
            # by character. Ids restart in every sub-folder, so the folder
            # name is prepended to keep each tag unique across the corpus.
            tag = '%s_%s' % (subset, doc_id)
            docs.append(TaggedDocument(simple_preprocess(doc), [tag]))
        return docs

    pos_reviews = read_dataset('pos')
    neg_reviews = read_dataset('neg')
    unsup_reviews = read_dataset('unsup')

    # Label 1 = positive sentiment, 0 = negative sentiment.
    X = pos_reviews + neg_reviews
    y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

    # Hold out 20% of the labelled reviews for testing.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return x_train, x_test, unsup_reviews, y_train, y_test


In [28]:
##读取向量
def getVecs(model, corpus, size):
    """Return a matrix with one row per document in ``corpus``.

    Each row is the vector stored in ``model.docvecs`` under the document's
    first tag, reshaped to ``(1, size)``; the rows are stacked in corpus
    order.
    """
    rows = []
    for doc in corpus:
        first_tag = doc.tags[0]
        vector = np.array(model.docvecs[first_tag])
        rows.append(vector.reshape((1, size)))
    return np.concatenate(rows)


In [29]:
##对数据进行训练
def train(x_train,x_test,unsup_reviews,size = 100,epoch_num=10):
    """Train a DM and a DBOW Doc2Vec model on the review corpus.

    The vocabulary is built from every document (train, test and
    unlabelled). Both models are then trained first on the train +
    unlabelled documents and afterwards on the test documents, so that
    every tag ends up with a trained document vector.

    Args:
        x_train: List of tagged training documents.
        x_test: List of tagged test documents.
        unsup_reviews: List of tagged unlabelled documents.
        size: Dimensionality of the document vectors.
        epoch_num: Number of epochs used for each ``train`` call.

    Returns:
        A tuple ``(model_dm, model_dbow)`` of trained models.
    """
    # The vocabulary must cover every document that is trained on later.
    all_reviews = x_train + x_test + unsup_reviews
    all_train_reviews = x_train + unsup_reviews

    # Instantiate the DM (default) and DBOW (dm=0) variants.
    model_dm = gensim.models.Doc2Vec(min_count=1, window=10, vector_size=size, sample=1e-3, negative=5, workers=8)
    model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, vector_size=size, sample=1e-3, negative=5, dm=0, workers=8)

    model_dm.build_vocab(all_reviews)
    model_dbow.build_vocab(all_reviews)

    # ``total_examples`` has to match the corpus handed to ``train``.
    # ``corpus_count`` is the size of the full vocabulary corpus, which is
    # larger than either subset below and would skew the learning-rate decay.
    n_train = len(all_train_reviews)
    model_dm.train(all_train_reviews,total_examples=n_train,epochs=epoch_num)
    model_dbow.train(all_train_reviews,total_examples=n_train,epochs=epoch_num)

    # Train on the test documents as well so their tags get usable vectors.
    n_test = len(x_test)
    model_dm.train(x_test,total_examples=n_test,epochs=epoch_num)
    model_dbow.train(x_test,total_examples=n_test,epochs=epoch_num)

    return model_dm,model_dbow


In [30]:
##将训练完成的数据转换为vectors
def get_vectors(model_dm,model_dbow):
    """Build the classifier feature matrices from both Doc2Vec models.

    For each corpus the DM vectors and the DBOW vectors are placed side by
    side, so each row is the DM vector followed by the DBOW vector.

    NOTE: the corpora and the vector size are not parameters; this function
    reads the module-level names ``x_train``, ``x_test`` and ``size``.

    Returns:
        A tuple ``(train_vecs, test_vecs)``.
    """
    def _dm_then_dbow(corpus):
        # Fetch the vectors from each model, then join them column-wise.
        dm_part = getVecs(model_dm, corpus, size)
        dbow_part = getVecs(model_dbow, corpus, size)
        return np.hstack((dm_part, dbow_part))

    train_features = _dm_then_dbow(x_train)
    test_features = _dm_then_dbow(x_test)
    return train_features, test_features


In [31]:
##使用分类器对文本向量进行分类训练
def Classifier(train_vecs,y_train,test_vecs, y_test):
    """Fit an SGD classifier on the document vectors and report accuracy.

    The classifier uses ``loss='log'`` with an L1 penalty. It is fitted on
    ``train_vecs``/``y_train``; its score on ``test_vecs``/``y_test`` is
    printed and the fitted classifier is returned.

    NOTE(review): the accepted spelling of the ``loss`` value depends on the
    installed scikit-learn version -- confirm against the target environment.
    """
    from sklearn.linear_model import SGDClassifier

    classifier = SGDClassifier(loss='log', penalty='l1')
    classifier.fit(train_vecs, y_train)

    test_accuracy = classifier.score(test_vecs, y_test)
    print('Test Accuracy: %.2f' % test_accuracy)

    return classifier


In [32]:
##绘出ROC曲线，并计算AUC
def ROC_curve(lr,y_test):
    """Plot the ROC curve of a fitted classifier and show its AUC.

    NOTE: the feature matrix is not a parameter; this function reads the
    module-level name ``test_vecs``.

    Args:
        lr: Fitted classifier exposing ``predict_proba``.
        y_test: True labels matching the rows of ``test_vecs``.
    """
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    # Column 1 of predict_proba is the probability of the positive class.
    pred_probas = lr.predict_proba(test_vecs)[:,1]

    fpr,tpr,_ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr,tpr)
    plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
    # Diagonal reference line: a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Without a legend the AUC label attached to the curve is never drawn.
    plt.legend(loc='lower right')

    plt.show()


In [None]:
%matplotlib inline
##运行模块
# Script entry point: runs the whole pipeline from data loading to the ROC plot.
# NOTE: `size`, `x_train`, `x_test` and `test_vecs` must keep exactly these
# module-level names -- get_vectors() and ROC_curve() read them as globals
# instead of receiving them as arguments.
if __name__ == "__main__":
    # Vector dimensionality and number of training epochs.
    size, epoch_num = 100, 40
    # Load the train/test documents, the unlabelled documents and the labels.
    x_train,x_test,unsup_reviews,y_train, y_test = get_dataset()
    # Train the two Doc2Vec models.
    model_dm,model_dbow = train(x_train,x_test,unsup_reviews,size,epoch_num)
    # Extract the document vectors for the train and test sets.
    train_vecs,test_vecs = get_vectors(model_dm,model_dbow)
    # Fit the sentiment classifier on the document vectors.
    lr=Classifier(train_vecs,y_train,test_vecs, y_test)
    # Plot the ROC curve.
    ROC_curve(lr,y_test)
