# 导入数据集

In [2]:
# pandas is the dedicated tool for loading, tidying and cleaning tabular data (similar to Excel but more powerful);
# it is imported under the conventional alias pd.
import pandas as pd

In [3]:
# Read the training and test data with pandas.read_csv. The files are .tsv, i.e. the fields are
# separated by a tab character (\t), just as the fields of a .csv file are separated by commas.
# NOTE(review): both paths are absolute, machine-specific Windows paths; adjust them before running elsewhere.
data_train = pd.read_csv('D:\\CHENGXU\\lintcode\\dianying\\labeledTrainData.tsv',sep='\t')
data_test = pd.read_csv('D:\\CHENGXU\\lintcode\\dianying\\testData.tsv',sep='\t')

In [4]:
# Show the first 5 rows of the training set: the 'review' column holds the movie review text
# and the 'sentiment' column holds the sentiment label.
data_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# The training set has 25000 rows; every row has three columns: review id, sentiment label and review text.
data_train.shape

(25000, 3)

In [6]:
# Show the first 5 rows of the test set: the 'review' column holds the text whose sentiment label
# has to be predicted by the model built below.
data_test.head()

Unnamed: 0,id,review
0,3862_4,"I just watched it. A couple of laughs, but not..."
1,674_10,"While to most people watching the movie, this ..."
2,8828_10,I was so glad I came across this short film. I...
3,2963_8,The creators of south park in their own film h...
4,2483_1,"Unspeakably discombobulated turkey, a mix of a..."


In [7]:
# The test set has 5000 rows; every row has two columns: review id and review text.
data_test.shape

(5000, 2)

构建语料库
需要对文本进行一些处理，将原始文本中的每一个词变成计算机看得懂的向量，这一过程叫做文本的特征工程，非常重要。
有很多将词变成向量的方法，比如下面将要介绍的词袋模型、TF-IDF模型，以及word2vec模型。
不管采用什么模型，我们都需要先把训练集和测试集中所有文本内容组合在一起，构建一个语料库。

In [8]:
# Pull the raw review text out of the training frame and the test frame.
train_sentences, test_sentences = data_train['review'], data_test['review']

# Stack both Series with pandas.concat into one corpus (training reviews first, then test reviews).
sentences = pd.concat([train_sentences, test_sentences])

In [9]:
# The combined corpus contains 30000 rows (training reviews plus test reviews).
sentences.shape

(30000,)

In [10]:
# Extract the sentiment labels of the training set; there are 25000 labels in total.
label = data_train['sentiment']

In [11]:
# Check the number of labels extracted from the training set.
label.shape

(25000,)

使用词袋模型进行文本特征工程
词袋模型

In [12]:
# Bag-of-words model built with scikit-learn's CountVectorizer.
#   analyzer='word'      - analyse the text word by word (some languages are better analysed per character).
#   ngram_range=(1, 4)   - also count runs of up to 4 adjacent words, which keeps some of the word order
#                          that a plain bag-of-words model throws away.
#   max_features=150000  - keep only the 150000 most frequent terms of the corpus in the final matrix.
from sklearn.feature_extraction.text import CountVectorizer

co = CountVectorizer(analyzer='word', ngram_range=(1, 4), max_features=150000)

In [13]:
# Fit the bag-of-words vocabulary on the combined corpus (training and test reviews together).

co.fit(sentences)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=150000, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [14]:
# Randomly split the labelled training data into a new training set and a validation set
# (3:1 by default); term counting happens afterwards. Both parts come from the original
# training set, so both carry labels.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    train_sentences,
    label,
    random_state=1234,
)

In [15]:
# Look at one sample of the training split.
# NOTE(review): x_train is still a pandas Series at this point, so x_train[1] selects the row whose
# index label is 1 (the second row of the original data_train), not the second row of the split.
# It only works because label 1 landed in the training split for this random_state.
x_train[1]

'\\The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [16]:
# Use the bag-of-words model fitted above to turn every review of the training and validation
# splits into a count vector.
# NOTE(review): the cell rebinds x_train / x_test from raw text to matrices, so it is not idempotent —
# re-run the split cell before running this cell a second time.

x_train = co.transform(x_train)
x_test = co.transform(x_test)

In [17]:
# Look at one row of the transformed training set: it is a sparse row with 150000 columns.
# x_train is now a sparse matrix, so [1] selects the row at position 1.

x_train[1]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 431 stored elements in Compressed Sparse Row format>

构建分类器算法，对词袋模型处理后的文本进行机器学习和数据挖掘
逻辑回归分类器

In [18]:
# Suppress version warnings and other noisy messages emitted while the cells below run.
# NOTE(review): this ignores every warning category for the rest of the session.

import warnings 
warnings.filterwarnings('ignore')

In [19]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression on the bag-of-words features; fit() returns the estimator itself,
# so the model is created and trained in a single expression.
lg1 = LogisticRegression().fit(x_train, y_train)

# Report the accuracy on the validation split.
print(
    '词袋方法进行文本特征工程，使用sklearn默认的逻辑回归分类器，验证集上的预测准确率:',
    lg1.score(x_test, y_test),
)

词袋方法进行文本特征工程，使用sklearn默认的逻辑回归分类器，验证集上的预测准确率: 0.89312


多项式朴素贝叶斯分类器

In [20]:
# Train a multinomial naive Bayes classifier on the bag-of-words features and score it
# on the validation split.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(x_train, y_train)
print(
    '词袋方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率:',
    classifier.score(x_test, y_test),
)

词袋方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率: 0.88288


多项式朴素贝叶斯分类器训练速度很快，但在验证集上的准确率（0.88288）略低于逻辑回归（0.89312）。

使用TF-IDF模型进行文本特征工程

TF值衡量了一个词出现的次数。
IDF值衡量了这个词是不是有用。如果是the、an、a等烂大街的词，IDF值就会很低。
两个值的乘积TF-IDF反映了一个词的出现带来的特异性信息。

In [21]:
# TF-IDF model built with scikit-learn's TfidfVectorizer.
#   analyzer='word'      - analyse the text word by word (some languages are better analysed per character).
#   ngram_range=(1, 4)   - also consider runs of up to 4 adjacent words, which keeps some of the word order
#                          that a plain bag-of-words model throws away.
#   max_features=150000  - keep only the 150000 most frequent terms of the corpus in the final matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=150000)

In [22]:
# Fit the TF-IDF vectorizer on the combined corpus (training and test reviews together).
tf.fit(sentences)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0,
                max_features=150000, min_df=1, ngram_range=(1, 4), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

类似上面的操作，拆分原始训练集为训练集和验证集，用TF-IDF模型对每一个词都进行特征工程，变成向量

In [23]:
from sklearn.model_selection import train_test_split

# Same split as for the bag-of-words experiment (identical inputs and random_state),
# starting again from the raw review text.
x_train, x_test, y_train, y_test = train_test_split(
    train_sentences,
    label,
    random_state=1234,
)

In [24]:
# Turn the raw reviews of both splits into TF-IDF vectors with the vectorizer fitted above.
# NOTE(review): the cell rebinds x_train / x_test, so re-run the split cell before running it again.
x_train = tf.transform(x_train)
x_test = tf.transform(x_test)

In [25]:
# Look at one row of the TF-IDF-transformed training matrix.
x_train[1]

<1x150000 sparse matrix of type '<class 'numpy.float64'>'
	with 431 stored elements in Compressed Sparse Row format>

构建分类器算法，对TF-IDF模型处理后的文本进行机器学习和数据挖掘


朴素贝叶斯分类器

In [26]:
# Train a multinomial naive Bayes classifier on the TF-IDF features and score it
# on the validation split.
classifier = MultinomialNB().fit(x_train, y_train)
print(
    'TF-IDF方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率:',
    classifier.score(x_test, y_test),
)

TF-IDF方法进行文本特征工程，使用sklearn默认的多项式朴素贝叶斯分类器，验证集上的预测准确率: 0.8896



逻辑回归分类器

In [27]:
# scikit-learn's default logistic regression, trained on the TF-IDF features
# and scored on the validation split.
lg1 = LogisticRegression().fit(x_train, y_train)
print(
    'TF-IDF方法进行文本特征工程，使用sklearn默认的逻辑回归模型，验证集上的预测准确率:',
    lg1.score(x_test, y_test),
)

TF-IDF方法进行文本特征工程，使用sklearn默认的逻辑回归模型，验证集上的预测准确率: 0.89184


In [28]:
# C: regularization coefficient; the smaller C is, the stronger the regularization.
# dual: solve the dual formulation of the optimization problem instead of the primal one.
# solver: pinned to 'liblinear' because the dual formulation is only implemented by that solver.
#         Relying on the library default makes this cell raise on scikit-learn releases whose
#         default solver is lbfgs.
lg2 = LogisticRegression(C=3, dual=True, solver='liblinear')
lg2.fit(x_train,y_train)
print('TF-IDF方法进行文本特征工程，使用增加了两个参数的逻辑回归模型，验证集上的预测准确率:',lg2.score(x_test,y_test))

TF-IDF方法进行文本特征工程，使用增加了两个参数的逻辑回归模型，验证集上的预测准确率: 0.90112


对比两个预测准确率可以看出，在逻辑回归中增加C和dual这两个参数可以提高验证集上的预测准确率，但如果每次都手动修改就太麻烦了。我们可以用sklearn提供的强大的网格搜索功能进行超参数的批量试验。
搜索空间：C从1到9。对每一个C，都分别尝试dual为True和False的两种参数。
最后从所有参数组合中，挑出在训练集上做3折交叉验证时平均预测准确率最高的那一组。

In [29]:
from sklearn.model_selection import GridSearchCV

# Search space: C from 1 to 9, each tried with dual=True and dual=False (18 candidates).
param_grid = {
    'C': range(1, 10),
    'dual': [True, False],
}

# The base estimator pins solver='liblinear': the dual formulation is only implemented by that
# solver, so with an lbfgs default every dual=True candidate would fail to fit.
lgGS = LogisticRegression(solver='liblinear')

# Candidates are ranked by 3-fold cross-validation on the training split, using all CPU cores.
grid = GridSearchCV(lgGS, param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': range(1, 10), 'dual': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [30]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

{'C': 9, 'dual': True}

In [31]:
# Keep the estimator that the grid search refit with the best hyper-parameter combination.
lg_final = grid.best_estimator_

In [32]:
# Report the accuracy of the selected model on the validation split (x_test / y_test).
print('经过网格搜索，找到最优超参数组合对应的逻辑回归模型，在验证集上的预测准确率:',lg_final.score(x_test,y_test))

经过网格搜索，找到最优超参数组合对应的逻辑回归模型，在验证集上的预测准确率: 0.90544


对测试集的数据进行预测，提交lintcode_AI竞赛最终结果

In [33]:
# Show the first 5 rows of the test set: the 'review' column holds the text whose sentiment label
# has to be predicted by the model.
data_test.head()

Unnamed: 0,id,review
0,3862_4,"I just watched it. A couple of laughs, but not..."
1,674_10,"While to most people watching the movie, this ..."
2,8828_10,I was so glad I came across this short film. I...
3,2963_8,The creators of south park in their own film h...
4,2483_1,"Unspeakably discombobulated turkey, a mix of a..."


In [34]:
# Turn the test-set reviews into TF-IDF vectors with the fitted vectorizer.
test_X = tf.transform(data_test['review'])

In [35]:
# Predict the sentiment of the test-set reviews with the lg_final logistic regression classifier.
predictions = lg_final.predict(test_X)

In [36]:
# Display the predicted labels.
predictions

array([0, 1, 1, ..., 0, 0, 1], dtype=int64)

In [37]:
# Check the number of predictions (expected: one per test-set review).
predictions.shape

(5000,)

In [38]:
# Attach the predictions to the test frame as a new 'sentiment' column.
# NOTE(review): this modifies data_test in place.

data_test.loc[:,'sentiment'] = predictions

In [39]:
# Show the first rows of the test frame, now including the predicted 'sentiment' column.
data_test.head()

Unnamed: 0,id,review,sentiment
0,3862_4,"I just watched it. A couple of laughs, but not...",0
1,674_10,"While to most people watching the movie, this ...",1
2,8828_10,I was so glad I came across this short film. I...,1
3,2963_8,The creators of south park in their own film h...,1
4,2483_1,"Unspeakably discombobulated turkey, a mix of a...",0


In [40]:
# Arrange the result in the format required by the lintcode site: keep only the 'id' and 'sentiment' columns.

final_data = data_test.loc[:,['id','sentiment']]

In [41]:
# Show the first rows of the submission frame.
final_data.head()

Unnamed: 0,id,sentiment
0,3862_4,0
1,674_10,1
2,8828_10,1
3,2963_8,1
4,2483_1,0


In [42]:
# Save the submission as a .csv file; this file is the final result.
# index=False leaves the row index out of the file (the `index` argument is a boolean flag).
# NOTE(review): the output path is an absolute, machine-specific Windows path.

final_data.to_csv('D:\\CHENGXU\\lintcode\\dianying\\final_data1.csv',index=False)