In [1]:
import pandas as pd

In [2]:
#读取数据
data_train=pd.read_csv('./train.tsv',sep='\t')#分隔符号用\t
data_test=pd.read_csv('./test.tsv',sep='\t')

In [3]:
#Phraseld--短语ID
#Sentenceld--句子ID
#Phrase--电影评论文本
#Sentiment--情感标签
data_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


0- negative

1-somewhat negative

2-neutral

3-somewhat positive

4-positive

In [4]:
#共有156060行训练数据,每行数据都有短语ID,句子ID,文本内容,情感标签
data_train.shape

(156060, 4)

In [5]:
#查看训练集中是否有空值
data_train.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [6]:
data_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
#测试集中共有66292行数据,每个数据都有短语ID,句子ID,文本内容三列
data_test.shape

(66292, 3)

In [8]:
data_test.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
dtype: int64

In [9]:
#提取训练集中的文本内容 使用pandas的切片功能
train_sentences=data_train['Phrase']

#提取测试集中的文本内容
test_sentences=data_test['Phrase']

#通过panads的concat函数将训练集和测试机的文本内容合并到一起
sentences=pd.concat([train_sentences,test_sentences])

In [10]:
#合并到一起的语料库共有222352行数据
sentences.shape

(222352,)

In [11]:
#提取训练集中的情感标签,一共是156060个标签
label=data_train['Sentiment']

In [12]:
label.shape

(156060,)

In [13]:
#导入停词库,停词库中的词是一些废话单词和语气词,对情感分析没有帮助
stop_words=open('./stop_words.txt',encoding='utf-8').read().splitlines()
#splitlines()返回一个列表

In [14]:
stop_words

["\ufeffain'",
 'happy',
 'isn',
 'ain',
 'al',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'sn',
 'll',
 'mon',
 'shouldn',
 've',
 'wasn',
 'weren',
 'won',
 'wouldn',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'t",
 "'ve",
 'ZT',
 'ZZ',
 'a',
 "a's",
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 

# 文本特征工程

In [15]:
#用Sklearn库中的CountVectorizer构建词袋模型

from sklearn.feature_extraction.text import CountVectorizer

co=CountVectorizer(
    analyzer='word',#指以词尾单位进行分析,对于拉丁系语言,有时需要以字母'character'为单位进行分析
    ngram_range=(1,4),#指分析相邻的几个词,避免原始词袋模型中词序丢失问题 表示选取1到4个词做为组合方式
    stop_words=stop_words,
    max_features=150000#选取语料库中出现次数最多的150000个词
)

In [16]:
#使用语料库,构建词袋模型
co.fit(sentences)

CountVectorizer(max_features=150000, ngram_range=(1, 4),
                stop_words=["\ufeffain'", 'happy', 'isn', 'ain', 'al', 'couldn',
                            'didn', 'doesn', 'hadn', 'hasn', 'haven', 'sn',
                            'll', 'mon', 'shouldn', 've', 'wasn', 'weren',
                            'won', 'wouldn', "'d", "'ll", "'m", "'re", "'s",
                            "'t", "'ve", 'ZT', 'ZZ', 'a', ...])

In [17]:
#将训练集随机拆分为新的训练集和验证集,默认为3:1,然后进行词频分析统计
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(train_sentences,label,random_state=1234)

In [18]:
#查看训练集中的数据
x_train[1]

'A series of escapades demonstrating the adage that what is good for the goose'

In [19]:
#用上边构建的词袋模型,把训练集和验证集中的每一个词都进行特征工程,变成向量

x_train=co.transform(x_train)
x_test=co.transform(x_test)

In [20]:
#随便查看训练集中数据 是150000列的稀疏矩阵
#在矩阵中，若数值为0的元素数目远远多于非0元素的数目，并且非0元素分布没有规律时，则称该矩阵为稀疏矩阵
x_train[1]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [21]:
#logistic
from sklearn.linear_model import LogisticRegression
lg1=LogisticRegression()
lg1.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [22]:
print('词袋方法进行文本特征工程,使用sklearn默认的逻辑回归分类器,验证集上的预测准确率:',lg1.score(x_test,y_test))

词袋方法进行文本特征工程,使用sklearn默认的逻辑回归分类器,验证集上的预测准确率: 0.646161732666923


In [23]:
#引用朴素贝叶斯进行分类训练和预测
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()
classifier.fit(x_train,y_train)


MultinomialNB()

In [24]:
print('词袋方法进行文本特征工程,使用sklearn默认的多项式朴素贝叶斯分类器,验证集上的预测准确率:',classifier.score(x_test,y_test))

词袋方法进行文本特征工程,使用sklearn默认的多项式朴素贝叶斯分类器,验证集上的预测准确率: 0.6084070229398949


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(
    analyzer='word',#以词为单位进行分析
    ngram_range=(1,4),#指分析相邻的几个词,避免原始词袋模型中词序丢失问题 表示选取1到4个词做为组合方式
    #stop_words=stop_words   TF-IDF已经把常用词过滤掉了,所以不需要停用词
    max_features=150000#选取语料库中出现次数最多的150000个词
)

In [26]:
tf.fit(sentences)

TfidfVectorizer(max_features=150000, ngram_range=(1, 4))

In [27]:
#将训练集随机拆分为新的训练集和验证集,默认为3:1,然后进行词频分析统计
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(train_sentences,label,random_state=1234)

In [28]:
#进行文本特征变换
x_train=tf.transform(x_train)
x_test=tf.transform(x_test)

In [29]:
x_train[1] #稀疏矩阵

<1x150000 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [30]:
#引用朴素贝叶斯进行分类训练和预测
classifier=MultinomialNB()
classifier.fit(x_train,y_train)

MultinomialNB()

In [31]:
print('TF-IDF方法进行文本特征工程,使用sklearn默认的多项式朴素贝叶斯分类器,验证集上的预测准确率:',classifier.score(x_test,y_test))

TF-IDF方法进行文本特征工程,使用sklearn默认的多项式朴素贝叶斯分类器,验证集上的预测准确率: 0.6045367166474432


In [32]:
lg1=LogisticRegression()
lg1.fit(x_train,y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [33]:
print('TF-IDF方法进行文本特征工程,使用sklearn默认的逻辑斯蒂回归模型,验证集上的预测准确率:',lg1.score(x_test,y_test))

TF-IDF方法进行文本特征工程,使用sklearn默认的逻辑斯蒂回归模型,验证集上的预测准确率: 0.6410354991669871


In [51]:
#修改logistic 参数
lg2=LogisticRegression(C=3,dual=True,solver='liblinear')
#    C:正则化系数,C越小,正则化效果越强 ,L1,L2正则化
#dual:求解原问题的对偶问题
lg2.fit(x_train,y_train)

LogisticRegression(C=3, dual=True, solver='liblinear')

In [57]:
print(lg2)

LogisticRegression(C=3, dual=True, solver='liblinear')


In [58]:
dir(lg2)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify',
 'tol',
 'verbose',
 'w

In [72]:
print('TF-IDF方法进行文本特征工程,使用增加了三个参数的逻辑回归模型,验证集上的预测准备率为:',lg2.score(x_test,y_test))

TF-IDF方法进行文本特征工程,使用增加了三个参数的逻辑回归模型,验证集上的预测准备率为: 0.6533384595668332


In [71]:
#CV交叉验证
from sklearn.model_selection import GridSearchCV
param_grid={'C':range(1,10),
            'dual':[True,False]
            }
lgGS=LogisticRegression()


In [68]:
grid=GridSearchCV(lgGS,param_grid=param_grid,cv=3,n_jobs=-1)


grid.fit(x_train,y_train)
#默认情况下max_iter=100,
#这是说训练模型的时候，参数的迭代次数达到了限制(默认max_iter=100)，但是两次迭代参数变化还是比较大，仍然没有在一个很小的阈值以下，这就叫没有收敛。

#不过，这只是一个警告（温馨提示）而已，我们要么选择 
#1.忽略，要么 
#2.增大最大迭代次数，要么 
#3.更换其他的模型或者那个参数solver，要么 
#4.将数据进行预处理，提取更有用的特征。


In [73]:
#CV交叉验证
from sklearn.model_selection import GridSearchCV
param_grid={'C':range(1,10),
            'dual':[True,False]
            }
lgGS=LogisticRegression(max_iter=1000)
grid=GridSearchCV(lgGS,param_grid=param_grid,cv=3,n_jobs=-1)
grid.fit(x_train,y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'C': range(1, 10), 'dual': [True, False]})

In [74]:
#得出最佳参数
grid.best_params_

{'C': 3, 'dual': False}

最终的结果是C=3,DUAL=FALSE,能使逻辑回归模型在验证集上预测准确率最高,我们便采取用这个最优参数,构建lg_final分类器,最终在验证集上预测准确率为0.656465

In [75]:
lg_final=grid.best_estimator_

In [77]:
print('经过网格搜索,找到最优参数组合对应的逻辑回归模型,在验证集上的预测准确率:',lg_final.score(x_test,y_test))

经过网格搜索,找到最优参数组合对应的逻辑回归模型,在验证集上的预测准确率: 0.6564654620017942


# 使用测试集

In [79]:
data_test.head()
#查看测试机数据,phraseld列就是构建的模型预测情感标签的文本

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [80]:
#使用TF-IDF对测试集中的文本进行文本特征工程
test_X=tf.transform(data_test['Phrase'])

In [83]:
#对测试集中的文本,使用lg_final逻辑回归分类器进行预测
predictions=lg_final.predict(test_X)

In [84]:
predictions

array([2, 2, 3, ..., 1, 1, 2], dtype=int64)

In [85]:
predictions.shape

(66292,)

In [86]:
#将预测结果与测试集合并
data_test1=data_test#复制一份测试集
data_test1.loc[:,'Sentiment']=predictions


In [87]:
data_test1.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,2
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,3
3,156064,8545,intermittently pleasing but mostly routine effort,2
4,156065,8545,intermittently pleasing but mostly routine,2


In [92]:
#抽取 ,格式整理
final_data=data_test1.loc[:,['PhraseId','Sentiment']]

In [93]:
final_data.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,3
3,156064,2
4,156065,2


In [94]:
final_data.to_csv('final_data.csv',index=None)