In [16]:
import numpy as np
import pandas as pd
import sklearn

### 1 对数据的提取与划分

In [17]:
train=pd.read_csv("train.tsv",sep='\t')
train.shape

(156060, 4)

In [18]:
train.keys()

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [19]:
phrase=train['Phrase']
sentiment=train['Sentiment']
phrase.shape,sentiment.shape

((156060,), (156060,))

In [20]:
from sklearn.model_selection import train_test_split
train_phrase,test_phrase,train_sentiment,test_sentiment=train_test_split(phrase,sentiment,test_size=0.3)
train_phrase.shape,test_phrase.shape,train_sentiment.shape,test_sentiment.shape

((109242,), (46818,), (109242,), (46818,))

### 2 特征提取

[特征提取参考](https://blog.csdn.net/weixin_40547993/article/details/90296785)

#### 2.1 Bag-of-words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# 词袋模型通过调用该函数实现
count_bow=CountVectorizer()
train_phrase_bow=count_bow.fit_transform(train_phrase)
test_phrase_bow=count_bow.transform(test_phrase)

In [22]:
train_phrase_bow.shape,test_phrase_bow.shape

((109242, 15218), (46818, 15218))

#### 2.2 N-gram

In [23]:
count_ng=CountVectorizer(ngram_range=(2,3),max_features=4000)
train_phrase_ng=count_ng.fit_transform(train_phrase)
test_phrase_ng=count_ng.fit_transform(test_phrase)

In [24]:
train_phrase_ng.shape,test_phrase_ng.shape

((109242, 4000), (46818, 4000))

### 3 训练

[SGDC分类器](https://zhuanlan.zhihu.com/p/60983320)

#### 3.1 使用BOW特征

In [36]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(alpha=0.001,
                    loss='log',    #hinge代表SVM，log是逻辑回归
                    early_stopping=True,
                    eta0=0.1,
                    learning_rate='adaptive', #constant、optimal、invscaling、adaptive
                    max_iter=100 
                   )

In [37]:
clf.fit(train_phrase_bow,train_sentiment)

SGDClassifier(alpha=0.001, early_stopping=True, eta0=0.1,
              learning_rate='adaptive', loss='log', max_iter=100)

In [38]:
predict1=clf.predict(test_phrase_bow)

In [39]:
np.mean(predict1==test_sentiment)

0.5477380494681533

#### 3.2 使用NG特征

In [14]:
clf.fit(train_phrase_ng,train_sentiment)

SGDClassifier(alpha=0.001, early_stopping=True, eta0=0.001,
              learning_rate='adaptive', loss='log', max_iter=100)

In [15]:
predict2=clf.predict(test_phrase_ng)

In [16]:
np.mean(predict2==test_sentiment)

0.5045495322311931

#### 3.3 使用组合特征

In [17]:
from scipy.sparse import hstack
train_feature=hstack([train_phrase_bow,train_phrase_ng])
test_feature=hstack([test_phrase_bow,test_phrase_ng])

In [18]:
clf.fit(train_feature,train_sentiment)

SGDClassifier(alpha=0.001, early_stopping=True, eta0=0.001,
              learning_rate='adaptive', loss='log', max_iter=100)

In [19]:
predict3=clf.predict(test_feature)

In [20]:
np.mean(predict3==test_sentiment)

0.5349438250245632