In [1]:
import numpy as np
import pandas as pd
import sklearn

### 1 数据的提取与划分

In [2]:
# Load the labelled training data; the file is tab-separated.
train = pd.read_csv("train.tsv", delimiter='\t')
# Show (rows, columns) as a quick sanity check on the load.
train.shape

(156060, 4)

In [3]:
# Load the test data; the file is tab-separated.
test = pd.read_csv("test.tsv", delimiter='\t')
# Show (rows, columns) as a quick sanity check on the load.
test.shape

(66292, 3)

In [4]:
# List the column labels of the training frame.
train.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [5]:
# Separate the training frame into the text column used as model input
# and the 'Sentiment' column used as the target.
train_phrase, train_sentiment = train['Phrase'], train['Sentiment']
# Both series must have the same length.
(train_phrase.shape, train_sentiment.shape)

((156060,), (156060,))

In [6]:
# Text column of the test frame; this is what the model will be asked to label.
test_phrase = test.loc[:, 'Phrase']
test_phrase.shape

(66292,)

### 2 特征提取

[特征提取参考](https://blog.csdn.net/weixin_40547993/article/details/90296785)

#### 2.1 Bag-of-words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# The bag-of-words representation is produced by CountVectorizer.
count_bow = CountVectorizer()

# Learn the vocabulary from the training phrases and encode them in one step.
train_phrase_bow = count_bow.fit_transform(train_phrase)

# Encode the test phrases with the vocabulary learned above (no refitting),
# so train and test columns refer to the same tokens.
test_phrase_bow = count_bow.transform(test_phrase)

In [8]:
# Display the shapes of the train and test bag-of-words matrices.
(train_phrase_bow.shape, test_phrase_bow.shape)

((156060, 15240), (66292, 15240))

#### 2.2 N-gram

In [9]:
# N-gram features: count bigrams and trigrams, keeping only the 4000 most
# frequent n-grams of the training corpus.
count_ng = CountVectorizer(ngram_range=(2, 3), max_features=4000)

# Learn the n-gram vocabulary from the training phrases and encode them.
train_phrase_ng = count_ng.fit_transform(train_phrase)

# Encode the test phrases with the vocabulary learned on the training set.
# Calling fit_transform here would rebuild the vocabulary from the test data:
# the matrix would still have 4000 columns, but each column would stand for a
# different n-gram than in the training matrix, making predictions meaningless.
test_phrase_ng = count_ng.transform(test_phrase)

In [10]:
# Display the shapes of the train and test n-gram matrices.
(train_phrase_ng.shape, test_phrase_ng.shape)

((156060, 4000), (66292, 4000))

### 3 训练

In [11]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
clf = SGDClassifier(alpha=0.001,
                    # 'hinge' gives a linear SVM, 'log' gives logistic regression.
                    # NOTE: scikit-learn 1.1 renamed 'log' to 'log_loss' and later
                    # releases removed the old spelling; switch to 'log_loss' when
                    # running on a recent scikit-learn.
                    loss='log',
                    early_stopping=True,
                    eta0=0.1,
                    # Available schedules: constant, optimal, invscaling, adaptive.
                    learning_rate='adaptive',
                    max_iter=100,
                    # Fix the seed so the data shuffling and the early-stopping
                    # validation split are reproducible across runs.
                    random_state=42
                   )

In [12]:
from sklearn.utils import shuffle

# Shuffle features and labels together so the rows stay aligned.
# A fixed random_state makes the permutation (and therefore the trained
# model) reproducible after Restart Kernel -> Run All.
train_phrase_bow, train_sentiment = shuffle(train_phrase_bow,
                                            train_sentiment,
                                            random_state=42)

In [13]:
# Train the classifier on the bag-of-words features and their sentiment labels.
clf.fit(X=train_phrase_bow, y=train_sentiment)



In [14]:
# Predict a sentiment label for every test phrase from its bag-of-words features.
predict = clf.predict(X=test_phrase_bow)

In [15]:
# Attach the predictions to the test frame.
test['Sentiment'] = predict

# Write a two-column file: PhraseId (as the index) and the predicted Sentiment.
submission = test.set_index('PhraseId')[['Sentiment']]
submission.to_csv('finale.csv')