## 导入数据

In [1]:
import pandas as pd

train = pd.read_csv('/Users/aeternae/Desktop/Data/IMDB/labeledTrainData.tsv', sep='\t')
test = pd.read_csv('/Users/aeternae/Desktop/Data/IMDB/testData.tsv', sep='\t')

In [2]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [4]:
train.count()

id           25000
sentiment    25000
review       25000
dtype: int64

In [5]:
test.count()

id        25000
review    25000
dtype: int64

## 文本数据预处理

In [6]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [7]:
def review_to_text(review, remove_stopwords):
    #去掉html标记
    raw_text = BeautifulSoup(review, 'html.parser').get_text()
    #去掉非字母字符
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    #去掉停用词
    if remove_stopwords:
        all_stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in all_stop_words]
 
    return words

In [8]:
X_train = [' '.join(review_to_text(review, True)) for review in train['review']]
X_train[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [9]:
X_test = [' '.join(review_to_text(review, True)) for review in test['review']]

In [10]:
y_train = train['sentiment']
y_train.shape

(25000,)

## 文本特征抽取

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### CountVec 词频矩阵

In [12]:
count_vec = CountVectorizer()

In [13]:
X_count = count_vec.fit_transform(X_train)

In [18]:
count_vec.get_feature_names()[:10]

['aa',
 'aaa',
 'aaaaaaah',
 'aaaaah',
 'aaaaatch',
 'aaaahhhhhhh',
 'aaaand',
 'aaaarrgh',
 'aaah',
 'aaargh']

In [16]:
X_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Tf_Idf 词频矩阵

In [17]:
tfidf_vec = TfidfVectorizer()

In [18]:
X_tfidf = tfidf_vec.fit_transform(X_train)

In [19]:
print(X_tfidf)

  (0, 62941)	0.03354045373162439
  (0, 26968)	0.07217307173962342
  (0, 42578)	0.034091761849276245
  (0, 42350)	0.7703186447281194
  (0, 62043)	0.03487297244743932
  (0, 38035)	0.04751617903415886
  (0, 43600)	0.054295467513819196
  (0, 71488)	0.023073898100782347
  (0, 45703)	0.03926522998399744
  (0, 18360)	0.03764384002495705
  (0, 71482)	0.05675846026639458
  (0, 72628)	0.07163890269512017
  (0, 42819)	0.13307302119828857
  (0, 40560)	0.08512551931849857
  (0, 71313)	0.024749737549733447
  (0, 26303)	0.018496729631339144
  (0, 10346)	0.03669746300625778
  (0, 32847)	0.047604829337073856
  (0, 28243)	0.054847839906128716
  (0, 65818)	0.02525032488112832
  (0, 52926)	0.03509094232335691
  (0, 13681)	0.07064593076344598
  (0, 20094)	0.053346084856950304
  (0, 39462)	0.019295315365876368
  (0, 41891)	0.02908726933031267
  :	:
  (24999, 11024)	0.2336985200945862
  (24999, 69731)	0.0694608436759633
  (24999, 56821)	0.0789407453282018
  (24999, 8212)	0.10419563187936105
  (24999, 66496)	

In [20]:
X_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 多模型预测

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [22]:
X1_train = X_train[:20000]
X1_test = X_train[20000:]
y1_train = y_train[:20000]
y1_test = y_train[20000:]

In [23]:
len(X1_train), len(X1_test), len(y1_train), len(y1_test)

(20000, 5000, 20000, 5000)

### MultinomialNB

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
def MNB_count_Classifier():              #采用管道将步骤进行封装
    return Pipeline([
        ('count_vec', CountVectorizer()),  
        ('mnb', MultinomialNB())       #训练贝叶斯模型
    ])

In [26]:
mnbc_clf = MNB_count_Classifier()
mnbc_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [27]:
mnbc_clf.score(X1_test, y1_test)

0.8534

In [28]:
def MNB_tfidf_Classifier():              #采用管道将步骤进行封装
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
        ('mnb', MultinomialNB())       #训练贝叶斯模型
    ])

In [29]:
mnbt_clf = MNB_tfidf_Classifier()
mnbt_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...rue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [30]:
mnbt_clf.score(X1_test, y1_test)

0.8598

### 逻辑回归

In [31]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

In [32]:
def LogisticRegression_c():              #采用管道将步骤进行封装
    return Pipeline([
        ('count_vec', CountVectorizer()),  
#         ('poly', PolynomialFeatures(degree=degree)),            #添加多项式项
        ('logistic', LogisticRegression(C=0.1, penalty='l2'))     #训练逻辑回归模型
    ])

In [33]:
polyc_log_reg = LogisticRegression_c()
polyc_log_reg.fit(X1_train, y1_train)



Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [34]:
polyc_log_reg.score(X1_test, y1_test)

0.8806

In [35]:
def LogisticRegression_t():              #采用管道将步骤进行封装
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
#         ('poly', PolynomialFeatures(degree=degree)),            #添加多项式项
        ('logistic', LogisticRegression(C=0.1, penalty='l2'))     #训练逻辑回归模型
    ])

In [36]:
polyt_log_reg = LogisticRegression_t()
polyt_log_reg.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [37]:
polyt_log_reg.score(X1_test, y1_test)

0.8606

### rbfSVM

In [28]:
from sklearn.svm import SVC

def RBFKernelSVC_c(gamma=1.0):
    return Pipeline([
        ('count_vec', CountVectorizer()),  
        ("svc", SVC(kernel="rbf", gamma=gamma)) #使用高斯核函数，并传入参数γ
    ])

In [70]:
svcc_clf = RBFKernelSVC_c()
svcc_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [71]:
svcc_clf.score(X1_test, y1_test)

0.497

### Ensemble

#### RandomForest

In [29]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
def RFC_c():
    return Pipeline([
        ('count_vec', CountVectorizer()),  
        ('rfc', RandomForestClassifier(n_estimators=500, max_depth=3, random_state=666, n_jobs=-1)) 
    ])

In [29]:
rfcc_clf = RFC_c()
rfcc_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...mators=500, n_jobs=-1,
            oob_score=False, random_state=666, verbose=0, warm_start=False))])

In [30]:
rfcc_clf.score(X1_test, y1_test)

0.8378

In [31]:
def RFC_t():
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
        ('rfc', RandomForestClassifier(n_estimators=500, max_depth=3, random_state=666, n_jobs=-1)) 
    ])

In [32]:
rfct_clf = RFC_t()
rfct_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...mators=500, n_jobs=-1,
            oob_score=False, random_state=666, verbose=0, warm_start=False))])

In [33]:
rfct_clf.score(X1_test, y1_test)

0.8308

#### ExtraTree

In [38]:
from sklearn.ensemble import ExtraTreesClassifier

In [39]:
def ETC_c():
    return Pipeline([
        ('count_vec', CountVectorizer()),  
        ('etc', ExtraTreesClassifier(n_estimators=500, bootstrap=True, random_state=666, n_jobs=-1)) 
    ])

In [40]:
etc_clf = ETC_c()
etc_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...imators=500, n_jobs=-1,
           oob_score=False, random_state=666, verbose=0, warm_start=False))])

In [41]:
etc_clf.score(X1_test, y1_test)

0.877

#### AdaBoost

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
def Ada_c():
    return Pipeline([
        ('count_vec', CountVectorizer()),  
        ('adaboost', AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)) 
    ])

In [42]:
adac_clf = Ada_c()
adac_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...e,
            splitter='best'),
          learning_rate=1.0, n_estimators=500, random_state=None))])

In [44]:
adac_clf.score(X1_test, y1_test)

0.8426

#### GBDT

In [36]:
from sklearn.ensemble import GradientBoostingClassifier

In [37]:
def GBDT_t():
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
        ('GBDT', GradientBoostingClassifier(max_depth=3, n_estimators=50))
    ])

In [54]:
gbt_clf = GBDT_t()
gbt_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [55]:
gbt_clf.score(X1_test, y1_test)

0.777

#### XGboost

In [42]:
import xgboost as xgb

In [43]:
def XGB_t():
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
        ('XGB', xgb.XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.1,
                                  subsample=.7, colsample_bytree=0.6, gamma=0.05))
    ])

In [44]:
xgbt_clf = XGB_t()
xgbt_clf.fit(X1_train, y1_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...eg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7))])

In [45]:
xgbt_clf.score(X1_test, y1_test)

0.8598

#### Lightgbm

In [35]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [36]:
def LGB_t():
    return Pipeline([
        ('tfidf_vec', TfidfVectorizer()),  
        ('LGB', lgb.LGBMClassifier(n_estimators=500, max_depth=-1, colsample_bytree=0.6))
    ])

In [None]:
lgbt_clf = LGB_t()
lgbt_clf.fit(X1_train, y1_train)

In [64]:
lgbt_clf.score(X1_test, y1_test)

0.8598

### Voting Classifier

In [46]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ("mnb_clf", MNB_tfidf_Classifier()),
    ("log_clf", LogisticRegression_c()),
#     ("rfc_clf", RFC_c()),
    ("etc_clf", ETC_c()),
    ("xgb_clf", XGB_t()),
#     ("lgb_clf", LGB_t())
], voting='soft')    

In [47]:
voting_clf.fit(X1_train, y1_train)



VotingClassifier(estimators=[('mnb_clf', Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range...alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7))]))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [48]:
voting_clf.score(X1_test, y1_test)

0.884

In [55]:
from sklearn.metrics import roc_auc_score

In [56]:
y_predict = voting_clf.predict(X1_test)

In [57]:
roc_auc_score(y1_test, y_predict)

0.8843170087255745