In [1]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [2]:
train_data = pd.read_csv('data/train.tsv', sep='\t')
valid_data = pd.read_csv('data/dev.tsv', sep='\t')
test_data = pd.read_csv('data/test.tsv', sep='\t') 
x_train, y_train = train_data.text_a.values, train_data.label.values # 训练集
x_valid, y_valid = valid_data.text_a.values, valid_data.label.values # 验证集
x_test, y_test = test_data.text_a.values, test_data.label.values # 测试集

In [3]:
# 分词
from sklearn.metrics import classification_report

def cut_words(text):
    return ' '.join(jieba.cut(text))

x_train = [cut_words(text) for text in x_train]
x_valid = [cut_words(text) for text in x_valid]
x_test = [cut_words(text) for text in x_test]

# tf-idf
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_valid = vectorizer.transform(x_valid)
x_test = vectorizer.transform(x_test)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86180\AppData\Local\Temp\jieba.cache
Loading model cost 0.731 seconds.
Prefix dict has been built successfully.


In [4]:
# use other ml model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# use more model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [5]:
# SVM
print("svm:")
clf_SVC = SVC(kernel='linear')
clf_SVC.fit(x_train, y_train)
report_svm = classification_report(y_test, clf_SVC.predict(x_test),digits=4)
print(report_svm)
print("___________________________________________________")

svm:
              precision    recall  f1-score   support

           0     0.8954    0.8970    0.8962       592
           1     0.8995    0.8980    0.8988       608

    accuracy                         0.8975      1200
   macro avg     0.8975    0.8975    0.8975      1200
weighted avg     0.8975    0.8975    0.8975      1200

___________________________________________________


In [6]:
# Naive Bayes
print("naive bayes:")
clf_NB = MultinomialNB()
clf_NB.fit(x_train, y_train)
report_nb = classification_report(y_test, clf_NB.predict(x_test),digits=4)
print(report_nb)
print("___________________________________________________")

naive bayes:
              precision    recall  f1-score   support

           0     0.8821    0.8716    0.8768       592
           1     0.8764    0.8865    0.8814       608

    accuracy                         0.8792      1200
   macro avg     0.8792    0.8791    0.8791      1200
weighted avg     0.8792    0.8792    0.8792      1200

___________________________________________________


In [7]:
# Logistic Regression
print("losgistic regression:")
clf_LR = LogisticRegression()
clf_LR.fit(x_train, y_train)
report_lr = classification_report(y_test, clf_LR.predict(x_test),digits=4)
print(report_lr)
print("___________________________________________________")

losgistic regression:
              precision    recall  f1-score   support

           0     0.8853    0.8868    0.8861       592
           1     0.8896    0.8882    0.8889       608

    accuracy                         0.8875      1200
   macro avg     0.8875    0.8875    0.8875      1200
weighted avg     0.8875    0.8875    0.8875      1200

___________________________________________________


In [8]:
# Random Forest
print("random forest:")
clf_RF = RandomForestClassifier()
clf_RF.fit(x_train, y_train)
report_rf = classification_report(y_test, clf_RF.predict(x_test),digits=4)
print(report_rf)
print("___________________________________________________")

random forest:
              precision    recall  f1-score   support

           0     0.8792    0.9223    0.9002       592
           1     0.9206    0.8766    0.8981       608

    accuracy                         0.8992      1200
   macro avg     0.8999    0.8995    0.8992      1200
weighted avg     0.9002    0.8992    0.8991      1200

___________________________________________________


In [9]:
# Decision Tree
print("decision tree:")
clf_DT = DecisionTreeClassifier()
clf_DT.fit(x_train, y_train)
report_dt = classification_report(y_test, clf_DT.predict(x_test),digits=4)
print(report_dt)
print("___________________________________________________")

decision tree:
              precision    recall  f1-score   support

           0     0.7938    0.8260    0.8096       592
           1     0.8236    0.7911    0.8070       608

    accuracy                         0.8083      1200
   macro avg     0.8087    0.8086    0.8083      1200
weighted avg     0.8089    0.8083    0.8083      1200

___________________________________________________


In [10]:
# AdaBoost
print("adaboost:")
clf_AdB = AdaBoostClassifier()
clf_AdB.fit(x_train, y_train)
report_ada = classification_report(y_test, clf_AdB.predict(x_test),digits=4)
print(report_ada)
print("___________________________________________________")


adaboost:
              precision    recall  f1-score   support

           0     0.7694    0.8057    0.7871       592
           1     0.8017    0.7648    0.7828       608

    accuracy                         0.7850      1200
   macro avg     0.7855    0.7853    0.7850      1200
weighted avg     0.7858    0.7850    0.7849      1200

___________________________________________________


In [11]:

# GradientBoosting
print("gradient boosting:")
clf_GB = GradientBoostingClassifier()
clf_GB.fit(x_train, y_train)
report_gb = classification_report(y_test, clf_GB.predict(x_test),digits=4)
print(report_gb)
print("___________________________________________________")


gradient boosting:
              precision    recall  f1-score   support

           0     0.7763    0.8851    0.8272       592
           1     0.8705    0.7516    0.8067       608

    accuracy                         0.8175      1200
   macro avg     0.8234    0.8184    0.8169      1200
weighted avg     0.8240    0.8175    0.8168      1200

___________________________________________________


In [12]:
# SGD
print("sgd:")
clf_SGD = SGDClassifier()
clf_SGD.fit(x_train, y_train)
report_sgd = classification_report(y_test, clf_SGD.predict(x_test),digits=4)
print(report_sgd)
print("___________________________________________________")


sgd:
              precision    recall  f1-score   support

           0     0.8918    0.9054    0.8986       592
           1     0.9065    0.8931    0.8998       608

    accuracy                         0.8992      1200
   macro avg     0.8992    0.8992    0.8992      1200
weighted avg     0.8993    0.8992    0.8992      1200

___________________________________________________


In [13]:
# Bagging
print("bagging:")
clf_BG = BaggingClassifier()
clf_BG.fit(x_train, y_train)
report_bag = classification_report(y_test, clf_BG.predict(x_test),digits=4)
print(report_bag)
print("___________________________________________________")

bagging:
              precision    recall  f1-score   support

           0     0.7887    0.8953    0.8386       592
           1     0.8826    0.7664    0.8204       608

    accuracy                         0.8300      1200
   macro avg     0.8356    0.8309    0.8295      1200
weighted avg     0.8363    0.8300    0.8294      1200

___________________________________________________


In [16]:
# get the information of every report from above like precision, recall, f1-score, accuracy and make a table
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def get_report(report):
    report = report.split("\n")
    res = []
    for i in report:
        i = re.sub(" +", " ", i.strip())
        temp = i.split(" ")
        res.append(temp[1:])

    return res

def get_df(report):
    res = get_report(report)
    acc = float(res[5][0])
    precision = float(res[6][1])
    recall = float(res[6][2])
    f1 = float(res[6][3])
    return  [acc, precision, recall, f1]

svm_df = get_df(report_svm)
lr_df = get_df(report_lr)
nb_df = get_df(report_nb)
rf_df = get_df(report_rf)
dt_df = get_df(report_dt)
ada_df = get_df(report_ada)
gb_df = get_df(report_gb)
sgd_df = get_df(report_sgd)
bag_df = get_df(report_bag)
df = pd.DataFrame([svm_df, lr_df, nb_df, rf_df,  dt_df, ada_df, gb_df, sgd_df, bag_df],
                    columns=['accuracy', 'precision', 'recall', 'f1-score'],
                    index=['svm', 'lr', 'nb', 'rf', 'dt', 'ada', 'gb',  'sgd', 'bag'])

In [17]:
df

Unnamed: 0,accuracy,precision,recall,f1-score
svm,0.8975,0.8975,0.8975,0.8975
lr,0.8875,0.8875,0.8875,0.8875
nb,0.8792,0.8792,0.8791,0.8791
rf,0.8992,0.8999,0.8995,0.8992
dt,0.8083,0.8087,0.8086,0.8083
ada,0.785,0.7855,0.7853,0.785
gb,0.8175,0.8234,0.8184,0.8169
sgd,0.8992,0.8992,0.8992,0.8992
bag,0.83,0.8356,0.8309,0.8295


#### 0-> neg   |   1-> pos

In [18]:
# use rf model to predict the test data

sen = ['这什么垃圾酒店', '很棒的酒店']

def predict(sent,clf):
    X = [cut_words(sent)]
    X = vectorizer.transform(X)
    if type(sent) == str:
        return clf.predict(X)[0]
    else:
        result = []
        for i in range(len(sent)):
            result.append(clf.predict(X[i])[0])
        return result

def predict_muti(sen,clf_list):
    X = [cut_words(text) for text in sen]
    X = vectorizer.transform(X)
    result = []
    for clf in clf_list:
        res = []
        for i in range(len(sen)):
            res.append(clf.predict(X[i])[0])
        result.append(res)
    return result

model_list = [clf_SVC, clf_NB, clf_LR, clf_RF, clf_DT, clf_AdB, clf_GB, clf_SGD,clf_BG]
print(predict_muti(sen,model_list))
print(predict(sen[0],clf_RF))


[[0, 1], [0, 1], [0, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 1], [0, 0]]
0
