In [1]:
import numpy as np
import pandas as pd
from pytorch_pretrained_bert import BertTokenizer
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef, confusion_matrix,recall_score,f1_score,precision_score,accuracy_score


In [2]:
def text_to_feature(path, vocab_path="outputs/SST2/vocab.txt"):
    """Turn a tab-separated sentence file into binary bag-of-token features.

    The file at ``path`` is read without a header row; column 3 is used as
    the sentence text and column 1 as the label. Each sentence is tokenized
    with the ``bert-base-cased`` tokenizer and encoded as a 0/1 vector that
    has a 1 at every token id occurring in the sentence.

    Args:
        path: Path to the TSV file to read.
        vocab_path: Path to a newline-separated vocabulary file. Only its
            length is used, to fix the width of the feature vectors.

    Returns:
        A tuple ``(x, label)`` where ``x`` is an integer array of shape
        ``(number of sentences, vocabulary size)`` and ``label`` is the array
        of values from column 1.

    Raises:
        IndexError: If the tokenizer yields an id that is not smaller than
            the vocabulary size.
    """
    df = pd.read_csv(path, sep="\t", header=None)
    sentences = df[3]
    label = df[1].values
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    # Context manager so the vocabulary file handle is closed deterministically.
    # NOTE(review): assumes this vocab file lines up with the ids produced by
    # the 'bert-base-cased' tokenizer - confirm.
    with open(vocab_path, encoding="utf-8") as vocab_file:
        vocab = vocab_file.read().strip().split("\n")
    vocab_len = len(vocab)
    sen_len = len(sentences)

    # Allocate the full matrix once instead of building one Python list per
    # row and copying all rows again at the end.
    x = np.zeros((sen_len, vocab_len), dtype=int)
    for row, sentence in enumerate(sentences):
        ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        x[row, ids] = 1
        # Fixed-width progress so each carriage-return rewrite fully covers
        # the previous value; counts completed rows, so it ends at 1.000.
        print("%.3f" % ((row + 1) / sen_len), end="\r")
    return x, label


def get_eval_report(task_name, labels, preds):
    """Collect classification metrics for one task into a dictionary.

    Args:
        task_name: Label stored under the ``"task"`` key.
        labels: Ground-truth labels.
        preds: Predicted labels.

    Returns:
        A dict with the keys ``task``, ``mcc``, ``F1``, ``recall``,
        ``precision``, ``accuracy``, ``tp``, ``tn``, ``fp`` and ``fn``.
    """
    report = {"task": task_name, "mcc": matthews_corrcoef(labels, preds)}

    # The flattened confusion matrix must hold exactly four cells; they are
    # unpacked in the order tn, fp, fn, tp.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(labels, preds).ravel()

    report["F1"] = f1_score(labels, preds)
    report["recall"] = recall_score(labels, preds)
    report["precision"] = precision_score(labels, preds)
    report["accuracy"] = accuracy_score(labels, preds)
    report.update(tp=true_pos, tn=true_neg, fp=false_pos, fn=false_neg)
    return report

def compute_metrics(task_name, labels, preds):
    """Check that labels and predictions align, then build the metric report.

    Raises:
        AssertionError: If ``preds`` and ``labels`` differ in length.
    """
    n_preds, n_labels = len(preds), len(labels)
    assert n_preds == n_labels
    report = get_eval_report(task_name, labels, preds)
    return report

def show(dic):
    """Print each entry of a metric dictionary as a right-aligned row.

    The ``"task"`` entry is printed as text; every other value is printed
    as a number with three decimals.
    """
    for key, value in dic.items():
        row_format = "%10s:%35s" if key == "task" else "%10s:%35.3f"
        print(row_format % (key, value))


In [3]:
def load_data():
    """Return ``(train_x, train_y, test_x, test_y)`` for the classifier.

    The cached ``.npy`` arrays under ``data/`` are used only when all four
    of them exist. Otherwise the features are rebuilt from
    ``data/train.tsv`` and ``data/dev.tsv`` with ``text_to_feature``.
    Rebuilt features are not written back to the cache by this function.

    Returns:
        The four arrays in the order train features, train labels,
        test features, test labels.
    """
    cache_paths = (
        "data/train_x.npy",
        "data/train_y.npy",
        "data/test_x.npy",
        "data/test_y.npy",
    )
    # Require every cache file: checking only the first one turns a partial
    # cache into a FileNotFoundError instead of a recomputation.
    if all(os.path.exists(cache_path) for cache_path in cache_paths):
        train_x, train_y, test_x, test_y = (
            np.load(cache_path) for cache_path in cache_paths
        )
    else:
        train_x, train_y = text_to_feature("data/train.tsv")
        test_x, test_y = text_to_feature("data/dev.tsv")
    return train_x, train_y, test_x, test_y
    
    
    
# Load the train/test feature matrices and label arrays used by the cells below.
train_x,train_y,test_x,test_y=load_data()

1.099

In [4]:
# Sanity check: display the dimensions of the test feature matrix.
test_x.shape

(13470, 28996)

In [5]:
# Logistic-regression classifier with the lbfgs solver and a multinomial
# objective. C is scikit-learn's inverse regularisation strength.
# NOTE(review): max_iter is left at the library default - confirm that the
# solver converges on this feature matrix.
logreg = LogisticRegression(
    C=1e5,
    solver='lbfgs',
    multi_class='multinomial',
)

In [6]:
# Fit the classifier on the training split; the cell displays the fitted estimator.
logreg.fit(train_x, train_y)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

In [7]:
# Predict a label for every row of the test feature matrix.
pred=logreg.predict(test_x)

In [8]:
# The task label names the model actually fitted above (LogisticRegression);
# it previously read "Naive Bayes", so saved outputs from earlier runs still
# show that label until the notebook is re-executed.
result_dic=compute_metrics("Logistic Regression Sentiment Analysis",test_y,pred)

In [9]:
# Print the metric report as an aligned table.
show(result_dic)

      task:     Naive Bayes Sentiment Analysis
       mcc:                              0.789
        F1:                              0.907
    recall:                              0.910
 precision:                              0.904
  accuracy:                              0.896
        tp:                           6813.000
        tn:                           5257.000
        fp:                            723.000
        fn:                            677.000
