In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import vstack, csr_matrix, coo_matrix
import sys
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [9]:
def load_data(fpath, code="big5hkscs"):
    """
    Read a file line by line and decode every line with the given codec.

    The file is opened in binary mode and each line is decoded on its own, so
    a line that cannot be decoded is dropped (and counted) without affecting
    the remaining lines. Trailing whitespace, including the line terminator,
    is stripped from the raw bytes before decoding. The number of dropped
    lines is printed once the whole file has been read.

    Parameters
    ----------
    fpath : str
        Path of the file to read.
    code : str
        Codec name handed to ``bytes.decode``.

    Returns
    -------
    list of str
        The successfully decoded lines, in file order.
    """
    lines = []
    num_errors = 0
    # The context manager closes the handle even if reading fails part-way;
    # the bare open() in a for-loop header left it to the garbage collector.
    with open(fpath, "rb") as f:
        for line in f:
            try:
                lines.append(line.rstrip().decode(code))
            except UnicodeDecodeError:
                num_errors += 1
    print('Encountered %d decoding errors.' % num_errors)

    return lines

def extract_ngram(s, n, delimiter=' ', get_map=False):
    """
    Collect every n-gram of non-delimiter characters found in s.

    A character equal to `delimiter` never belongs to a gram; a gram is made
    of n successive non-delimiter characters, with any delimiters lying
    between them skipped over.

    get_map=True  -> returns a set holding the distinct gram strings.
    get_map=False -> returns a list with one [gram, label] entry per gram, in
                     order of appearance. The label looks at the gram's lower
                     median character and the one after it: 0 when they sit
                     next to each other in s, 1 when at least one delimiter
                     lies between them.

    The label needs n >= 2: with n == 1 and get_map=False the character after
    the median does not exist and an IndexError is raised as soon as a gram is
    found. For n < 1 nothing is extracted.
    """
    res = set() if get_map else []
    # Offsets of all characters that may take part in a gram, in order.
    positions = [pos for pos, ch in enumerate(s) if ch != delimiter]
    if n < 1:
        return res

    mid = int((n - 1) / 2)  # lower median inside a gram
    # Every window of n successive offsets is exactly one gram.
    for start in range(len(positions) - n + 1):
        window = positions[start:start + n]
        gram = "".join(s[pos] for pos in window)
        if get_map:
            res.add(gram)
        else:
            # 4-gram "abcd": label 0 means no separator between b and c
            label = 0 if window[mid] + 1 == window[mid + 1] else 1
            res.append([gram, label])

    return res

def parse_all(data_list):
    """
    Join a list of strings into a single string, writing a double space
    after every element (the last one included).
    """
    return "".join(line + "  " for line in data_list)

def set2dict(s):
    """
    Map every element of the iterable s to its position in iteration order.
    """
    return {key: position for position, key in enumerate(s)}

def encode_word(gram4, grams_map):
    """
    Encode labelled 4-grams as rows of a sparse indicator matrix.

    A 4-gram "abcd" is described by the five features [ab, b, bc, c, cd].
    Each feature is looked up in grams_map to obtain its column, and a 1 is
    stored at that column of the gram's row. When two of the five features
    are the same string (e.g. "abbd" yields "b" twice) the column receives
    two entries, which add up when the matrix is converted or densified.

    Parameters
    ----------
    gram4 : iterable of (str, label)
        4-gram / class label pairs, as produced by extract_ngram with
        get_map=False.
    grams_map : dict
        Maps every 1-gram and 2-gram string to its column index.

    Returns
    -------
    x : scipy.sparse.coo_matrix, shape (number of grams, len(grams_map))
    y : numpy.ndarray holding the labels in input order

    Raises
    ------
    KeyError
        If any of the five features of a gram is missing from grams_map.
    """
    features_per_gram = 5
    cols = []
    labels = []
    for f, v in gram4:  # parse 4 gram and class label
        for e in (f[0:2], f[1:2], f[1:3], f[2:3], f[2:4]):  # encoding vector
            if e not in grams_map:  # error handle when not find any element of encoding vector
                raise KeyError("Feature not found.")
            cols.append(grams_map[e])  # column number is the position in all 1 and 2 grams
        labels.append(v)

    n_rows = len(labels)
    n_cols = len(grams_map)
    if n_rows == 0:
        # vstack / np.stack reject an empty list; hand back empty containers
        # of the right width instead of failing.
        return coo_matrix((0, n_cols)), np.array([], dtype=int)

    # Build the whole matrix in one go rather than stacking one single-row
    # matrix per gram: every gram owns five consecutive entries.
    row = np.repeat(np.arange(n_rows), features_per_gram)
    data = np.ones(len(cols))
    x = coo_matrix((data, (row, np.asarray(cols))), shape=(n_rows, n_cols))
    y = np.stack(labels, axis=0)
    return x, y

def evaluate(clf, x, y):
    """
    Score a binary classifier on (x, y), treating label 1 as the positive class.

    clf.predict(x) is compared element by element with y. Only pairs in which
    both values are 0 or 1 are counted; the four confusion counts are printed
    before returning.

    Parameters
    ----------
    clf : object with a ``predict(x)`` method
    x : feature container accepted by ``clf.predict``
    y : indexable of true labels, aligned with the predictions

    Returns
    -------
    (precision, recall, acc) : tuple of float
        A metric whose denominator is zero (no positive predictions, no
        positive labels, or no counted samples at all) is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    y_pred = clf.predict(x)
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(len(y_pred)):
        predicted, actual = y_pred[i], y[i]
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1
        elif predicted == 0 and actual == 0:
            tn += 1

    total = tp + fp + tn + fn
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    acc = (tp + tn) / total if total else 0.0
    print("tp, fp, tn, fn: ", tp, fp, tn, fn)

    return precision, recall, acc

## Load data as list

In [4]:
# Both files are decoded with the same codec; load_data drops (and counts)
# any line that cannot be decoded. The testing file is read first.
code = "big5hkscs"
data_te_str, data_tr_str = (
    load_data(path, code=code) for path in ("test.txt", "training.txt")
)


Encountered 0 decoding errors.
Encountered 11 decoding errors.


## Concatenate into one string

In [5]:
# parse_all writes the same two-space separator after every line, so the
# combined corpus is the training string followed by the testing string.
data_tr = parse_all(data_tr_str)
data_te = parse_all(data_te_str)
data = data_tr + data_te

In [7]:
# Peek at the first ten decoded training lines.
data_tr_str[:10]

['時間  ：',
 '三月  十日  （  星期四  ）  上午  十時  。',
 '地點  ：',
 '學術  活動  中心  一樓  簡報室  。',
 '主講  ：',
 '民族所  所長  莊英章  先生  。',
 '講題  ：',
 '閩  、  台  漢人  社會  研究  的  若干  考察  。',
 '李  院長  於  二月  二十六日  至  三月  十五日  赴  美  訪問  ，',
 '期間  將  與  在  美  院士  商討  院務  ，']

In [7]:
# Size of the combined corpus string, then the start of the training string.
print(len(data))
data_tr[:100]

20631845


'時間  ：  三月  十日  （  星期四  ）  上午  十時  。  地點  ：  學術  活動  中心  一樓  簡報室  。  主講  ：  民族所  所長  莊英章  先生  。  講題  '

## Build the dictionary of unique 1-grams and 2-grams

In [8]:
# Distinct 1-grams and 2-grams of the combined (training + testing) corpus.
grams1 = extract_ngram(data, 1, get_map=True)
grams2 = extract_ngram(data, 2, get_map=True)
ngrams = grams1 | grams2
tuple(len(grams) for grams in (grams1, grams2, ngrams))

(6140, 746397, 752537)

In [10]:
# Both sets have been merged into ngrams; drop the names so the memory can be reclaimed.
del grams1
del grams2

In [11]:
# Rebind ngrams from the set of unique 1- and 2-grams to a dict that maps
# each gram to an integer index (used later as its feature column).
ngrams = set2dict(ngrams)

## Extract labelled 4-grams and encode the training and testing data

In [12]:
# Labelled 4-grams as [gram, label] pairs; the defaults of extract_ngram
# (space delimiter, get_map=False) are what is wanted here.
grams4_tr = extract_ngram(data_tr, n=4)
grams4_te = extract_ngram(data_te, n=4)

In [13]:
# Number of labelled 4-grams in the training and testing sets.
tuple(len(grams) for grams in (grams4_tr, grams4_te))

(8976185, 18740)

In [14]:
# Each result is an (x, y) pair: sparse feature matrix and label array.
gram_data_tr, gram_data_te = (
    encode_word(grams4, ngrams) for grams4 in (grams4_tr, grams4_te)
)

## Train classifier using logistic regression model 

In [15]:
(x_tr, y_tr), (x_te, y_te) = gram_data_tr, gram_data_te
# row format of sparse matrix, for row slicing
x_tr, x_te = x_tr.tocsr(), x_te.tocsr()

In [16]:
# NOTE(review): no random_state is passed; scikit-learn documents that the
# "sag" solver uses random_state to shuffle the data, so the fitted weights
# may differ between runs — confirm and pin a seed if reproducibility matters.
clf = LogisticRegression(solver="sag", penalty="l2", C=1)
# Fit on the full training set (the former [:, :] / [:] slices selected everything).
clf.fit(x_tr, y_tr)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
# Save model
with open("logreg.mdpkl", "wb") as f:
    pickle.dump(clf, f)

In [37]:
# Load model
with open("logreg.mdpkl", "rb") as f:
    clf = pickle.load(f)

## Evaluation

In [17]:
# Score the classifier on the held-out testing data.
print("Testing results.")
precision, recall, accuracy = evaluate(clf, x_te, y_te)
for label, score in (
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("Accuracy: ", accuracy),
):
    print(label, score)

Testing results.
tp, fp, tn, fn:  11108 829 5929 874
Precision:  0.9305520650079584
Recall:  0.9270572525454849
Accuracy:  0.9091248665955176


In [18]:
# Score the classifier on the data it was trained on, for comparison with the testing scores.
print("Training results.")
precision, recall, accuracy = evaluate(clf, x_tr, y_tr)
for label, score in (
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("Accuracy: ", accuracy),
):
    print(label, score)

Training results.
tp, fp, tn, fn:  5457796 289201 2880514 348674
Precision:  0.9496778926454982
Recall:  0.9399507790447552
Accuracy:  0.9289369592984101
