In [1]:
from pyvi import ViTokenizer #For split vietnamese words
import pandas as pd #For reading xlsx file
from gensim.parsing.preprocessing import strip_non_alphanum, strip_multiple_whitespaces,preprocess_string, split_alphanum, strip_short, strip_numeric
import re
import numpy as np
import string
from sklearn.utils import shuffle

In [2]:
def sigmoid(z):
    """Logistic sigmoid, 1 / (1 + exp(-z)).

    z is passed straight to np.exp, so it may be a scalar or a NumPy array;
    for an array the function is applied element-wise.
    """
    denominator = 1 + np.exp(-z)
    return 1/denominator

In [3]:
def prob(w,X):
    """Sigmoid of the linear term z = X.dot(w)."""
    linear_term = X.dot(w)
    return sigmoid(linear_term)

In [4]:
def loss(w, X, y, lam):
    """Regularised logistic loss: mean cross-entropy plus an L2 penalty.

    Computes
        -mean(y*log(p) + (1-y)*log(1-p)) + 0.5*lam/N * sum(w*w)
    with p = sigmoid(X.dot(w)) and N = X.shape[0].

    Parameters:
        w:   weight vector.
        X:   feature matrix; X.dot(w) must give one score per label in y.
        y:   array of 0/1 labels.
        lam: regularisation strength.

    Returns:
        The scalar loss. The penalty is added outside the mean, so a larger
        lam (with w fixed and non-zero) always gives a larger loss.
    """
    scores = X.dot(w)
    # log(1 + exp(s)) - y*s is algebraically equal to
    # -(y*log(p) + (1-y)*log(1-p)) but never evaluates log(0), so saturated
    # predictions give a large finite value instead of inf/nan.
    cross_entropy = np.mean(np.logaddexp(0, scores) - y*scores)
    l2_penalty = 0.5*lam/X.shape[0]*np.sum(w*w)
    return cross_entropy + l2_penalty

In [5]:
def logistic_regression(w_init, X, y, lam =0.001, lr=0.1, nEpoches=2000):
    """Train logistic-regression weights by stochastic gradient descent.

    Each epoch visits every sample once in a fresh random order and applies
        w <- w - lr * ((sigmoid(x_i.w) - y_i) * x_i + lam * w)
    The lam*w term is the gradient of an L2 (squared-norm) penalty.

    Parameters:
        w_init:   starting weight vector.
        X:        feature matrix; its shape is unpacked as (N, d).
        y:        labels, indexed by sample.
        lam:      regularisation strength.
        lr:       learning rate.
        nEpoches: maximum number of passes over the data.

    Returns:
        (w, loss_hist): the final weights and a list holding the initial loss
        followed by the loss after each epoch that was run.

    Side effects: prints N and draws from NumPy's global random generator.
    """
    N, d = X.shape
    print(N)
    w = w_init
    w_prev = w_init
    loss_hist = [loss(w_init, X, y, lam)]
    epochs_run = 0
    while epochs_run < nEpoches:
        epochs_run += 1
        for i in np.random.permutation(N):
            sample, target = X[i], y[i]
            p_i = sigmoid(sample.dot(w))
            w = w - lr*((p_i - target)*sample + lam*w)
        loss_hist.append(loss(w, X, y, lam))
        # Stop early once a whole epoch barely moved the weights.
        if np.linalg.norm(w - w_prev)/d < 1e-6:
            break
        w_prev = w
    return w, loss_hist

In [6]:
def predict(w, X, threshold=0.5):
    """Label each sample 1 where prob(w, X) exceeds threshold, otherwise 0.

    X is described by the notebook as an (N, D) matrix: N samples, D features.
    Returns a float array of length X.shape[0] containing 0.0 / 1.0.
    """
    labels = np.zeros(X.shape[0])
    positive_rows = np.where(prob(w, X) > threshold)[0]
    labels[positive_rows] = 1
    return labels

In [7]:
def accurracy(w,X_test, y_text):
    """Fraction of samples in X_test whose predicted label equals the true label.

    Parameters:
        w:      trained weight vector, passed to predict().
        X_test: feature matrix to evaluate; X_test.shape[0] samples.
        y_text: true labels, indexed in the same order as the rows of X_test.
                (The parameter name is kept as-is so existing callers still work.)

    Returns:
        Number of matching predictions divided by X_test.shape[0].

    Raises:
        ZeroDivisionError: if X_test has no rows.
    """
    y_predict = predict(w, X_test)
    N = X_test.shape[0]
    # Score against the labels that were passed in, not a notebook-level
    # variable that merely happens to be called y_test.
    correct = sum(1 for m in range(N) if y_predict[m] == y_text[m])
    return correct/N

In [8]:
# Example: predicting on the breast cancer dataset (wdbc.txt)
# A = np.loadtxt('wdbc.txt', delimiter=',')
# y = y_test=np.array(A[:,1])
# X = X_test=np.array(A[:,2:])
# w_init = np.random.randn(X.shape[1])
# lam =0.0001
# w, loss_hist=logistic_regression(w_init, X, y, lam, lr =0.05, nEpoches = 500)
# x_input =np.array(A[:, 2:])
# print(x_input.shape[0])
# #y_pre=predict(w,x_input,threshold = 0.5)
# a=accurracy(w,X_test, y_test)
# print(a)

In [9]:
def raw_text_preprocess(raw):
    """Normalise one raw message into a cleaned, word-segmented string.

    Steps, in order:
      1. lowercase the text;
      2. replace URLs / www-addresses / e-mail-like tokens with "URL_Change";
      3. replace numeric dates (slash, dash and "ngày .. tháng .. năm .."
         forms, two-digit day/month) with "DATE_Change";
      4. strip_numeric (gensim) on what is left;
      5. ViTokenizer.tokenize for Vietnamese word segmentation;
      6. strip_short (gensim) with minsize=2.
    """
    # (pattern, placeholder) pairs; they must be applied in exactly this order
    # because later patterns see the output of earlier substitutions.
    substitutions = (
        (r"http\S+", "URL_Change"),
        (r"https\S+", "URL_Change"),
        (r"(www\S+)|(\S+@\S+)", "URL_Change"),
        (r"([0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9])|([0-9][0-9]/[0-9][0-9])|([0-9][0-9]/[0-9][0-9][0-9][0-9])", "DATE_Change"),
        (r"([0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9])|([0-9][0-9]-[0-9][0-9])|([0-9][0-9]-[0-9][0-9][0-9][0-9])", "DATE_Change"),
        (r"(ngày [0-9][0-9] tháng [0-9][0-9] năm [0-9][0-9][0-9][0-9])|(ngày [0-9][0-9] tháng [0-9][0-9])|(tháng [0-9][0-9] năm [0-9][0-9][0-9][0-9])", "DATE_Change"),
    )
    text = raw.lower()
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    text = strip_numeric(text)
    text = ViTokenizer.tokenize(text)
    return strip_short(text, minsize=2)

In [10]:
# Shared corpus accumulators: read_file() appends to both, one entry per line read.
document = []
label = []
def read_file(file):
    """Append every line of a UTF-8 text file to the global document/label lists.

    For each line, the text from character index 7 onward is appended to
    `document`; `label` receives 0 when 'non' occurs within the first six
    characters of the line and 1 otherwise.
    """
    with open(file, 'r', encoding = 'utf-8') as f:
        for line in f:
            tag, text = line[: 6], line[7: ]
            document.append(text)
            label.append(0 if 'non' in tag else 1)

# Load both data files into the shared document/label lists (read_file appends,
# so the second call extends what the first one loaded), then report how many
# lines were read in total.
read_file('data_mail.txt')
read_file('data_email.txt')
print(len(document))

391


In [11]:
# Clean every message, then hold out the first 80 documents (and their labels)
# as the test set; the remainder becomes the training set.
# NOTE: this cell rebinds `document` and `label`, so running it a second time
# without re-running the loading cell preprocesses and slices the data again.
document = list(map(raw_text_preprocess, document))
document_test, document = document[: 80], document[80 :]
label_test, label = label[: 80], label[80 :]
# print(document_test)

In [12]:
def dict_word(document, min_count=3):
    """Build the vocabulary: tokens that occur at least min_count times overall.

    Parameters:
        document:  iterable of strings; each is split on single spaces
                   (doc.split(" ")), so consecutive spaces yield '' tokens.
        min_count: minimum total number of occurrences, across all documents,
                   for a token to be kept (default 3).

    Returns:
        A sorted list of the kept tokens.
    """
    counts = {}
    for doc in document:
        for tok in doc.split(" "):
            counts[tok] = counts.get(tok, 0) + 1
    # Filter once, after all documents are counted. Counts only ever grow, so
    # this selects the same tokens as re-filtering after every document would,
    # without the extra pass over the whole vocabulary per document.
    return sorted(tok for tok, n in counts.items() if n >= min_count)

In [13]:
def read_data(document, label, words):
    """Encode documents as token-count vectors over the vocabulary `words`.

    Parameters:
        document: sequence of strings, each split on single spaces.
        label:    labels, converted to a NumPy array and returned alongside.
        words:    vocabulary; column k of the result counts words[k].

    Returns:
        (features, labels): features is an integer array of shape
        (len(document), len(words)) where features[r, k] is how many times
        words[k] occurs in document r; labels is np.array(label).
    """
    features = np.zeros((len(document), len(words)), dtype = int)
    for row, doc in enumerate(document):
        # Per-document token frequencies.
        counts = {}
        for tok in doc.split(" "):
            counts[tok] = counts.get(tok, 0) + 1
        # Tokens outside the vocabulary are ignored; absent words count as 0.
        features[row, :] = [counts.get(key, 0) for key in words]
    return (features, np.array(label))

In [14]:
# The vocabulary is built from the training documents only; both the training
# and the held-out documents are then encoded against that same vocabulary.
words = dict_word(document)
X_train, y_train = read_data(document, label, words)
X_test, y_test = read_data(document_test, label_test, words)
# print(len(train_data))
# print(len(train_data[0]))
# print(train_data[0])


In [15]:
# Seed NumPy's global random generator so that the random initial weights and
# the per-epoch sample shuffling inside logistic_regression give the same
# result on every Restart & Run All.
np.random.seed(42)
w_init = np.random.randn(X_train.shape[1])
lam = 0.00001
w, loss_hist = logistic_regression(w_init, X_train, y_train, lam, lr=0.05, nEpoches=1000)
# Accuracy on the held-out documents.
a = accurracy(w, X_test, y_test)
print(a)

NameError: name 'train_data' is not defined

In [None]:
# Number of features (vocabulary size) in the training matrix. The training
# matrix is named X_train in this notebook; `train_data` is never defined.
print(X_train.shape[1])