In [1]:
import numpy as np
import random
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [2]:
def get_lines(corpus):
    all_lines = list()
    with open(corpus, 'r', encoding = 'latin-1') as rfile:
        for line in rfile:
            all_lines.append(line)
    return all_lines

In [3]:
def tokenize(lines):
    new_lines = list()
    for line in all_lines:
        new_line = line.lower()
        new_lines.append(word_tokenize(new_line))
    return new_lines

In [4]:
def clean_text(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
    return new_lines

In [5]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [6]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [7]:
def lemmatize_X(X):
    lemmas_X = list()
    wnl = nltk.WordNetLemmatizer()
    for line in X:
        lemmas_line = list()
        for token in line:
            lemmatize_token = wnl.lemmatize(token)
            lemmas_line.append(lemmatize_token)
        lemmas_X.append(lemmas_line)
    return lemmas_X

In [8]:
def delete_stopwords(X):
    stopwords_list = stopwords.words('english')
    new_X = list()
    for line in X:
        new_line = list()
        for word in line:
            if word not in stopwords_list:
                new_line.append(word)
        new_X.append(new_line)
    return new_X

In [9]:
def get_vocabulary(X):
    words = list()
    for line in X:
        for word in line:
            words.append(word)
    vocabulary = list(sorted(set(words)))
    return vocabulary

In [10]:
def get_Corpus(X, vocabulary):
    matrix_X = list()
    for line in X:
        xi = list()
        xi.append(1)
        m = len(line)
        for word in vocabulary:
            if m != 0:
                xi.append(line.count(word) / m) 
            else:
                xi.append(m)
        xi = np.array(xi)
        matrix_X.append(xi)
    matrix_X =  np.array(matrix_X)
    return matrix_X.T

In [11]:
def initialize_w(n):
    w = np.zeros(n)
    return w

In [12]:
def get_z(Matrix, w):
    z = np.dot(w.T, Matrix)
    return z

In [29]:
def prediction(z):
    new_y = np.exp(z) / (1 + np.exp(z))
    return np.array(new_y)

In [14]:
def j(y, prediction):
    sum1 = np.sum(y * np.log(prediction))
    sum2 = np.sum((1 - y) * np.log(1 - prediction))
    result = - (sum1 + sum2) / len(y)
    return result

In [15]:
def j_partial(X, y, prediction):
    result = list()
    for x in X:
        result = (1 / len(y)) * np.dot((prediction - y), x)
    return np.array(result)

In [35]:
def get_new_w(w, jpartial, alpha = 0.0005):
    new_w =  w - (alpha * jpartial)
    return new_w

In [17]:
def train(X, y, w, iterations = 1000):
    for i in range(iterations):
        z = get_z(X, w)
        new_y = prediction(z)
        error = j(y, new_y)
        print(error)
        new_w = j_partial(X, y, new_y)
        w = get_new_w(w, new_w)
    return w

In [18]:
all_lines = get_lines('./SMS_Spam_Corpus_big.txt')

In [19]:
random.shuffle(all_lines)

In [20]:
tokenized_lines = tokenize(all_lines)

In [21]:
clean_lines = clean_text(tokenized_lines)

In [22]:
X, y = get_X_y(clean_lines)

In [23]:
y = transform_tag(y)

In [24]:
X_lemmatized = lemmatize_X(X)

In [25]:
vocabulary = get_vocabulary(X_lemmatized)

In [26]:
Matrix_X = get_Corpus(X_lemmatized, vocabulary)

In [27]:
n = len(vocabulary)
w = initialize_w(n + 1)

In [36]:
w = train(Matrix_X, y, w)

0.5902418273721125
0.5902412505270596
0.5902406736887081
0.5902400968570579
0.5902395200321089
0.5902389432138607
0.5902383664023138
0.5902377895974678
0.5902372127993225
0.5902366360078781
0.5902360592231346
0.5902354824450919
0.5902349056737499
0.5902343289091084
0.5902337521511675
0.5902331753999271
0.5902325986553874
0.5902320219175476
0.5902314451864085
0.5902308684619696
0.5902302917442309
0.5902297150331923
0.590229138328854
0.5902285616312155
0.5902279849402773
0.5902274082560389
0.5902268315785003
0.5902262549076617
0.5902256782435227
0.5902251015860834
0.5902245249353439
0.5902239482913036
0.5902233716539635
0.5902227950233223
0.5902222183993808
0.5902216417821385
0.5902210651715956
0.5902204885677521
0.5902199119706076
0.5902193353801625
0.5902187587964162
0.5902181822193688
0.5902176056490206
0.5902170290853713
0.5902164525284208
0.590215875978169
0.5902152994346163
0.5902147228977619
0.5902141463676062
0.5902135698441491
0.5902129933273904
0.5902124168173304
0.590211840313

0.5899909576235487
0.5899903836948142
0.5899898097727561
0.5899892358573742
0.5899886619486684
0.5899880880466388
0.5899875141512854
0.589986940262608
0.5899863663806064
0.5899857925052808
0.5899852186366312
0.5899846447746573
0.589984070919359
0.5899834970707364
0.5899829232287894
0.5899823493935181
0.5899817755649224
0.5899812017430021
0.589980627927757
0.5899800541191874
0.5899794803172929
0.589978906522074
0.5899783327335298
0.589977758951661
0.5899771851764672
0.5899766114079484
0.5899760376461046
0.5899754638909356
0.5899748901424413
0.5899743164006219
0.5899737426654773
0.5899731689370071
0.5899725952152117
0.5899720215000906
0.5899714477916442
0.5899708740898725
0.5899703003947746
0.5899697267063515
0.5899691530246023
0.5899685793495274
0.5899680056811267
0.5899674320194002
0.5899668583643474
0.5899662847159688
0.5899657110742639
0.5899651374392331
0.5899645638108759
0.5899639901891924
0.5899634165741827
0.5899628429658467
0.5899622693641843
0.5899616957691952
0.589961122180879

0.5897413570267456
0.5897407860034712
0.5897402149868486
0.5897396439768772
0.5897390729735573
0.5897385019768886
0.5897379309868711
0.5897373600035049
0.5897367890267895
0.5897362180567253
0.5897356470933123
0.58973507613655
0.5897345051864387
0.5897339342429778
0.5897333633061682
0.5897327923760091
0.5897322214525008
0.5897316505356429
0.5897310796254357
0.5897305087218787
0.5897299378249724
0.5897293669347164
0.5897287960511106
0.5897282251741552
0.5897276543038502
0.5897270834401949
0.5897265125831899
0.5897259417328348
0.5897253708891299
0.5897248000520748
0.5897242292216694
0.589723658397914
0.5897230875808083
0.5897225167703521
0.5897219459665458
0.5897213751693889
0.5897208043788817
0.5897202335950238
0.5897196628178154
0.5897190920472563
0.5897185212833462
0.5897179505260857
0.5897173797754742
0.589716809031512
0.5897162382941987
0.5897156675635344
0.5897150968395192
0.5897145261221526
0.5897139554114351
0.5897133847073663
0.5897128140099461
0.5897122433191746
0.58971167263505

In [31]:
z = get_z(Matrix_X, w)

In [32]:
z

array([-0.54266406, -0.54266406, -0.54266406, ..., -0.54266406,
       -0.54266406, -0.54266406])

In [33]:
new_y = prediction(z)

In [34]:
for i in range(len(y)):
    if new_y[i] >= 0.4:
        print(y[i], new_y[i])

0 0.43258011325643514
