In [1]:
import numpy as np
import random
import re
import nltk
import math
from tabulate import tabulate

In [2]:
def extract_lines(corpus):
    all_lines = list()
    with open(corpus, 'r', encoding = 'latin-1') as rfile:
        for line in rfile:
            all_lines.append(line)
    
    return all_lines

In [3]:
def tokenize_lines_by_words(lines):
    new_lines = list()
    for line in lines:
        new_line = line.lower()
        new_lines.append(nltk.word_tokenize(new_line))
    
    return new_lines

In [4]:
def clean_alphabetic_text_lines(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
        
    return new_lines

In [5]:
def remove_stop_words(lines):
    stopwords = nltk.corpus.stopwords.words('english')
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            if word not in stopwords:
                clean_line.append(word)
        clean_lines.append(clean_line)
    
    return clean_lines

In [6]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [7]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [8]:
def lemmatize_X(X):
    lemmas_X = list()
    tags = ['j', 'n', 'r', 'v']
    wnl = nltk.WordNetLemmatizer()
    for line in X:
        tagged_line = nltk.pos_tag(line)
        #print(tagged_line)
        lemmas_line = list()
        for token in tagged_line:
            tag = token[1]
            if tag in tags:
                lemmatized_token = wnl.lemmatize(token, tag)
            else:
                lemmatized_token = token[0]
            lemmas_line.append(lemmatized_token)
        lemmas_X.append(lemmas_line)
    return lemmas_X

In [9]:
def get_vocabulary(X):
    words = list()
    for line in X:
        for word in line:
            words.append(word)
    vocabulary = list(sorted(set(words)))
    return vocabulary

In [10]:
def get_matrix_X(X, vocabulary):
    matrix_X = list()
    for line in X:
        xi = list()
        xi.append(1)
        m = len(line)
        for word in vocabulary:
            if m != 0:
                xi.append(line.count(word) / m) 
            else:
                xi.append(m)
        xi = np.array(xi)
        matrix_X.append(xi)
    matrix_X = np.array(matrix_X)
    return matrix_X.T

In [11]:
def separateSet(X, Y, test_percentage):
    data_zipped = list(zip(X, Y))
    random.shuffle(data_zipped) # revuelve la data 
    X, Y = zip(*data_zipped) # descomprime el iterable dado

    total_test = math.ceil(len(Y) * test_percentage)
    total_train = len(Y) - total_test

    X_train = X[:total_train]
    X_test = X[total_train:]
    Y_train = Y[:total_train]
    Y_test = Y[total_train:]

    return np.array(X_train), np.array(Y_train), np.array(X_test), np.array(Y_test)

In [12]:
def initialize_w(n):
    w = np.zeros(n)
    return w

In [13]:
def get_z(matrix_X, w):
    z = np.dot(matrix_X, w.T)
    return z

In [14]:
def prediction(z):
    new_Y = np.exp(z) / (1 + np.exp(z))
    return np.array(new_Y)

In [15]:
def j(Y, prediction):
    sum1 = np.sum(Y * np.log(prediction))
    sum2 = np.sum((1 - Y) * np.log(1 - prediction))
    result = - (sum1 + sum2) / len(Y)
    return result

In [16]:
def j_partial(matrix_X, Y, prediction):
    result = list()
    for x in matrix_X.T:
        result.append((1 / len(Y)) * np.dot((prediction - Y), x))
    return np.array(result)

In [17]:
def get_new_w(w, jpartial, alpha):
    #print(w)
    new_w =  w - (alpha * jpartial)
    #print(jpartial)
    #print(new_w)
    return new_w

In [18]:
def train(matrix_X, Y, w, alpha, iterations = 1000):
    for i in range(iterations):
        z = get_z(matrix_X, w)
        new_Y = prediction(z)
        error = j(Y, new_Y)
        new_w = j_partial(matrix_X, Y, new_Y)
        w = get_new_w(w, new_w, alpha)
        if i % 50 == 0:
            print("In iteration", i, "the cost function is", error)
    return w

In [19]:
def test(matrix_X, Y, w):
    z = get_z(matrix_X, w)
    Y_pred = prediction(z)
    conver_predictions = list()
    for i in Y_pred:
        if i >= 0.5:
            conver_predictions.append(1)
        else: 
            conver_predictions.append(0)
    table = []
    correct = 0
    for i in range(len(Y)):
        #error = abs(100 - (100 / (Y[i]) * Y_pred[i]))
        table.append([Y[i], Y_pred[i], conver_predictions[i]])
        if Y[i] == 1:
            if Y_pred[i] >= 0.5:
                correct += 1
        else:
            if Y_pred[i] < 0.5:
                correct += 1
    
    print(tabulate(table, headers = ['Real','Prediction', 'Class Prediction'], tablefmt = "grid", numalign = "center"))
    print("Cost function: ", j(Y, Y_pred))
    print("Accuracy: ", 100 * correct / len(Y), "%")

In [20]:
all_lines = extract_lines('./SMS_Spam_Corpus_big.txt')

In [21]:
random.shuffle(all_lines)

In [22]:
tokenized_lines = tokenize_lines_by_words(all_lines)

In [23]:
clean_alphabetic_lines = clean_alphabetic_text_lines(tokenized_lines)

In [24]:
clean_lines = remove_stop_words(clean_alphabetic_lines)

In [25]:
X, y = get_X_y(clean_lines)

In [26]:
Y = transform_tag(y)

In [27]:
X_lemmatized = lemmatize_X(X)

In [28]:
vocabulary = get_vocabulary(X_lemmatized)

In [29]:
matrix_X = get_matrix_X(X_lemmatized, vocabulary)

In [30]:
X_train, Y_train, X_test, Y_test = separateSet(matrix_X.T, Y, 0.2)

In [31]:
w = initialize_w(len(vocabulary) + 1)

In [32]:
alpha = 9.5

In [33]:
w_train = train(X_train, Y_train, w, alpha, iterations = 1000)

In iteration 0 the cost function is 0.6931471805599452
In iteration 50 the cost function is 0.4321248060153905
In iteration 100 the cost function is 0.36433540011009385
In iteration 150 the cost function is 0.3186713419255036
In iteration 200 the cost function is 0.2853367570642287
In iteration 250 the cost function is 0.25964606750897984
In iteration 300 the cost function is 0.23906045479506666
In iteration 350 the cost function is 0.22207976227907503
In iteration 400 the cost function is 0.20775704033106263
In iteration 450 the cost function is 0.1954623477908106
In iteration 500 the cost function is 0.18475836386982314
In iteration 550 the cost function is 0.17533059805288329
In iteration 600 the cost function is 0.16694612588876406
In iteration 650 the cost function is 0.15942808579758444
In iteration 700 the cost function is 0.15263930608194676
In iteration 750 the cost function is 0.1464714408924956
In iteration 800 the cost function is 0.14083755045814253
In iteration 850 the co

In [34]:
w_train

array([ -0.64651945,  -0.52617109,   6.4598637 , ...,   0.48433764,
        -0.69215148, -11.90519725])

In [35]:
test(X_test, Y_test, w_train)

+--------+--------------+--------------------+
|  Real  |  Prediction  |  Class Prediction  |
|   0    |   0.143419   |         0          |
+--------+--------------+--------------------+
|   1    |   0.869885   |         1          |
+--------+--------------+--------------------+
|   1    |   0.838697   |         1          |
+--------+--------------+--------------------+
|   0    |  0.0914519   |         0          |
+--------+--------------+--------------------+
|   0    |   0.020895   |         0          |
+--------+--------------+--------------------+
|   0    |  0.0054198   |         0          |
+--------+--------------+--------------------+
|   1    |   0.580007   |         1          |
+--------+--------------+--------------------+
|   0    |  0.0239415   |         0          |
+--------+--------------+--------------------+
|   0    |  0.0278126   |         0          |
+--------+--------------+--------------------+
|   0    |   0.020895   |         0          |
+--------+---

In [36]:
def get_confussion_matriz(X_test, Y_test, w_train):
    z = get_z(X_test, w_train)
    predictions = prediction(z)
    conver_predictions = list()
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for i in range(len(Y_test)):
        if predictions[i] >= 0.5 and Y_test[i] == 1:
            TP = TP + 1
        elif predictions[i] >= 0.5 and Y_test[i] == 0:
            FP = FP + 1
        elif predictions[i] < 0.5 and Y_test[i] == 1:
            FN = FN + 1
        elif predictions[i] < 0.5 and Y_test[i] == 0:
            TN = TN + 1
    print('TP:', TP)
    print('FP:', FP)
    print('FN:',FN)
    print('TN:',TN)
    confussion = [[TP, FP],[FN, TN]]
    print(tabulate(confussion, headers=['Actual Value Positive', 'Actual Value Negative'], showindex=['Predicted Value Positive', 'Predicted Value Negative'], tablefmt = 'grid', numalign='center'))
    recall = TP / (TP + FN)
    print('Recall:', recall)
    precision = TP / (TP + FP)
    print('Precision', precision)
    accuracy = (2 * recall * precision) / (recall + precision)
    print('F-measure', accuracy)

In [37]:
len(all_lines)

1324

In [38]:
get_confussion_matriz(X_test, Y_test, w_train)

TP: 59
FP: 3
FN: 9
TN: 194
+--------------------------+-------------------------+-------------------------+
|                          |  Actual Value Positive  |  Actual Value Negative  |
| Predicted Value Positive |           59            |            3            |
+--------------------------+-------------------------+-------------------------+
| Predicted Value Negative |            9            |           194           |
+--------------------------+-------------------------+-------------------------+
Recall: 0.8676470588235294
Precision 0.9516129032258065
F-measure 0.9076923076923077
