In [3]:
import pandas as pd
import numpy as np
import load_data as ld

In [4]:
data = ld.loadData()

# Task 1a

In [21]:
def train(matrix, labels):
    """
    @matrix: cols : wordcounts, row: mails
    @labels: spam not spam
    @returns: p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam
    """
    # Convert matrix to dataframe
    df = pd.DataFrame(matrix)
    # print('Matrix:\n ', df.head(5))

    # TrainLabels represents an array consisting of 0/1 entries. The ith entry of this array
    # indicates whether the ith document is spam (1 = spam, 0 = non-spam).
    # print('Labels:\n ', labels[:5], 'Size: ', labels.shape)

    # Add column spam from labels
    df['spam'] = labels
    # print('Matrix with labels:\n ', df.head(5))

    # Calculate the number of messages that are spam
    # print('Number of columns with are marked as spam: ', df[df['spam'] == 1][1].count())

    # Calculate the probability of spam messages
    pspam = df[df['spam'] == 1][1].count() / len(df)
    # print('Probability of spam messages', pspam)

    # Calculate the probability of non-spam messages
    pnonspam = df[df['spam'] == 0][1].count() / len(df)
    # print('Probability of non-spam messages', pnonspam)

    bag_spam = df[df['spam'] == 1].sum()
    # print('Number of spam words within each message (bag_spam):\n', bag_spam[:5])

    bag_nonspam = df[df['spam'] == 0].sum()
    # print('Number of non-spam words within each message (nonbag_spam):\n', bag_nonspam[:5])

    bag_spam_totalwords = bag_spam.drop('spam').sum()
    # print('bag_spam_totalwords', bag_spam_totalwords)

    bag_non_spam_totalwords = bag_nonspam.drop('spam').sum()
    # print('bag_non_spam_totalwords', bag_non_spam_totalwords)

    p_spam = bag_spam / bag_spam_totalwords
    # print('Probability of bag_spam words per message:\n', p_spam.head(5))
    p_not_spam = bag_nonspam / bag_non_spam_totalwords
    # print('Probability of non_bag_spam words per message:\n', p_not_spam.head(5))

    return p_spam.drop('spam').values, p_not_spam.drop('spam').values, pspam, pnonspam

In [23]:
train(data['trainMatrix'], data['trainLabels'])

Probability of spam messages 0.49906716417910446
Probability of non-spam messages 0.5009328358208955


(array([0.00026494, 0.0008232 , 0.00011354, ..., 0.00038795, 0.00292378,
        0.00013247]),
 array([2.17891626e-04, 9.39657638e-04, 1.49800493e-04, ...,
        1.36182266e-05, 5.71965519e-04, 1.77036946e-04]),
 0.49906716417910446,
 0.5009328358208955)

# Task 1b

In [27]:
def test(test_data, test_labels, p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam):
    """
    returns error rate, predicted labels
    """
    p_m_spam = [np.prod(np.where(td >= 1, p_x_spam ** td, 1)) for td in test_data]
    p_m_non_spam = [np.prod(np.where(td >= 1, p_x_non_spam ** td, 1)) for td in test_data]
    p_m_spam = np.array(p_m_spam)
    p_m_non_spam = np.array(p_m_non_spam)
    p_m_and_spam = p_m_spam * p_tot_spam
    p_m_and_non_spam = p_m_non_spam * p_tot_non_spam
    # [1,0,1,1] == [1,1,1,1] = 1,0,1,1 = 3
    correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)
    my_solution = (p_m_and_spam > p_m_and_non_spam)
    error_rate = (len(test_labels) - correct) / len(test_labels)
    return error_rate, my_solution



# Task 1a, Test Cases

In [28]:
# Test Case Training 1
test(data['trainMatrixEx1'],
     data['testLabelsEx1'],
     *train(data['trainMatrixEx1'],
            data['trainLabelsEx1'])
     )

Probability of spam messages 0.75
Probability of non-spam messages 0.25


  correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)


TypeError: 'bool' object is not iterable

In [26]:
# Test Case Training 2
test(data['trainMatrixEx2'],
     data['trainLabelsEx2'],
     *train(data['trainMatrixEx2'],
            data['trainLabelsEx2'])
     )

Probability of spam messages 0.375
Probability of non-spam messages 0.625


(0.0, array([ True,  True,  True, False, False, False, False, False]))

# Task 1b, Test Cases

In [10]:
# Test Case Test 1
test(data['testMatrixEx1'],
     data['testLabelsEx1'],
     *train(data['trainMatrixEx1'],
            data['trainLabelsEx1'])
     )

(0.3333333333333333, array([ True,  True,  True, False,  True,  True]))

In [11]:
# Test Case Test 2
test(data['testMatrixEx2'],
     data['testLabelsEx2'],
     *train(data['trainMatrixEx2'],
           data['trainLabelsEx2'])
)

(0.16666666666666666, array([ True,  True, False, False, False, False]))

# Task 2, , Script train_laplace

In [12]:
def train_laplace(matrix, labels):
    """
    @matrix: cols : wordcounts, row: mails
    @labels: spam not spam
    @returns: p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam
    """
    df = pd.DataFrame(matrix)
    df['spam'] = labels
    pspam = df[df['spam'] == 1][1].count() / len(df)
    pnonspam = df[df['spam'] == 0][1].count() / len(df)
    bag_spam = df[df['spam'] == 1].sum()
    bag_nonspam = df[df['spam'] == 0].sum()
    bag_spam_totalwords = bag_spam.drop('spam').sum()
    bag_non_spam_totalwords = bag_nonspam.drop('spam').sum()
    p_spam = (bag_spam + np.ones(bag_spam.values.shape)) / (bag_spam_totalwords + len(df.columns)-1)
    p_not_spam = (bag_nonspam + np.ones(bag_nonspam.values.shape))/ (bag_non_spam_totalwords + len(df.columns)-1)
    return p_spam.drop('spam').values, p_not_spam.drop('spam').values, pspam, pnonspam

# Task 2, Script test_laplace

In [17]:
def test_laplace(test_data, test_labels, p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam):
    """
    returns error rate, predicted labels
    """

    p_m_spam = ([np.sum(np.log(p_x_spam) * td) for td in test_data])
    p_m_non_spam = ([np.sum(np.log(p_x_non_spam) * td) for td in test_data])
    p_m_spam = np.array(p_m_spam)
    p_m_non_spam = np.array(p_m_non_spam)
    # With np.log do a sum not a multiplication
    p_m_and_spam = p_m_spam + np.log(p_tot_spam, where=p_tot_spam >0)
    p_m_and_non_spam = p_m_non_spam + np.log(p_tot_non_spam, where = p_tot_non_spam > 0)



    correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)
    return (len(test_labels) - correct) / len(test_labels), (p_m_and_spam > p_m_and_non_spam)

# Task 2, Test Cases

First Test Case

In [18]:
# Task 2, First case, correct results
comp = test_laplace(data['testMatrixEx1'],
                    data['testLabelsEx1'],
                    *train_laplace(data['trainMatrixEx1'],
                                   data['trainLabelsEx1'])
                    )

act = train_laplace(data['trainMatrixEx1'],
                    data['trainLabelsEx1'])


act, comp

((array([0.2, 0.2, 0.3, 0.1, 0.2]),
  array([0.25      , 0.16666667, 0.16666667, 0.16666667, 0.25      ]),
  0.75,
  0.25),
 (0.5, array([ True,  True,  True,  True,  True,  True])))

Second Test Case

In [19]:
# Task 2, Second case, correct results
comp = test_laplace(data['testMatrixEx2'],
                    data['testLabelsEx2'],
                    *train_laplace(data['trainMatrixEx2'],
                                   data['trainLabelsEx2'])
                    )

act = train_laplace(data['trainMatrixEx2'],
                    data['trainLabelsEx2'])


act, comp

((array([0.52, 0.12, 0.08, 0.16, 0.12]),
  array([0.2 , 0.12, 0.32, 0.12, 0.24]),
  0.375,
  0.625),
 (0.16666666666666666, array([ True,  True, False, False, False, False])))

Third Test Case

In [20]:
# Task 2, Third case, wrong results
# How come that here are wrong results?
comp = test_laplace(data['testMatrixEx3'],
                    data['testLabelsEx3'],
                    *train_laplace(data['trainMatrixEx3'],
                                   data['trainLabelsEx3'])
                    )

act = train_laplace(data['trainMatrixEx3'],
                    data['trainLabelsEx3'])


act, comp

((array([0.00060569, 0.00060569, 0.00060569, 0.00121139, 0.00060569,
         0.00060569, 0.00060569, 0.00060569, 0.00060569, 0.00060569,
         0.00060569, 0.00060569, 0.00060569, 0.01090248, 0.00181708,
         0.00060569, 0.00121139, 0.00181708, 0.00121139, 0.00181708,
         0.00060569, 0.00060569, 0.00060569, 0.00060569, 0.00060569,
         0.00060569, 0.00060569, 0.00121139, 0.00060569, 0.00060569,
         0.00060569, 0.00060569, 0.00060569, 0.00181708, 0.00060569,
         0.00060569, 0.00121139, 0.00060569, 0.00060569, 0.00060569,
         0.00060569, 0.00060569, 0.00060569, 0.00121139, 0.00121139,
         0.00060569, 0.00060569, 0.00060569, 0.00060569, 0.00060569,
         0.00060569, 0.00060569, 0.00242277, 0.00060569, 0.00060569,
         0.00181708, 0.00060569, 0.00060569, 0.00060569, 0.00121139,
         0.00060569, 0.00060569, 0.00060569, 0.00060569, 0.00060569,
         0.00181708, 0.00060569, 0.00060569, 0.00060569, 0.00060569,
         0.00121139, 0.00060569, 0