In [15]:
import pandas as pd
import numpy as np
import load_data as ld

In [16]:
data = ld.loadData()

In [22]:
def train(matrix, labels):
    """
    @matrix: cols : wordcounts, row: mails
    @labels: spam not spam
    @returns: p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam
    """
    # Convert matrix to dataframe
    df = pd.DataFrame(matrix)
    print('Matrix:\n ', df.head(5))

    # TrainLabels represents an array consisting of 0/1 entries. The ith entry of this array
    # indicates whether the ith document is spam (1 = spam, 0 = non-spam).
    print('Labels:\n ', labels[:5], 'Size: ', labels.shape)

    # Add column spam from labels
    df['spam'] = labels
    print('Matrix with labels:\n ', df.head(5))

    # Calculate the number of messages that are spam
    print('Number of columns with are marked as spam: ', df[df['spam'] == 1][1].count())

    # Calculate the probability of spam messages
    pspam = df[df['spam'] == 1][1].count() / len(df)
    print('Probability of spam messages', pspam)

    # Calculate the probability of non-spam messages
    pnonspam = df[df['spam'] == 0][1].count() / len(df)
    print('Probability of non-spam messages', pnonspam)

    bag_spam = df[df['spam'] == 1].sum()
    print('Number of spam words within each message (bag_spam):\n', bag_spam[:5])

    bag_nonspam = df[df['spam'] == 0].sum()
    print('Number of non-spam words within each message (nonbag_spam):\n', bag_nonspam[:5])

    bag_spam_totalwords = bag_spam.drop('spam').sum()
    print('bag_spam_totalwords', bag_spam_totalwords)

    bag_non_spam_totalwords = bag_nonspam.drop('spam').sum()
    print('bag_non_spam_totalwords', bag_non_spam_totalwords)

    p_spam = bag_spam / bag_spam_totalwords
    print('Probability of bag_spam words per message:\n', p_spam.head(5))
    p_not_spam = bag_nonspam / bag_non_spam_totalwords
    print('Probability of of non_bag_spam words per message:\n', p_not_spam.head(5))

    return p_spam.drop('spam').values, p_not_spam.drop('spam').values, pspam, pnonspam

In [23]:
train(data['trainMatrix'], data['trainLabels'])

Matrix:
     0     1     2     3     4     5     6     7     8     9     ...  1438  \
0     0     0     0     1     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     0     0     0  ...     0   

   1439  1440  1441  1442  1443  1444  1445  1446  1447  
0     0     0     0     0     0     0     0     0     0  
1     0     0     0     2     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 1448 columns]
Labels:
  [1 0 1 0 0] Size:  (2144,)
Matrix with labels:
     0  1  2  3  4  5  6  7  8  9  ...  1439  1440  1441  1442  1443  1444  \
0  0  0  0  1  0  0  0  0

(array([0.00026494, 0.0008232 , 0.00011354, ..., 0.00038795, 0.00292378,
        0.00013247]),
 array([2.17891626e-04, 9.39657638e-04, 1.49800493e-04, ...,
        1.36182266e-05, 5.71965519e-04, 1.77036946e-04]),
 0.49906716417910446,
 0.5009328358208955)

In [24]:
def test(test_data, test_labels, p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam):
    """
    returns error rate, predicted labels
    """
    p_m_spam = [np.prod(np.where(td >= 1, p_x_spam ** td, 1)) for td in test_data]
    p_m_non_spam = [np.prod(np.where(td >= 1, p_x_non_spam ** td, 1)) for td in test_data]
    p_m_spam = np.array(p_m_spam)
    p_m_non_spam = np.array(p_m_non_spam)
    p_m_and_spam = p_m_spam * p_tot_spam
    p_m_and_non_spam = p_m_non_spam * p_tot_non_spam

    correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)
    return (len(test_labels) - correct) / len(test_labels), (p_m_and_spam > p_m_and_non_spam)



In [26]:
test(data['trainMatrixEx1'],
     data['testLabelsEx1'],
     *train(data['trainMatrixEx1'],
            data['trainLabelsEx1'])
     )

Matrix:
     0  1  2  3  4
0  1  1  0  0  1
1  1  1  0  0  1
2  0  1  1  0  0
3  1  1  1  0  0
4  0  0  1  0  0
Labels:
  [1 0 1 1 1] Size:  (8,)
Matrix with labels:
     0  1  2  3  4  spam
0  1  1  0  0  1     1
1  1  1  0  0  1     0
2  0  1  1  0  0     1
3  1  1  1  0  0     1
4  0  0  1  0  0     1
Number of columns with are marked as spam:  6
Probability of spam messages 0.75
Probability of non-spam messages 0.25
Number of spam words within each message (bag_spam):
 0    3
1    3
2    5
3    1
4    3
dtype: int64
Number of non-spam words within each message (nonbag_spam):
 0    2
1    1
2    1
3    1
4    2
dtype: int64
bag_spam_totalwords 15
bag_non_spam_totalwords 7
Probability of bag_spam words per message:
 0    0.200000
1    0.200000
2    0.333333
3    0.066667
4    0.200000
dtype: float64
Probability of of non_bag_spam words per message:
 0    0.285714
1    0.142857
2    0.142857
3    0.142857
4    0.285714
dtype: float64


  correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)


TypeError: 'bool' object is not iterable

In [29]:
test(data['trainMatrixEx2'],
     data['trainLabelsEx2'],
     *train(data['trainMatrixEx2'],
            data['trainLabelsEx2'])
     )

Matrix:
     0  1  2  3  4
0  2  1  0  3  0
1  4  1  0  0  2
2  6  0  1  0  0
3  1  1  2  0  0
4  0  0  1  1  1
Labels:
  [1 1 1 0 0] Size:  (8,)
Matrix with labels:
     0  1  2  3  4  spam
0  2  1  0  3  0     1
1  4  1  0  0  2     1
2  6  0  1  0  0     1
3  1  1  2  0  0     0
4  0  0  1  1  1     0
Number of columns with are marked as spam:  3
Probability of spam messages 0.375
Probability of non-spam messages 0.625
Number of spam words within each message (bag_spam):
 0    12
1     2
2     1
3     3
4     2
dtype: int64
Number of non-spam words within each message (nonbag_spam):
 0    4
1    2
2    7
3    2
4    5
dtype: int64
bag_spam_totalwords 20
bag_non_spam_totalwords 20
Probability of bag_spam words per message:
 0    0.60
1    0.10
2    0.05
3    0.15
4    0.10
dtype: float64
Probability of of non_bag_spam words per message:
 0    0.20
1    0.10
2    0.35
3    0.10
4    0.25
dtype: float64


(0.0, array([ True,  True,  True, False, False, False, False, False]))

In [25]:
test(data['testMatrixEx1'],
     data['testLabelsEx1'],
     *train(data['trainMatrixEx1'],
            data['trainLabelsEx1'])
     )

Matrix:
     0  1  2  3  4
0  1  1  0  0  1
1  1  1  0  0  1
2  0  1  1  0  0
3  1  1  1  0  0
4  0  0  1  0  0
Labels:
  [1 0 1 1 1] Size:  (8,)
Matrix with labels:
     0  1  2  3  4  spam
0  1  1  0  0  1     1
1  1  1  0  0  1     0
2  0  1  1  0  0     1
3  1  1  1  0  0     1
4  0  0  1  0  0     1
Number of columns with are marked as spam:  6
Probability of spam messages 0.75
Probability of non-spam messages 0.25
Number of spam words within each message (bag_spam):
 0    3
1    3
2    5
3    1
4    3
dtype: int64
Number of non-spam words within each message (nonbag_spam):
 0    2
1    1
2    1
3    1
4    2
dtype: int64
bag_spam_totalwords 15
bag_non_spam_totalwords 7
Probability of bag_spam words per message:
 0    0.200000
1    0.200000
2    0.333333
3    0.066667
4    0.200000
dtype: float64
Probability of of non_bag_spam words per message:
 0    0.285714
1    0.142857
2    0.142857
3    0.142857
4    0.285714
dtype: float64


(0.3333333333333333, array([ True,  True,  True, False,  True,  True]))

In [9]:
test(data['testMatrixEx2'], 
     data['testLabelsEx2'],
     *train(data['trainMatrixEx2'],
           data['trainLabelsEx2'])
)

Matrix:
     0  1  2  3  4
0  2  1  0  3  0
1  4  1  0  0  2
2  6  0  1  0  0
3  1  1  2  0  0
4  0  0  1  1  1
Labels:
  [1 1 1 0 0] Size:  (8,)
Matrix with labels:
     0  1  2  3  4  spam
0  2  1  0  3  0     1
1  4  1  0  0  2     1
2  6  0  1  0  0     1
3  1  1  2  0  0     0
4  0  0  1  1  1     0
Number of columns with are marked as spam:  3
Probability of spam messages 0.375
Probability of non-spam messages 0.625
Number of spam words within each message (bag_spam):
 0    12
1     2
2     1
3     3
4     2
dtype: int64
Number of non-spam words within each message (nonbag_spam):
 0    4
1    2
2    7
3    2
4    5
dtype: int64
bag_spam_totalwords 20
bag_non_spam_totalwords 20
Probability of bag_spam words per message:
 0    0.60
1    0.10
2    0.05
3    0.15
4    0.10
dtype: float64
Probability of of non_bag_spam words per message:
 0    0.20
1    0.10
2    0.35
3    0.10
4    0.25
dtype: float64


(0.16666666666666666, array([ True,  True, False, False, False, False]))

In [10]:
def train_laplace(matrix, labels): 
    """
    @matrix: cols : wordcounts, row: mails
    @labels: spam not spam
    @returns: p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam
    """
    df = pd.DataFrame(matrix)
    df['spam'] = labels
    pspam = df[df['spam'] == 1][1].count() / len(df)
    pnonspam = df[df['spam'] == 0][1].count() / len(df)
    bag_spam = df[df['spam'] == 1].sum()
    bag_nonspam = df[df['spam'] == 0].sum()
    bag_spam_totalwords = bag_spam.drop('spam').sum()
    bag_non_spam_totalwords = bag_nonspam.drop('spam').sum()
    p_spam = (bag_spam + np.ones(bag_spam.values.shape)) / (bag_spam_totalwords + len(df.columns)-1)
    p_not_spam = (bag_nonspam + np.ones(bag_nonspam.values.shape))/ (bag_non_spam_totalwords + len(df.columns)-1)
    return p_spam.drop('spam').values, p_not_spam.drop('spam').values, pspam, pnonspam


def test_laplace(test_data, test_labels, p_x_spam, p_x_non_spam, p_tot_spam, p_tot_non_spam):
    """
    returns error rate, predicted labels
    """
    p_m_spam = ([np.prod(np.where(td >= 1, p_x_spam ** td, 1)) for td in test_data])
    p_m_non_spam = ([np.prod(np.where(td >= 1, p_x_non_spam ** td, 1)) for td in test_data])
    p_m_spam = np.log(np.array(p_m_spam))
    p_m_non_spam = np.log(np.array(p_m_non_spam))
    p_m_and_spam = p_m_spam * p_tot_spam
    p_m_and_non_spam = p_m_non_spam * p_tot_non_spam
    
    correct =  sum((p_m_and_spam > p_m_and_non_spam) == test_labels)
    return (len(test_labels) - correct) / len(test_labels), (p_m_and_spam > p_m_and_non_spam)


In [11]:
comp = test_laplace(data['testMatrixEx1'], 
     data['testLabelsEx1'],
     *train_laplace(data['trainMatrixEx1'],
           data['trainLabelsEx1'])
)

act = train_laplace(data['trainMatrixEx1'],
           data['trainLabelsEx1'])


act, comp

((array([0.2, 0.2, 0.3, 0.1, 0.2]),
  array([0.25      , 0.16666667, 0.16666667, 0.16666667, 0.25      ]),
  0.75,
  0.25),
 (0.5, array([False, False, False, False, False, False])))

In [12]:
comp = test_laplace(data['testMatrixEx2'], 
     data['testLabelsEx2'],
     *train_laplace(data['trainMatrixEx2'],
           data['trainLabelsEx2'])
)

act = train_laplace(data['trainMatrixEx2'],
           data['trainLabelsEx2'])


act, comp

((array([0.52, 0.12, 0.08, 0.16, 0.12]),
  array([0.2 , 0.12, 0.32, 0.12, 0.24]),
  0.375,
  0.625),
 (0.8333333333333334, array([ True,  True,  True,  True,  True,  True])))

In [13]:
comp = test_laplace(data['testMatrixEx3'], 
     data['testLabelsEx3'],
     *train_laplace(data['trainMatrixEx3'],
           data['trainLabelsEx3'])
)

act = train_laplace(data['trainMatrixEx3'],
           data['trainLabelsEx3'])


comp

  p_m_spam = np.log(np.array(p_m_spam))
  p_m_non_spam = np.log(np.array(p_m_non_spam))


(0.7,
 array([ True,  True,  True,  True,  True, False,  True,  True, False,
         True]))

In [14]:
np.log(np.arange(0.01, 0.9, 0.01))

array([-4.60517019, -3.91202301, -3.5065579 , -3.21887582, -2.99573227,
       -2.81341072, -2.65926004, -2.52572864, -2.40794561, -2.30258509,
       -2.20727491, -2.12026354, -2.04022083, -1.96611286, -1.89711998,
       -1.83258146, -1.77195684, -1.71479843, -1.66073121, -1.60943791,
       -1.56064775, -1.51412773, -1.46967597, -1.42711636, -1.38629436,
       -1.34707365, -1.30933332, -1.27296568, -1.23787436, -1.2039728 ,
       -1.17118298, -1.13943428, -1.10866262, -1.07880966, -1.04982212,
       -1.02165125, -0.99425227, -0.96758403, -0.94160854, -0.91629073,
       -0.89159812, -0.86750057, -0.84397007, -0.82098055, -0.7985077 ,
       -0.77652879, -0.75502258, -0.73396918, -0.71334989, -0.69314718,
       -0.67334455, -0.65392647, -0.63487827, -0.61618614, -0.597837  ,
       -0.5798185 , -0.56211892, -0.54472718, -0.52763274, -0.51082562,
       -0.49429632, -0.4780358 , -0.46203546, -0.4462871 , -0.43078292,
       -0.41551544, -0.40047757, -0.38566248, -0.37106368, -0.35