In [213]:
!pip install mlcroissant
!pip install pandas



In [263]:
import mlcroissant as mlc
import pandas as pd
import string
from collections import OrderedDict
from sklearn.model_selection import train_test_split

In [274]:

# Fetch the Croissant JSON-LD
croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset/croissant/download')

# Check what record sets are in the dataset
record_sets = croissant_dataset.metadata.record_sets
print(record_sets)

# Fetch the records and put them in a DataFrame
df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
df = df.rename(columns = {
    'emails.csv/text': 'body',
    'emails.csv/spam': 'is_spam'
})
df['body'] = df['body'].apply(lambda x: x.decode())
df['is_spam'] = df['is_spam'].apply(lambda x: False if x == 0 else True)
df

  -  [Metadata(Spam email Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


[RecordSet(uuid="emails.csv")]


Unnamed: 0,body,is_spam
0,Subject: naturally irresistible your corporate...,True
1,Subject: the stock trading gunslinger fanny i...,True
2,Subject: unbelievable new homes made easy im ...,True
3,Subject: 4 color printing special request add...,True
4,"Subject: do not have money , get software cds ...",True
...,...,...
5723,Subject: re : research and development charges...,False
5724,"Subject: re : receipts from visit jim , than...",False
5725,Subject: re : enron case study update wow ! a...,False
5726,"Subject: re : interest david , please , call...",False


In [276]:
ham_count = df[df.is_spam == False].size
spam_count = df[df.is_spam == True].size

print(ham_count, spam_count)

8720 2736


In [277]:
def clean_text(t):
    t = str.lower(t[8:])

    # filter non ascii characters
    t = ''.join(filter(lambda x: x in string.ascii_letters + ' ', t))

    # remove small words
    t = ' '.join(filter(lambda x: len(x) > 2, t.split()))    
    return t

def get_word_set(X):
    unique_words = {w for l in X for w in l.split()} 
    return unique_words

def count_words(text, known_words):
    t = clean_text(text)
    return set(t.split())

def count_words_list(texts, known_words):
    text_words = [w for row in texts for w in row.split()]
    return count_words(' '.join(text_words), known_words)

In [278]:
df['clean_body'] = df.body.apply(clean_text)
df.clean_body

0       naturally irresistible your corporate identity...
1       the stock trading gunslinger fanny merrill but...
2       unbelievable new homes made easy wanting show ...
3       color printing special request additional info...
4       not have money get software cds from here soft...
                              ...                        
5723    research and development charges gpg here forw...
5724    receipts from visit jim thanks again for the i...
5725    enron case study update wow all the same day t...
5726    interest david please call shirley crenshaw as...
5727    news aurora update aurora version the fastest ...
Name: clean_body, Length: 5728, dtype: object

In [279]:
df.iloc[-1]

body          Subject: news : aurora 5 . 2 update  aurora ve...
is_spam                                                   False
clean_body    news aurora update aurora version the fastest ...
Name: 5727, dtype: object

In [280]:
class BayesFilter():
    def __init__(self, c_threshold = 100):
        self.c_threshold = c_threshold
        
    def train(self, train_data):
        known_words = get_word_set(train_data.clean_body.to_list())

        ham_data = train_data[train_data.is_spam == False]
        X_ham = ham_data.clean_body

        spam_data = train_data[train_data.is_spam == True]
        X_spam = spam_data.clean_body

        # print(ham_data)

        
        self.ham_wbag = count_words_list(X_ham, known_words)
        self.spam_wbag = count_words_list(X_spam, known_words)
        
        self.prob_spam = spam_data.size / (spam_data.size + ham_data.size)
        
    def predict(self, body):
        body = clean_text(body)
        body_wordset = get_word_set([body])
        
        px_ham = len(set.intersection(body_wordset, self.ham_wbag)) / len(self.ham_wbag)
        px_spam = len(set.intersection(body_wordset, self.spam_wbag))  / len(self.spam_wbag)

        if px_ham == 0:
            px_ham = 0.001
        
        l1 = px_spam / px_ham
        l2 = self.prob_spam / (1 - self.prob_spam)      
        return l1 * l2 > self.c_threshold

    def __str__(self):
        return f'BayesFilter(c_threshold: {self.c_threshold}, prob_spam: {self.prob_spam})'
    

In [322]:
train_data, test_data = train_test_split(df, test_size = 0.3)

f = BayesFilter(0.42)
f.train(train_data)

str(f)

'BayesFilter(c_threshold: 0.42, prob_spam: 0.23621850835619856)'

In [323]:
test_data['predict'] = test_data.clean_body.apply(f.predict)

test_successes = test_data[test_data.predict == test_data.is_spam]
test_errors = test_data[test_data.predict != test_data.is_spam]

accuracy = len(test_successes) / len(test_data)
false_positives = test_errors[test_errors.is_spam == True]
false_negatives = test_errors[test_errors.is_spam == False]

print(f'test_size: {len(test_data)}')
print(f'correctly predicted: {len(test_successes)}')
print()
print(f'accuracy: {accuracy}')
print(f'false_positives: {len(false_positives)}')
print(f'false_negatives: {len(false_negatives)}')
print()

test_data[['body', 'is_spam', 'predict']]

test_size: 1719
correctly predicted: 1529

accuracy: 0.8894706224549156
false_positives: 2
false_negatives: 188



Unnamed: 0,body,is_spam,predict
4789,Subject: correction : interim report to gary h...,False,False
2560,Subject: from the enron india newsdesk - may 5...,False,False
4934,Subject: thanks hi keith ! thanks so much fo...,False,True
4042,"Subject: re : spring 2001 schematic kathy , ...",False,False
4139,"Subject: credit model vince and stinson , we...",False,False
...,...,...,...
2194,Subject: re : extreme value theory applied to ...,False,False
106,Subject: free 1 week dvd downloads we are hap...,True,True
111,Subject: best prescription generic meds 4 less...,True,True
2170,Subject: re : mission impossible - hr associat...,False,False


In [332]:
f.spam_wbag - f.ham_wbag

{'summertime',
 'bohsg',
 'rafn',
 'cadastrado',
 'knoweledge',
 'mausoleum',
 'maubev',
 'secondcategories',
 'mort',
 'biochemistry',
 'harmonline',
 'rechargeable',
 'widest',
 'christendom',
 'superlative',
 'sconce',
 'reposio',
 'goldschen',
 'completo',
 'lotsonet',
 'alterman',
 'allmerica',
 'staccato',
 'tribe',
 'ochoa',
 'slaughtered',
 'prop',
 'diapers',
 'hcdc',
 'seiected',
 'horowitz',
 'controleer',
 'abnormality',
 'decidedly',
 'porous',
 'quadrat',
 'dastard',
 'mentally',
 'beep',
 'shouid',
 'unsatisfied',
 'eurekahedge',
 'letzte',
 'engages',
 'xana',
 'alabi',
 'limitation',
 'patented',
 'gastronomist',
 'emile',
 'nttpc',
 'mainoemstore',
 'olce',
 'patriotic',
 'raghallaigh',
 'motorised',
 'operateout',
 'vaiuabie',
 'grumbled',
 'whoily',
 'trauma',
 'confiden',
 'impartial',
 'somatroph',
 'alpan',
 'spinnin',
 'techncial',
 'compliant',
 'cultivable',
 'mighty',
 'phiri',
 'urinary',
 'caption',
 'priceless',
 'automaticaiiy',
 'athlete',
 'geography',
