In [3]:
!pip install mlcroissant
!pip install pandas

Collecting mlcroissant
  Downloading mlcroissant-1.0.9-py2.py3-none-any.whl.metadata (10 kB)
Collecting absl-py (from mlcroissant)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting etils>=1.7.0 (from etils[epath]>=1.7.0->mlcroissant)
  Downloading etils-1.9.4-py3-none-any.whl.metadata (6.4 kB)
Collecting jsonpath-rw (from mlcroissant)
  Downloading jsonpath-rw-1.4.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pandas-stubs (from mlcroissant)
  Downloading pandas_stubs-2.2.3.241009-py3-none-any.whl.metadata (10.0 kB)
Collecting rdflib (from mlcroissant)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting ply (from jsonpath-rw->mlcroissant)
  Downloading ply-3.11-py2.py3-none-any.whl.metadata (844 bytes)
Collecting types-pytz>=2022.1.1 (from pandas-stubs->mlcroissant)
  Downloading types_pytz-2024.2.0.20241003-py3-none-any.whl.metadata (1.6 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib->mlcroissant)
  Downloading

In [87]:
import mlcroissant as mlc
import pandas as pd
import string
from collections import OrderedDict


In [49]:

# Fetch the Croissant JSON-LD
croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset/croissant/download')

# Check what record sets are in the dataset
record_sets = croissant_dataset.metadata.record_sets
print(record_sets)

# Fetch the records and put them in a DataFrame
df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
df = df.rename(columns = {
    'emails.csv/text': 'body',
    'emails.csv/spam': 'is_spam'
})
df['body'] = df['body'].apply(lambda x: x.decode())
df

  -  [Metadata(Spam email Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


[RecordSet(uuid="emails.csv")]


Unnamed: 0,body,is_spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [54]:
ham_count = df[df.is_spam == 0].size
spam_count = df[df.is_spam == 1].size

print(ham_count, spam_count)

8720 2736


In [187]:
def clean_text(t):
    t = str.lower(t[8:])

    # filter non ascii characters
    t = ''.join(filter(lambda x: x in string.ascii_letters + ' ', t))

    # remove small words
    t = ' '.join(filter(lambda x: len(x) > 2, t.split()))    
    return t

def get_word_set(X):
    unique_words = {w for l in X for w in l.split()} 
    return unique_words

def count_words(text, known_words):
    t = clean_text(text)
    bag_of_words = {}
    for w in t.split():
        if not w in known_words:
            continue

        if w not in bag_of_words:
            bag_of_words[w] = 0
            
        bag_of_words[w] += 1
    return bag_of_words

def count_words_list(texts, known_words):
    text_words = [w for row in texts for w in row.split()]
    return count_words(' '.join(text_words), known_words)

In [188]:
df['clean_body'] = df.body.apply(clean_text)
df.clean_body

0       naturally irresistible your corporate identity...
1       the stock trading gunslinger fanny merrill but...
2       unbelievable new homes made easy wanting show ...
3       color printing special request additional info...
4       not have money get software cds from here soft...
                              ...                        
5723    research and development charges gpg here forw...
5724    receipts from visit jim thanks again for the i...
5725    enron case study update wow all the same day t...
5726    interest david please call shirley crenshaw as...
5727    news aurora update aurora version the fastest ...
Name: clean_body, Length: 5728, dtype: object

In [207]:
df.iloc[-1].body

'Subject: news : aurora 5 . 2 update  aurora version 5 . 2  - the fastest model just got faster -  epis announces the release of aurora , version 5 . 2  aurora the electric market price forecasting tool is already  legendary for power and speed . we \' ve combined a powerful chronological  dispatch model with the capability to simulate the market from 1  day to 25 + years . add to that a risk analysis section , powered by user  selectable monte carlo & / or latin hypercube modeling , enough  portfolio analysis power to please the toughest critic , & inputs and  outputs from standard excel & access tables and you \' ve got one of most  powerful tools in the market .  just a few months ago we expanded our emissions modeling  capabilities , added our quarterly database update , increased the speed  of the entire model , and made  but that wasn \' t enough .  we \' ve done it again . some of the operations that we \' ve  included . . .  two new reporting enhancements .  the first is margin

In [204]:
class BayesFilter():
    def __init__(self, c_threshold = 100):
        self.c_threshold = c_threshold
        
    def train(self, train_data):
        known_words = get_word_set(train_data.clean_body.to_list())

        ham_data = train_data[train_data.is_spam == 0]
        X_ham = ham_data.clean_body

        spam_data = train_data[train_data.is_spam == 1]
        X_spam = spam_data.clean_body

        self.ham_wbag = count_words_list(X_ham, known_words)
        self.spam_wbag = count_words_list(X_spam, known_words)
        
        self.prob_spam = spam_data.size / (spam_data.size + ham_data.size)
        
    def predict(self, body):
        body = clean_text(body)
        body_wordset = get_word_set([body])
        
        px_ham = body_wordset - set(self.ham_wbag.keys())
        print(px_ham)

        px_spam = body_wordset - set(self.spam_wbag.keys())
        print(px_spam)
        
        # l2 = self.
        # l2 = self.prob_spam / (1 - self.prob_ham)
        pass

    def __str__(self):
        return f'BayesFilter(c_threshold: {self.c_threshold}, prob_spam: {self.prob_spam})'
f = BayesFilter()
f.train(df)
f.predict(df.body.iloc[-1])
# train_data = df[df.is_spam == 0]
# df


set()
{'emissions', 'enhances', 'outputs', 'scripting', 'toughest', 'carlo', 'latin', 'emission', 'stack', 'marginal', 'legendary', 'aurora', 'hourly', 'epis', 'exploit', 'selectable', 'desiring', 'wheeler', 'monte', 'simulate', 'hypercube', 'chronological', 'critic'}
