**TOKENIZATION** 

The first step of our journey is to TOKENIZE our emails.

In [5]:
import csv
import re

Drop every character that is not an ASCII letter or digit, and return every remaining word in the document in lowercase.

In [6]:
def tokenize_email(text):
    """Split an e-mail body into lowercase alphanumeric tokens.

    Every maximal run of ASCII letters and digits in ``text`` becomes one
    token; all other characters (punctuation, whitespace, non-ASCII
    characters) act as separators and are discarded.

    Args:
        text: The raw e-mail text as a string.

    Returns:
        A list of lowercased tokens in their order of appearance; an empty
        list when ``text`` contains no ASCII letters or digits.
    """
    tokens = []
    for match in re.finditer(r'[a-zA-Z0-9]+', text):
        # Lowercase after matching so only the matched ASCII run is folded.
        tokens.append(match.group(0).lower())
    return tokens


Processing the .CSV input file.

In [7]:
input_path = './data/emails.csv'
output_path = './data/emails_tokenized.csv'

# Convert the raw e-mail CSV into a tokenized CSV with the columns
# ['tokens', 'spam'].
#
# Both files are opened with newline='' as the csv module documentation
# requires: without it on the reading side, newlines embedded inside quoted
# fields (multi-line e-mail bodies) may not be interpreted correctly, and on
# the writing side an extra '\r' can be emitted on platforms that use '\r\n'.
with open(input_path, 'r', encoding='utf-8', newline='') as infile, \
     open(output_path, 'w', encoding='utf-8', newline='') as outfile:

    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    writer.writerow(['tokens', 'spam'])

    # Skip the first input row (presumably the header -- verify against the
    # data file). The None default avoids StopIteration on an empty file.
    next(reader, None)

    for row in reader:
        # Rows without both a text and a label column are dropped silently.
        if len(row) < 2:
            continue

        text, label = row[0], row[1]
        tokens = tokenize_email(text)

        # csv.writer stringifies the list with str(), so the 'tokens' cell
        # holds the Python list literal, e.g. "['hello', 'world']"; the
        # vectorization step parses that text back into a list.
        writer.writerow([tokens, label])

print("All done and saved in ",output_path)

All done and saved in  ./data/emails_tokenized.csv


**VECTORIZATION**



In [10]:
import ast

import numpy as np
import pandas as pd
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import CountVectorizer

Reading the tokenized data and converting each stored token string back into a Python **LIST**

In [11]:

# Load the tokenized e-mails produced by the tokenization step.
tokenized_path = './data/emails_tokenized.csv'
df = pd.read_csv(tokenized_path)

# Each 'tokens' cell is the text of a Python list literal
# (e.g. "['hello', 'world']"). ast.literal_eval parses literals only, so --
# unlike eval -- a tampered CSV cell cannot execute arbitrary code; it raises
# an error for anything that is not a plain literal.
df['tokens'] = df['tokens'].apply(ast.literal_eval)

Vectorizing the features in two ways:

**#1** Count Vectorization

In [None]:
# Count vectorization: each cell of X_count is the number of times a term
# occurs in a document. The documents in df['tokens'] are already token
# lists, so tokenizing and preprocessing are replaced by identity functions.
count_vectorizer = CountVectorizer(
    tokenizer=lambda x: x,  # use the pre-processed tokens as they are
    preprocessor=lambda x: x,  # no additional preprocessing
    token_pattern=None,  # unused with a custom tokenizer; None avoids the unused-parameter warning
    binary=False,  # count mode: keep raw term counts
    min_df=2  # ignore terms that appear in fewer than 2 documents
)
X_count = count_vectorizer.fit_transform(df['tokens'])

**#2** Binary Vectorization

In [13]:
# Binary vectorization: same setup as a count vectorizer, but every non-zero
# count is clipped to 1, so X_binary records only term presence/absence.
binary_vectorizer = CountVectorizer(
    tokenizer=lambda x: x,  # documents are already token lists; use them as-is
    preprocessor=lambda x: x,  # no additional preprocessing
    token_pattern=None,  # unused with a custom tokenizer; None avoids the unused-parameter warning
    binary=True,  # presence/absence instead of raw counts
    min_df=2  # ignore terms that appear in fewer than 2 documents
)
X_binary = binary_vectorizer.fit_transform(df['tokens'])

Saving the **results** 

In [14]:
# Persist both sparse feature matrices in SciPy's .npz format.
for npz_path, matrix in (
    ('./data/features_count.npz', X_count),
    ('./data/features_binary.npz', X_binary),
):
    save_npz(npz_path, matrix)

# Labels come from the same DataFrame the matrices were built from, so the
# saved array is row-aligned with them.
labels = df['spam'].values
np.save('./data/labels.npy', labels)

# Vocabulary (column names of the feature matrices), taken from the count
# vectorizer and written one term per row.
vocabulary = count_vectorizer.get_feature_names_out()
vocabulary_series = pd.Series(vocabulary)
vocabulary_series.to_csv('./data/vocabulary.csv', index=False)

In [32]:
# Report the vocabulary size and the (documents, features) shape of each
# feature matrix, one line per item.
print(
    f"All features : {len(vocabulary)}",
    f"Count : {X_count.shape}",
    f"Binarry : {X_binary.shape}",
    sep="\n",
)

All features : 20303
Count : (5728, 20303)
Binarry : (5728, 20303)


In [30]:
from scipy.sparse import csr_matrix
import numpy as np

# Densify only the top-left corner (first 5 rows, first 10 columns) of each
# sparse matrix so the preview stays small.
small_count = X_count[:5, :10].toarray()
small_binary = X_binary[:5, :10].toarray()

# Each print emits the caption and the array on separate lines.
print("Count Matrix Sample:", small_count, sep="\n")
print("\nBinary Matrix Sample:", small_binary, sep="\n")

Count Matrix Sample:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]

Binary Matrix Sample:
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
