# Solución Filtrado de mensajes spam

### Por Luis Miguel Morales Álvarez

In [58]:
import zipfile
import email
import re
import html2text
import base64
from pattern.en import conjugate, INFINITIVE

ignored_words = ['a', 'you', 'my', 'me', 'is', 'the', 'to', 'and', 'or', 'not', 'an', 'from', 'by', 'in', 'of', 'at', 'for', 'any', 'null', 'with', 'are', 'this', 'if', 'td', 'tr', 'us', 'all', 'am', 'pm', 'because', 'been', 'we', 'very', 'br', 'div', 'it', 'don', 'doesn', 'do', 'on', 'off', 'ye', 'php', 'www', 'al']

def extract_words(message):
    words = re.sub("[^\w]", " ", message).split()
    return [conjugate(w.lower(), tense=INFINITIVE) for w in words if (w.lower() not in ignored_words) and w.isalpha() and len(w) > 1]

def read_multipart(part):
    body = ''
    if part.is_multipart():
        for sub in part.get_payload():
            subbody, encoding = read_multipart(sub)
            body += subbody + '\n'
    else:
        content_type = part.get_content_type()
        if not content_type.startswith('text/'):
            return '', None
        body = part.get_payload()
        encoding = part['Content-Transfer-Encoding']
    return body, encoding

def try_decode(message, encoding):
    if encoding == 'base64':
        return base64.b64decode(re.sub(r'\s+', '', message)).decode("utf-8")
    return message

h = html2text.HTML2Text()
def process_content(file):
    mail = email.message_from_bytes(file.read())
    contentType = mail.get_content_type()  
    message, encoding = read_multipart(mail)
    if contentType == 'text/html':
        message = h.handle(try_decode(message, encoding))
    else:
        message = try_decode(message, encoding)
    return message

def create_bag():
    return {
        'easy_ham': {},
        'easy_ham_2': {},
        'hard_ham': {},
        'hard_ham_2': {},
        'spam': {},
        'spam_2': {}
    }

filecount = 0
discarded = 0
bags = create_bag()
zip = zipfile.ZipFile('datos/spam-sms.zip')
for filename in zip.namelist():
    if filename.endswith('/') or filename.endswith('/cmds') or filename.endswith('.DS_Store'):
        continue

    #print(filename)
    try:
        folder = filename.split('/')[1]
        file = zip.open(filename)
        message = process_content(file)
        words = extract_words(message)        
        bags[folder][filename] = words
        
    except:
        discarded += 1
        continue
    else:
        filecount += 1

print("{0} files processed".format(filecount))
print("{0} files discarded".format(discarded))
print(bags)

100 files processed
1 files discarded
{'easy_ham': {}, 'easy_ham_2': {}, 'hard_ham': {}, 'hard_ham_2': {}, 'spam': {'spam-filter/spam/00249.5f45607c1bffe89f60ba1ec9f878039a': ['dear', 'homeowner', 'interest', 'rate', 'their', 'lowest', 'point', 'year', 'help', 'find', 'best', 'rate', 'your', 'situation', 'match', 'your', 'need', 'hundred', 'lender', 'home', 'improvement', 'refinance', 'second', 'mortgage', 'home', 'equity', 'loan', 'more', 'even', 'les', 'than', 'perfect', 'credit', 'service', 'free', 'home', 'owner', 'new', 'home', 'buyer', 'without', 'obligation', 'just', 'fill', 'out', 'quick', 'simple', 'form', 'jump', 'start', 'your', 'future', 'plan', 'today', 'visit', 'http', 'index', 'asp', 'afft', 'unsubscribe', 'please', 'visit', 'http', 'light', 'watch', 'asp'], 'spam-filter/spam/00373.ebe8670ac56b04125c25100a36ab0510': ['attention', 'must', 'computer', 'user', 'new', 'special', 'package', 'deal', 'norton', 'systemwork', 'software', 'suite', 'professional', 'edition', 'inclu

In [59]:
def count_words(word_list):
    hist = {}
    for word in word_list:
        hist[word] = hist[word] + 1 if word in hist else 1
    return hist

bag_hist = create_bag()
for (key, messages) in bags.items():
    for (message, words) in messages.items():
        bag_hist[key][message] = count_words(words)
        
print(bag_hist)
        

{'easy_ham': {}, 'easy_ham_2': {}, 'hard_ham': {}, 'hard_ham_2': {}, 'spam': {'spam-filter/spam/00249.5f45607c1bffe89f60ba1ec9f878039a': {'dear': 1, 'homeowner': 1, 'interest': 1, 'rate': 2, 'their': 1, 'lowest': 1, 'point': 1, 'year': 1, 'help': 1, 'find': 1, 'best': 1, 'your': 3, 'situation': 1, 'match': 1, 'need': 1, 'hundred': 1, 'lender': 1, 'home': 4, 'improvement': 1, 'refinance': 1, 'second': 1, 'mortgage': 1, 'equity': 1, 'loan': 1, 'more': 1, 'even': 1, 'les': 1, 'than': 1, 'perfect': 1, 'credit': 1, 'service': 1, 'free': 1, 'owner': 1, 'new': 1, 'buyer': 1, 'without': 1, 'obligation': 1, 'just': 1, 'fill': 1, 'out': 1, 'quick': 1, 'simple': 1, 'form': 1, 'jump': 1, 'start': 1, 'future': 1, 'plan': 1, 'today': 1, 'visit': 2, 'http': 2, 'index': 1, 'asp': 2, 'afft': 1, 'unsubscribe': 1, 'please': 1, 'light': 1, 'watch': 1}, 'spam-filter/spam/00373.ebe8670ac56b04125c25100a36ab0510': {'attention': 1, 'must': 1, 'computer': 3, 'user': 1, 'new': 1, 'special': 2, 'package': 1, 'dea

In [60]:
def cut_count(hist, threshold):
    cut = {}
    for (word, count) in hist.items():
        if count > threshold:
            cut[word] = count
    return cut

trimmed_bag = create_bag()
for (key, messages) in bag_hist.items():
    for (message, hist) in messages.items():
        trimmed_bag[key][message] = cut_count(hist, 2)

print(trimmed_bag)

{'easy_ham': {}, 'easy_ham_2': {}, 'hard_ham': {}, 'hard_ham_2': {}, 'spam': {'spam-filter/spam/00249.5f45607c1bffe89f60ba1ec9f878039a': {'your': 3, 'home': 4}, 'spam-filter/spam/00373.ebe8670ac56b04125c25100a36ab0510': {'computer': 3, 'your': 9}, 'spam-filter/spam/00214.1367039e50dc6b7adb0f2aa8aba83216': {'dare': 4, 'find': 3, 'better': 3, 'annuity': 3, 'year': 4, 'up': 4, 'more': 4, 'information': 4, 'form': 4, 'name': 11, 'mail': 4, 'phone': 3, 'city': 3, 'state': 3, 'pmg': 11, 'deposit': 4, 'receive': 4, 'professional': 4, 'http': 17, 'insuranceiq': 4, 'com': 7, 'optout': 4, 'legal': 4, 'html': 3, 'type': 10, 'text': 7, 'bgcolor': 5, 'font': 24, 'face': 3, 'arial': 3, 'helvetica': 3, 'san': 3, 'serif': 3, 'table': 8, 'width': 22, 'border': 4, 'align': 12, 'center': 8, 'cellpad': 4, 'cellspace': 4, 'ffffff': 3, 'img': 8, 'src': 8, 'iiq': 9, 'image': 9, 'gif': 7, 'alt': 5, 'height': 8, 'size': 16, 'colspan': 5, 'right': 6, 'input': 9, 'value': 4, 'hide': 3}, 'spam-filter/spam/00210.0

In [61]:
import pandas as pd 

matrices = {}
for (key, messages) in trimmed_bag.items():
    matrices[key] = pd.DataFrame(messages)
    matrices[key].fillna(0, inplace=True)
    
print(matrices)
        

{'easy_ham': Empty DataFrame
Columns: []
Index: [], 'easy_ham_2': Empty DataFrame
Columns: []
Index: [], 'hard_ham': Empty DataFrame
Columns: []
Index: [], 'hard_ham_2': Empty DataFrame
Columns: []
Index: [], 'spam':                 spam-filter/spam/00249.5f45607c1bffe89f60ba1ec9f878039a  \
a                                                             0.0         
abidjan                                                       0.0         
abonnement                                                    0.0         
about                                                         0.0         
above                                                         0.0         
accept                                                        0.0         
access                                                        0.0         
account                                                       0.0         
acre                                                          0.0         
act                              

In [63]:
import numpy as np

for (key, matrix) in matrices.items():
    matrix.transform(lambda x: 1 if x > 0 else 0)
    
print(matrices)

ValueError: ('The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().', 'occurred at index spam-filter/spam/00249.5f45607c1bffe89f60ba1ec9f878039a')