In [1]:
# Directory holding the input files (users.json, messages.json) read by the loaders below.
DATA_DIR = "./data"

In [2]:
class Document(object):
    """One message record, exposing selected fields of ``data`` as attributes.

    ``data`` must support ``data[key]`` for the keys ``documentId``, ``time``,
    ``sender``, ``sender_id`` and ``message``; the value stored under
    ``message`` is exposed as ``body``. A missing key propagates the lookup
    error raised by ``data``.
    """

    def __init__(self, data):
        # (attribute name, key in ``data``), listed in the order they are read.
        field_map = (
            ('documentId', 'documentId'),
            ('time', 'time'),
            ('sender', 'sender'),
            ('sender_id', 'sender_id'),
            ('body', 'message'),
        )
        for attribute, key in field_map:
            setattr(self, attribute, data[key])

In [3]:
import os
import email
import json
from collections import defaultdict

def clean_body(body):
    """Return ``body`` cut off at the first separator marker, then stripped.

    The markers are tried one after another, each against whatever is left
    after the previous cut; everything from a marker's first occurrence
    onward is discarded. Leading and trailing whitespace is removed from the
    result.
    """
    markers = (
        "-----Original Message-----",
        "***************************",
        "----------------------",
    )
    for marker in markers:
        # partition() hands back the whole string as the head when the marker is absent.
        body, _, _ = body.partition(marker)
    return body.strip()


# Module-level stores filled in by load_users() and load_corpus().
# users: the users.json mapping stored inverted (file value -> file key).
users = {}
# corpus: message "sender" field -> list of cleaned, non-empty message bodies.
corpus = defaultdict(list)

def load_users():
    """Fill the module-level ``users`` dict from ``users.json`` in ``DATA_DIR``.

    The file is parsed as a JSON object and stored inverted: every value in
    the file becomes a key of ``users`` and maps back to its original key.
    """
    users_path = os.path.join(DATA_DIR, "users.json")
    with open(users_path) as handle:
        raw_mapping = json.load(handle)
    for original_key, original_value in raw_mapping.items():
        users[original_value] = original_key

def load_corpus():
    """Fill the module-level ``corpus`` from ``messages.json`` in ``DATA_DIR``.

    The file is parsed as JSON and iterated; each entry's ``"message"`` is
    passed through ``clean_body`` and, when something is left, appended to
    the list stored under the entry's ``"sender"``. Entries whose cleaned
    body is empty are skipped.

    Any previous contents of ``corpus`` are replaced, so calling this again
    (for example by re-running the cell) does not append every message a
    second time and inflate the word counts derived from it.
    """
    with open(os.path.join(DATA_DIR, "messages.json")) as f:
        messages = json.load(f)

    # Build into a scratch mapping first so a failure part-way through
    # leaves the existing corpus untouched.
    bodies_by_sender = defaultdict(list)
    for msg in messages:
        body = clean_body(msg["message"])
        if body:
            bodies_by_sender[msg["sender"]].append(body)

    # Mutate the shared object in place (rather than rebinding the name) so
    # every existing reference to ``corpus`` sees the new contents.
    corpus.clear()
    corpus.update(bodies_by_sender)

In [4]:
# Populate the module-level `users` and `corpus` from the files in DATA_DIR.
load_users()
load_corpus()

In [5]:
import re

def sanitize(text):
    """Lower-case ``text`` and replace every run of characters other than
    ``a``-``z`` with a single space."""
    lowered = text.lower()
    non_letter_run = re.compile('[^a-z]+')
    return non_letter_run.sub(' ', lowered)

In [6]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    tokens = text.split()
    return tokens

In [7]:
from collections import defaultdict, Counter

def build_inverted_index(corpus):
    """Map each word to the senders who used it and how often.

    ``corpus`` maps a sender to a list of message bodies. All bodies of a
    sender are joined with spaces, sanitized and tokenized, and the
    resulting word counts are stored as ``result[word][sender] = count``.
    The result is a ``defaultdict(dict)``.
    """
    postings = defaultdict(dict)
    for author, messages in corpus.items():
        joined = " ".join(messages)
        frequencies = Counter(tokenize(sanitize(joined)))
        for term in frequencies:
            postings[term][author] = frequencies[term]
    return postings


def build_sender_word_count(corpus):
    """Map each sender to the words they used and how often.

    ``corpus`` maps a sender to a list of message bodies. All bodies of a
    sender are joined with spaces, sanitized and tokenized, and the
    resulting word counts are stored as ``result[sender][word] = count``.
    A sender whose text yields no tokens gets no entry. The result is a
    ``defaultdict(dict)``.
    """
    counts_by_sender = defaultdict(dict)
    for author, messages in corpus.items():
        frequencies = Counter(tokenize(sanitize(" ".join(messages))))
        if frequencies:
            # Copy into a plain dict; senders without tokens are left out.
            counts_by_sender[author] = dict(frequencies)
    return counts_by_sender

In [8]:
# Build both views of the corpus once:
#   inverted_index[word][sender]    -> count
#   sender_word_count[sender][word] -> the same count, keyed the other way round
inverted_index = build_inverted_index(corpus)
sender_word_count = build_sender_word_count(corpus)

### Term Frequency
The number of times a user (sender) uses the term, counted across all of that user's messages.

### Inverse Document Frequency
$\mathrm{idf}(w) = \ln\left(\dfrac{\text{number of users}}{\text{number of users who use the word } w}\right)$

In [27]:
# Common English function words that are skipped when scoring. The entries
# "s", "t" and "don" match the fragments sanitize() leaves behind when it
# replaces apostrophes with spaces (e.g. "don't" -> "don t").
stopwords = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])

In [12]:
# sender_vocab[sender] -> set of every word that sender has used, obtained by
# flipping inverted_index (word -> {sender: count}) around.
sender_vocab = defaultdict(set)
for word, sender_count in inverted_index.items():
    senders = sender_count.keys()  # every sender who used `word`
    for sender in senders:
        sender_vocab[sender].add(word)

In [75]:
import math

def tfidf_v1(word, sender):
    """Return the TF-IDF weight of ``word`` for ``sender``.

    * tf  -- ``sender_word_count[sender][word]``, the raw count.
    * idf -- natural log of ``len(users)`` divided by the number of senders
      listed for ``word`` in ``inverted_index``.

    Returns ``0`` when ``sender`` is unknown, when ``sender`` never used
    ``word``, or when ``word`` has no senders in ``inverted_index``.

    Lookups go through ``.get`` rather than ``[...]``: both indexes are
    ``defaultdict`` instances, so subscripting with an unknown key would
    silently insert an empty entry and grow the index on every miss.
    """
    tf = sender_word_count.get(sender, {}).get(word)
    if tf is None:
        return 0

    senders_using_word = len(inverted_index.get(word, ()))
    if senders_using_word == 0:
        # Nothing to divide by; treat a word missing from the index as
        # carrying no weight instead of raising ZeroDivisionError.
        return 0

    # math.log is the natural logarithm when no base is given.
    return tf * math.log(len(users) / senders_using_word)

In [58]:
import math

def get_informative_words_sender(sender, top_n=10):
    """Return the highest-scoring words of ``sender`` as ``(word, score)`` pairs.

    Every non-stopword in ``sender_vocab[sender]`` is scored with
    ``tfidf_v1`` and the pairs are returned in descending score order.

    Args:
        sender: key identifying the sender in ``sender_vocab``.
        top_n: how many pairs to return (default 10, the previously
            hard-coded value); pass ``None`` to get all of them.

    Returns:
        A list of at most ``top_n`` ``(word, tfidf)`` tuples; empty when the
        sender is unknown.
    """
    # .get avoids adding an empty set to the defaultdict for unknown senders.
    scored = [
        (word, tfidf_v1(word, sender))
        for word in sender_vocab.get(sender, ())
        if word not in stopwords
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]

In [43]:
# Example: the highest-TF-IDF words for the sender keyed 123.
get_informative_words_sender(123)

[('amazon', 3422.2071624364876),
 ('image', 2535.542984866394),
 ('obidos', 1380.1306201893176),
 ('mk', 1294.2466743249338),
 ('cust', 1239.8002185491016),
 ('hol', 1211.7536688134012),
 ('com', 1149.944468059166),
 ('ref', 1081.2836962747208),
 ('exec', 1053.2670613620144),
 ('preferences', 952.1873154123343)]

In [52]:
import random

def get_random_email():
    """Pick a sender at random, then one of that sender's messages at random.

    Returns a ``(body, sender)`` tuple. Senders are drawn uniformly, so a
    message from a sender with few messages is as likely to come up as one
    from a prolific sender.
    """
    candidates = list(corpus.keys())
    sender = random.choice(candidates)
    body = random.choice(corpus[sender])
    return body, sender

In [53]:
# Draw one (body, sender) pair from the corpus.
get_random_email()

("Make $2000-$6000 Per Week From Home In The Next 90 Days Are You Looking To Make More Money? If you are currently looking to make more money or to add to your income or currently unemployed and have between 2 - 8 hrs per day. We have something for you that will change your current financial situation. It is absolutely 100% FREE, Don't miss out on a real opportunity! CLICK HERE FOR MORE INFORMATION",
 5779)

In [110]:
def who_wrote_it_v1(text):
    """Rank candidate senders for ``text``, best match first.

    ``text`` is sanitized and tokenized, stopwords are dropped, and every
    sender listed in ``inverted_index`` for at least one remaining token
    becomes a candidate. A candidate's score is the largest ``tfidf_v1``
    weight among the tokens it shares with ``text``; candidates are returned
    in descending score order.

    Returns:
        A list of sender keys; empty when no token of ``text`` is present in
        the index.

    Unknown tokens are looked up with ``.get`` so that querying does not add
    empty entries to ``inverted_index`` (it is a ``defaultdict``).
    """
    best_score = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # distinct token is scored once and the order of tied senders is stable.
    for token in dict.fromkeys(tokenize(sanitize(text))):
        if token in stopwords:
            continue
        for sender in inverted_index.get(token, ()):
            score = tfidf_v1(token, sender)
            # Only the maximum per sender is needed, so keep a running best.
            if sender not in best_score or score > best_score[sender]:
                best_score[sender] = score
    return sorted(best_score, key=best_score.get, reverse=True)

In [131]:
# Spot check on one random message from the corpus. Note that this only tests
# whether the true sender appears anywhere in the ranking, not how high.
text, actual_sender = get_random_email()
who = who_wrote_it_v1(text)
print(actual_sender, actual_sender in who)

38343 False
