In [1]:
# HTML: https://s3.amazonaws.com/assets.mailcharts.com/emails/3b9666be-1ddc-952f-329e-6e832911892e/index.html
# JSON: https://s3.amazonaws.com/assets.mailcharts.com/~emails:3b9666be-1ddc-952f-329e-6e832911892e.json

In [75]:
from bs4 import BeautifulSoup
from collections import Counter
from collections import defaultdict
from io import BytesIO
import math
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag as word_type
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
import re
import requests
import string

In [76]:
# Google vision
from google.cloud import vision
from google.cloud.vision import types
import os

path="/Users/carl/sites/google.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=path

In [77]:
# Helper functions to prepare category data

def create_full_category(row):
    result = row.category_1.lower()
    
    if row.category_2:
        result = result + " > " + row.category_2.lower()
    if row.category_3:
        result = result + " > " + row.category_3.lower()
    if row.category_4:
        result = result + " > " + row.category_4.lower()
    if row.category_5:
        result = result + " > " + row.category_5.lower()
    if row.category_6:
        result = result + " > " + row.category_6.lower()
    if row.category_7:
        result = result + " > " + row.category_7.lower()

    return result

def get_lookup_value(row):    
    if row.lookup_value:
        return row.lookup_value.lower().split("|")
    if row.category_7:
        return [row.category_7.lower()]
    if row.category_6:
        return [row.category_6.lower()]
    if row.category_5:
        return [row.category_5.lower()]
    if row.category_4:
        return [row.category_4.lower()]
    if row.category_3:
        return [row.category_3.lower()]
    if row.category_2:
        return [row.category_2.lower()]
    if row.category_1:
        return [row.category_1.lower()]

In [78]:
# Create category DF
categories_df = pd.read_csv("./data/taxonomy-carl.csv", encoding="ISO-8859-1", dtype=str)
categories_df.fillna(False, inplace=True)
categories_df["full_category"] = categories_df.apply(lambda x: create_full_category(x), axis=1)
categories_df["lookup_value"] = categories_df.apply(lambda x: get_lookup_value(x), axis=1)

In [79]:
# Email text cleaner

lemmatizer = WordNetLemmatizer()
punctuation = r"\+|\:|\{|\\|\(|\-|\`|\<|\?|\*|\;|\_|\@|\'|\[|\}|\)|\,|\/|\"|\$|\=|\&|\]|\!|\%|\>|\^|\~|\||\.|\#"

def clean_email_text(content):
    if type(content) == float and np.isnan(content): # prevent blowup if nan
        return ""
    content = " ".join(content.split("_")) # prevents blowup if subjects are concatenated with "_"
    content = " ".join(content.split("\n"))
    lowered = str(content).lower()
    no_punct = re.sub(punctuation, "", lowered)
    alpha = " ".join([word for word in no_punct.split(" ") if word.isalpha()])
    nouns = " ".join([word[0] for word in word_type(alpha.strip().split(" ")) if word[1] != "VB"])
    singulars = " ".join([lemmatizer.lemmatize(word) for word in nouns.split(" ") if word.isalpha()])
    return singulars

In [80]:
# Content matching functions

def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')

def contains_word_with_counter(i, s, w):
    return (i, s.count(w))

def find_matches(text):
    text = clean_email_text(text)
    categories = []
    
    for i, row in categories_df.iterrows():
        category_to_find = row["lookup_value"]
        for c in category_to_find:
            if contains_word(text, c):
                categories.append(contains_word_with_counter(i, text, c))
    return categories

In [81]:
# See which images we need to process

def get_image_size(url):
    data = requests.get(url).content
    im = Image.open(BytesIO(data))    
    return im.size

def get_images_to_analyze(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    images_to_analyze = []

    for img in soup.findAll('img'):
        img_url = "https://s3.amazonaws.com/assets.mailcharts.com/emails/" + guid + "/" + img["src"]
        try:
            width, height = get_image_size(img_url)
            area = width * height
            if area > 10000:
                images_to_analyze.append(img_url)
        except:
            pass

    return images_to_analyze


# OCR
# def process_image_ocr(images):
#     ocr_results = []
    
#     for image in images:
#         # TODO: We need to convert gifs to JPEGs
#         # https://pillow.readthedocs.io/en/5.1.x/reference/plugins.html#module-PIL.GifImagePlugin
#         if image.endswith(".gif"):
#             print("skipping gif")
#         else:
#             response = requests.get(image)
#             img = Image.open(BytesIO(response.content))
#             text = pytesseract.image_to_string(img)
#             ocr_results.append(text + " ")
#     return " ".join(ocr_results)

def get_images_data(images):
    text_results = ""
    image_results = ""
    
    for image in images:
        client = vision.ImageAnnotatorClient()
        request = {'image': {'source': {'image_uri': image},},}
        response = client.annotate_image(request)
        text = " ".join([l.description for l in response.text_annotations])
        label = " ".join([l.description for l in response.label_annotations])
        web = " ".join([l.description for l in response.web_detection.web_entities])

        text_results = text_results + text + " "
        image_results = image_results + label + " "

    return (text_results, image_results)

In [93]:
# image_content, image_text, subject_body

WEIGHTS = {
    "subject": 0.20,
    "body": 0.10,
    "image_text": 0.25,
    "image_content": 0.45
}

def calculate_result(subject, body, image_text, image_content): 
    # Calculate weight
    subject = [(s[0], s[1]*WEIGHTS["subject"]) for s in subject]
    body = [(s[0], s[1]*WEIGHTS["body"]) for s in body]
    image_text = [(s[0], s[1]*WEIGHTS["image_text"]) for s in image_text]
    image_content = [(s[0], s[1]*WEIGHTS["image_content"]) for s in image_content]
    
    # combine all of them
    subject.extend(body)
    subject.extend(image_text)
    subject.extend(image_content)
    
    # add the results
    d = defaultdict(float)

    for i in subject:
        d[i[0]] +=i[1]

    return sorted(d.items(), key=lambda t: t[1], reverse=True)

def categorize(subject, full_text, image_text, image_content):
    subject = find_matches(subject)
    full_text = find_matches(full_text)
    image_text = find_matches(image_text)
    image_content = find_matches(image_content)
    return calculate_result(subject, full_text, image_text, image_content)

def list_categories(res):
    total_points = sum([result[1] for result in results])
    cutoff = total_points * 0.1
    
    categories = []
    for category_index, value in res:
        if value >= 0 and total_points > cutoff:
            categories.append((categories_df.iloc[category_index]["full_category"], value))
            total_points = total_points - value
    return categories

In [83]:
df = pd.read_csv("./data/emails-tag-1100.csv", encoding="ISO-8859-1", dtype=str)
df["subject"] = df["subject"].apply(lambda x: clean_email_text(x))
df["full_text"] = df["full_text"].apply(lambda x: clean_email_text(x))
df = df.iloc[:10]

In [97]:
position = 3
subject = df.iloc[position]["subject"]
full_text = df.iloc[position]["full_text"]
guid = df.iloc[position]["guid"]
url = "https://s3.amazonaws.com/assets.mailcharts.com/emails/" + guid + "/index.html"
images = get_images_to_analyze(url)
image_text, image_content = get_images_data(images)

print("https://www.mailcharts.com/emails/" + df.iloc[position]["guid"])

https://www.mailcharts.com/emails/6ec4361a-cd9b-dd16-84cc-7257447f7071


In [98]:
results = categorize(subject, full_text, image_text, image_content)
list_categories(results)

[('shoe', 2.9000000000000004),
 ('clothing > outerwear', 1.8),
 ('clothing > pants > short', 0.95),
 ('clothing > shirt > top', 0.95),
 ('clothing', 0.9),
 ('clothing > outerwear > blazer', 0.6),
 ('clothing > outerwear > jacket', 0.6),
 ('accessory > bag', 0.5),
 ('accessory > scarf', 0.5),
 ('accessory > jewelry', 0.5),
 ('clothing > underwear > lingerie', 0.45),
 ('clothing > dress', 0.45),
 ('clothing > pants > jean', 0.45),
 ('clothing > pants > denim', 0.45),
 ('clothing > pants > short > swimwear', 0.45)]

In [None]:
# Ideas for improvement
# [ ] Look at google web results
# [X] Exclude values in the lowest N%
# [ ] Only return the lowest level for a given category
# [ ] Classify images by surface area and only look at top N% (e.g. 80%)

# Making this more data scien-cy
# Cluster companies based on similar content

In [None]:
import json
df = pd.read_csv("./data/emails-tag-1100.csv", encoding="ISO-8859-1", dtype=str)

In [150]:
position = 450
item = df.iloc[position]

url = "http://localhost:9000/api"
data = json.dumps({'subject':item["subject"],'full_text':item["full_text"], 'guid':item["guid"]}) 
r = requests.post(url, data)
print(r.json())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [132]:
# Next steps
# Gather email data for 10 companies for last month
# jcrew, gap, uniqlo, banana republic, old navy, nike, coach, brooks brother
# Will work with Koyuki to cluster companies based on similar content