In [1]:
# copy(select companies.name, emails.guid, emails.subject, emails.full_text from emails join companies on emails.company_id = companies.id join companies_tags on companies.id = companies_tags.company_id where companies_tags.tag_id = 1100 and emails.entity_state = 'original' order by emails.created_at desc limit 500) to '/tmp/emails.csv' with CSV HEADER;
# data: http://share.mailcharts.com/0L193K121x1l
# https://docs.google.com/spreadsheets/d/1XAYfOeRTxcebdt0ALm0ExsxhK0bodrx-bfEKA9ct7GE/edit#gid=0

import pandas as pd
import numpy as np
import math
import re
import time
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag as word_type
from collections import Counter
from collections import defaultdict
import string

In [2]:
def combine_categories(row):
    result = row.category_1.lower()
    
    if row.category_2:
        result = result + " > " + row.category_2.lower()
    if row.category_3:
        result = result + " > " + row.category_3.lower()
    if row.category_4:
        result = result + " > " + row.category_4.lower()
    if row.category_5:
        result = result + " > " + row.category_5.lower()
    if row.category_6:
        result = result + " > " + row.category_6.lower()
    if row.category_7:
        result = result + " > " + row.category_7.lower()

    return result

def get_last_category_val(row):    
    if row.lookup_value:
        return row.lookup_value.lower().split("|")
    if row.category_7:
        return [row.category_7.lower()]
    if row.category_6:
        return [row.category_6.lower()]
    if row.category_5:
        return [row.category_5.lower()]
    if row.category_4:
        return [row.category_4.lower()]
    if row.category_3:
        return [row.category_3.lower()]
    if row.category_2:
        return [row.category_2.lower()]
    if row.category_1:
        return [row.category_1.lower()]

In [3]:
# https://www.google.com/basepages/producttype/taxonomy.en-US.txt
cat = pd.read_csv("./data/taxonomy-carl.csv", encoding="ISO-8859-1", dtype=str)
cat.fillna(False, inplace=True)
cat["combined"] = cat.apply(lambda x: combine_categories(x), axis=1)
cat["lookup_value"] = cat.apply(lambda x: get_last_category_val(x), axis=1)
print(cat.shape)
cat.head()

(101, 9)


Unnamed: 0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,lookup_value,combined
0,clothing,False,False,False,False,False,False,[clothing],clothing
1,clothing,outerwear,False,False,False,False,False,[outerwear],clothing > outerwear
2,clothing,outerwear,blazer,False,False,False,False,[blazer],clothing > outerwear > blazer
3,clothing,outerwear,coat,False,False,False,False,[coat],clothing > outerwear > coat
4,clothing,outerwear,rain coat,False,False,False,False,"[rain coat, rain jacket]",clothing > outerwear > rain coat


In [4]:
lemmatizer = WordNetLemmatizer()

# def clean_email_text(content):
#     lowered = str(content).lower()
#     singulars = " ".join([lemmatizer.lemmatize(word) for word in lowered.split(" ") if word.isalpha()])
#     nouns = " ".join([word[0] for word in word_type(singulars.strip().split(" ")) if word[1] != "VB"])
#     return nouns

punctuation = r"\+|\:|\{|\\|\(|\-|\`|\<|\?|\*|\;|\_|\@|\'|\[|\}|\)|\,|\/|\"|\$|\=|\&|\]|\!|\%|\>|\^|\~|\||\.|\#"

def clean_email_text(content):
    lowered = str(content).lower()
    no_punct = re.sub(punctuation, "", lowered)
    alpha = " ".join([word for word in no_punct.split(" ") if word.isalpha()])
    nouns = " ".join([word[0] for word in word_type(alpha.strip().split(" ")) if word[1] != "VB"])
    singulars = " ".join([lemmatizer.lemmatize(word) for word in nouns.split(" ") if word.isalpha()])
    return singulars

In [5]:
df = pd.read_csv("./data/emails-tag-1100.csv", encoding="ISO-8859-1", dtype=str)
df["all_text"] = df["subject"] + " " + df["full_text"]
df["all_text"] = df["all_text"].apply(lambda x: clean_email_text(x))
df = df.iloc[:50]
print(df.shape)
df.head()

(50, 5)


Unnamed: 0,name,guid,subject,full_text,all_text
0,Secret Sales UK,8caef803-009b-7751-312c-52f8d0181565,Beat The January Blues with up to 60% off Seve...,"Plus: John & Yoko Shearling, Vintage Louis Vui...",the january blue with up to off puma basket he...
1,Fashion Eyewear,3b9666be-1ddc-952f-329e-6e832911892e,January Sale Final Weekend | Up to 30% OFF!,Get 20% OFF using code FRESH20 No Images? Clic...,january sale final weekend up to off off using...
2,Betabrand,0e24b7c3-1971-bd90-e271-2050cd50159c,Sasquatch Conquers All!,"[A new shirt celebrating nature's most brutal,...",sasquatch conquers all a new shirt celebrating...
3,Free People,6ec4361a-cd9b-dd16-84cc-7257447f7071,WANTEDð: That Femme Outfit,Whatâs New at Free People Sweet-meets-sensua...,that femme outfit new at free people sweetmeet...
4,BOOM by Cindy Joseph,bee78052-5639-435c-87c4-38ca892ff505,âI felt liberated. ____was my first step to ...,BOOM! by Cindy Joseph: It's about women. It's ...,felt liberated wa my first step to feeling boo...


In [6]:
def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')

all_text = df.all_text

categories = []

for text in all_text:
    category = set()
    for i, row in cat.iterrows():
        category_to_find = row["lookup_value"]
        for c in category_to_find:
            if contains_word(text, c):
                category.add(i)
    categories.append(list(category))

df["category_ids"] = categories

In [7]:
def convert_categories_index_to_text(categories):
    result = []
    if len(categories) == 0:
        return []
    for i in categories:
        result.append(cat.iloc[i]["combined"])
        result.append("*****")
    return result

df["categories"] = df["category_ids"].apply(lambda x: convert_categories_index_to_text(x))


In [8]:
df

Unnamed: 0,name,guid,subject,full_text,all_text,category_ids,categories
0,Secret Sales UK,8caef803-009b-7751-312c-52f8d0181565,Beat The January Blues with up to 60% off Seve...,"Plus: John & Yoko Shearling, Vintage Louis Vui...",the january blue with up to off puma basket he...,"[64, 65, 36, 41, 73, 74, 50, 53, 55, 28]","[accessory, *****, accessory > bag, *****, clo..."
1,Fashion Eyewear,3b9666be-1ddc-952f-329e-6e832911892e,January Sale Final Weekend | Up to 30% OFF!,Get 20% OFF using code FRESH20 No Images? Clic...,january sale final weekend up to off off using...,[78],"[accessory > sunglass, *****]"
2,Betabrand,0e24b7c3-1971-bd90-e271-2050cd50159c,Sasquatch Conquers All!,"[A new shirt celebrating nature's most brutal,...",sasquatch conquers all a new shirt celebrating...,"[65, 38, 8, 74, 51, 22]","[accessory > bag, *****, clothing > underwear ..."
3,Free People,6ec4361a-cd9b-dd16-84cc-7257447f7071,WANTEDð: That Femme Outfit,Whatâs New at Free People Sweet-meets-sensua...,that femme outfit new at free people sweetmeet...,"[8, 2, 50]","[clothing > outerwear > jacket, *****, clothin..."
4,BOOM by Cindy Joseph,bee78052-5639-435c-87c4-38ca892ff505,âI felt liberated. ____was my first step to ...,BOOM! by Cindy Joseph: It's about women. It's ...,felt liberated wa my first step to feeling boo...,[74],"[accessory > watch, *****]"
5,Anthropologie,aab73866-8b24-58e1-7f5b-c50ea44476c5,How to wear indigo.,We're loving the blues. ----------------------...,how to indigo were loving the blue were loving...,"[64, 65, 8, 43, 12, 80, 50, 28, 61]","[accessory, *****, accessory > bag, *****, clo..."
6,Bluefly,0e448959-a6f9-a7cb-cd6e-7e565b9313ad,(Hint: Diamonds On Sale) Up To 75% Off Fine Je...,"Plus, Extra 20% Off Dresses | View On Web Brow...",hint diamond on sale up to off fine jewelry pl...,"[64, 65, 41, 50, 87, 88, 89, 90, 91, 60]","[accessory, *****, accessory > bag, *****, clo..."
7,Endource Limited,0f9420b2-4d5a-2233-1009-845f29eb3b78,A shortcut to Parisian chic,| Bring the fitting room to your home | 663399...,a shortcut to parisian chic bring the fitting ...,[],[]
8,LE CHÃTEAU,90c770fa-88fc-cecc-2c38-774f9e1166fc,"Flash Sale: 1,500 styles at $50 or less!",Seriously. Low. Prices. View in browser Free S...,flash sale style at or le seriously low price ...,"[0, 50, 64, 60]","[clothing, *****, shoe, *****, accessory, ****..."
9,Alex Monroe,407155a8-b8e2-b1d0-c346-32736b50603a,Final Weekend of our Archive Sale! | Alex talk...,It's the final days of our Archive Sale and a ...,final weekend of our archive sale alex talk su...,"[89, 74, 90]","[accessory > jewelry > earring, *****, accesso..."


In [9]:
# QA
## Consider excluding "flat rate"

position = 6

print("https://www.mailcharts.com/emails/" + df.iloc[position]["guid"])
print("========")
print(df.iloc[position]["all_text"])
print("========")
print(df.iloc[position]["categories"])

https://www.mailcharts.com/emails/0e448959-a6f9-a7cb-cd6e-7e565b9313ad
hint diamond on sale up to off fine jewelry plus extra off dress view on web browser bluefly shipping woman men shoe bag accessory jewelry kid home clearance fine jewelry style spotlight shop now ring shop now earring shop now necklace shop now bracelet shop now dress sale extra off shop now facebook twitter pinterest instagram polyvore google customer service track my order faq fine jewelry spotlight dress sale end at am est on january discount is reflected in pricing and excludes out of stock item additional exclusion apply select style a marked on product page this item doe not for promotion reduced price cannot applied retroactively to purchase made prior to the start of this sale or to purchase made after the end of this sale affirm payment subject to credit check and approval down payment may required affirm is only available for cart worth affirm loan are made by cross river bank a new jersey state chartered 

In [10]:
def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')

def contains_word_with_counter(i, s, w):
    return (i, s.count(w))


def find_matches(text):
    text = clean_email_text(text)
    categories = []
    
    for i, row in cat.iterrows():
        category_to_find = row["lookup_value"]
        for c in category_to_find:
            if contains_word(text, c):
                categories.append(contains_word_with_counter(i, text, c))
    return categories


In [11]:
WEIGHTS = {
    "subject": 0.66,
    "body": 0.34
}

subject = find_matches("Buy these great boxers! And don't forget to buy this top ring. The ring is great!")
body = find_matches("skirt, skirt, skirt! ring :)")
print(subject)
print(body)

def calculate_result(subject, body):    
    # Calculate weight
    subject = [(s[0], s[1]*WEIGHTS["subject"]) for s in subject]
    body = [(s[0], s[1]*WEIGHTS["body"]) for s in body]
    
    # combine the two
    subject.extend(body)
    print(subject)
    
    # add the results
    d = defaultdict(float)

    for i in subject:
        d[i[0]] +=i[1]

    return sorted(d.items(), key=lambda t: t[1], reverse=True)

    
results = calculate_result(subject, body)

def get_categories(res):
    categories = []
    for category_index, value in res:
        if value >= 1:
            categories.append((cat.iloc[category_index]["combined"], value))
    return categories
        

get_categories(results)



[(28, 1), (34, 1), (91, 2)]
[(42, 3), (91, 1)]
[(28, 0.66), (34, 0.66), (91, 1.32), (42, 1.02), (91, 0.34)]


[('accessory > jewelry > ring', 1.6600000000000001),
 ('clothing > skirt', 1.02)]

In [23]:
# OCR from file

from PIL import Image
import pytesseract
import argparse
import os


GUID = "b93a703428830a4a6bf263604b7c4db7098fd8b2.jpg"
text = pytesseract.image_to_string(Image.open("/Users/carl/sites/thinkful-data-science/coursework/final_capstone/data/images/" + GUID))
print(text)

$79-$89
DAILY LEATHER
ZIP POUCH )


In [26]:
# OCR from URL
# Note: We'll need to convert animated gifs to pngs or jpegs

import requests
from io import BytesIO

response = requests.get('https://s3.amazonaws.com/assets.mailcharts.com/emails/3b9666be-1ddc-952f-329e-6e832911892e/images/5bb20f551499ca223af0c14e1cc36a984208abb3.png')
img = Image.open(BytesIO(response.content))
text = pytesseract.image_to_string(img)
print(text)

FINAL WEEKEND!

MAKE THAT FIRST PURCHASE OF 2018 A GOOD ONE


In [110]:
# next steps: weight things
### inception v3 from google to classify images (or use google vision)
####### object tagging
####### something similar for OCR that is pre-trained
### implement in production
## CARL TODO
# Clean up code
# Add blacklist (e.g. "flat rate")
# Continue building the keyword list
# Look into next steps


# using google vision to understand the email better (image recognition, OCR, etc)
### use this info to add new categories
# explore using different weights (would this make sense for us?)

In [254]:
# import string
# from nltk.corpus import words
# from nltk.corpus import stopwords
# from nltk.corpus import wordnet
# from nltk.tokenize import word_tokenize

# # Get stopwords_set and punctuation set.
# stopwords = set(stopwords.words('english'))
# punctuation = set(string.punctuation)

# all_words_in_subjects = []

# for i, row in df.iterrows():
#     word_tokens = word_tokenize(row.subject)
#     filtered_sentence = [w.lower() for w in word_tokens if not w in stopwords and not w in punctuation and w.isalpha()]
#     all_words_in_subjects.extend(filtered_sentence)
    
#     word_tokens = word_tokenize(row.full_text)
#     filtered_sentence = [w.lower() for w in word_tokens if not w in stopwords and not w in punctuation and w.isalpha()]
#     all_words_in_subjects.extend(filtered_sentence)

In [12]:
# Playing with Google Vision API
# https://google-cloud-python.readthedocs.io/en/latest/vision/index.html
# https://google-cloud-python.readthedocs.io/en/latest/vision/gapic/v1/api.html
# https://cloud.google.com/vision/docs/libraries
# https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1347

from google.cloud.vision_v1 import ImageAnnotatorClient

# Instantiates a client
client = ImageAnnotatorClient(credentials="/Users/carl/sites/google_creds.json")

image_url = "https://dha4w82d62smt.cloudfront.net/items/2i1h33230w0C0U3Q1E44/demo-image.jpg?X-CloudApp-Visitor-Id=1951556&v=23a7e102"

request = {'image':{'source':{'image_uri':image_url}}}

response = client.annotate_image(request)
# len(response.annotations)

# for face in response.annotations[0].faces:
#     print(face.joy)

# for logo in response.annotations[0].logos:
#     print(logo.description)

ERROR:root:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x1265b91d0>" raised exception!
Traceback (most recent call last):
  File "//anaconda/lib/python3.5/site-packages/grpc/_plugin_wrapping.py", line 77, in __call__
    callback_state, callback))
  File "//anaconda/lib/python3.5/site-packages/google/auth/transport/grpc.py", line 77, in __call__
    callback(self._get_authorization_headers(context), None)
  File "//anaconda/lib/python3.5/site-packages/google/auth/transport/grpc.py", line 61, in _get_authorization_headers
    self._credentials.before_request(
AttributeError: 'str' object has no attribute 'before_request'


ServiceUnavailable: 503 Getting metadata from plugin failed with error: 'str' object has no attribute 'before_request'