In [237]:
# copy(select companies.name, emails.guid, emails.subject, emails.full_text from emails join companies on emails.company_id = companies.id join companies_tags on companies.id = companies_tags.company_id where companies_tags.tag_id = 1100 and emails.entity_state = 'original' order by emails.created_at desc limit 500) to '/tmp/emails.csv' with CSV HEADER;
# data: http://share.mailcharts.com/0L193K121x1l
# https://docs.google.com/spreadsheets/d/1XAYfOeRTxcebdt0ALm0ExsxhK0bodrx-bfEKA9ct7GE/edit#gid=0

import pandas as pd
import numpy as np
import math
import re
import time
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag as word_type

In [238]:
def combine_categories(row):
    result = row.category_1.lower()
    
    if row.category_2:
        result = result + " > " + row.category_2.lower()
    if row.category_3:
        result = result + " > " + row.category_3.lower()
    if row.category_4:
        result = result + " > " + row.category_4.lower()
    if row.category_5:
        result = result + " > " + row.category_5.lower()
    if row.category_6:
        result = result + " > " + row.category_6.lower()
    if row.category_7:
        result = result + " > " + row.category_7.lower()

    return result

def get_last_category_val(row):    
    if row.lookup_value:
        return row.lookup_value.lower().split("|")
    if row.category_7:
        return [row.category_7.lower()]
    if row.category_6:
        return [row.category_6.lower()]
    if row.category_5:
        return [row.category_5.lower()]
    if row.category_4:
        return [row.category_4.lower()]
    if row.category_3:
        return [row.category_3.lower()]
    if row.category_2:
        return [row.category_2.lower()]
    if row.category_1:
        return [row.category_1.lower()]

In [239]:
# https://www.google.com/basepages/producttype/taxonomy.en-US.txt
cat = pd.read_csv("./data/taxonomy-carl.csv", encoding="ISO-8859-1", dtype=str)
cat.fillna(False, inplace=True)
cat["combined"] = cat.apply(lambda x: combine_categories(x), axis=1)
cat["lookup_value"] = cat.apply(lambda x: get_last_category_val(x), axis=1)
print(cat.shape)
cat.head()

(101, 9)


Unnamed: 0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,lookup_value,combined
0,clothing,False,False,False,False,False,False,[clothing],clothing
1,clothing,outerwear,False,False,False,False,False,[outerwear],clothing > outerwear
2,clothing,outerwear,blazer,False,False,False,False,[blazer],clothing > outerwear > blazer
3,clothing,outerwear,coat,False,False,False,False,[coat],clothing > outerwear > coat
4,clothing,outerwear,rain coat,False,False,False,False,"[rain coat, rain jacket]",clothing > outerwear > rain coat


In [248]:
lemmatizer = WordNetLemmatizer()

# def clean_email_text(content):
#     lowered = str(content).lower()
#     singulars = " ".join([lemmatizer.lemmatize(word) for word in lowered.split(" ") if word.isalpha()])
#     nouns = " ".join([word[0] for word in word_type(singulars.strip().split(" ")) if word[1] != "VB"])
#     return nouns

def clean_email_text(content):
    lowered = str(content).lower()
    alpha = " ".join([word for word in lowered.split(" ") if word.isalpha()])
    nouns = " ".join([word[0] for word in word_type(alpha.strip().split(" ")) if word[1] != "VB"])
    singulars = " ".join([lemmatizer.lemmatize(word) for word in nouns.split(" ") if word.isalpha()])
    return singulars

In [249]:
df = pd.read_csv("./data/emails-tag-1100.csv", encoding="ISO-8859-1", dtype=str)
df["all_text"] = df["subject"] + " " + df["full_text"]
df["all_text"] = df["all_text"].apply(lambda x: clean_email_text(x))
df = df.iloc[:50]
print(df.shape)
df.head()

(50, 5)


Unnamed: 0,name,guid,subject,full_text,all_text
0,Secret Sales UK,8caef803-009b-7751-312c-52f8d0181565,Beat The January Blues with up to 60% off Seve...,"Plus: John & Yoko Shearling, Vintage Louis Vui...",the january blue with up to off puma basket as...
1,Fashion Eyewear,3b9666be-1ddc-952f-329e-6e832911892e,January Sale Final Weekend | Up to 30% OFF!,Get 20% OFF using code FRESH20 No Images? Clic...,january sale final weekend up to off using cod...
2,Betabrand,0e24b7c3-1971-bd90-e271-2050cd50159c,Sasquatch Conquers All!,"[A new shirt celebrating nature's most brutal,...",sasquatch conquers new shirt celebrating most ...
3,Free People,6ec4361a-cd9b-dd16-84cc-7257447f7071,WANTEDð: That Femme Outfit,Whatâs New at Free People Sweet-meets-sensua...,that femme outfit new at free people view in a...
4,BOOM by Cindy Joseph,bee78052-5639-435c-87c4-38ca892ff505,âI felt liberated. ____was my first step to ...,BOOM! by Cindy Joseph: It's about women. It's ...,my first step to feeling by cindy about about ...


In [250]:
def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')

all_text = df.all_text.apply(lambda x: x.lower())

start_time = time.time()
categories = []

for text in all_text:
    category = set()
    for i, row in cat.iterrows():
        category_to_find = row["lookup_value"]
        for c in category_to_find:
            if contains_word(text, c):
                category.add(i)
    categories.append(list(category))

df["category_ids"] = categories

In [251]:
def convert_categories_index_to_text(categories):
    result = []
    if len(categories) == 0:
        return []
    for i in categories:
        result.append(cat.iloc[i]["combined"])
        result.append("*****")
    return result

df["categories"] = df["category_ids"].apply(lambda x: convert_categories_index_to_text(x))


In [252]:
df

Unnamed: 0,name,guid,subject,full_text,all_text,category_ids,categories
0,Secret Sales UK,8caef803-009b-7751-312c-52f8d0181565,Beat The January Blues with up to 60% off Seve...,"Plus: John & Yoko Shearling, Vintage Louis Vui...",the january blue with up to off puma basket as...,"[64, 65, 36, 41, 73, 74, 53, 28]","[accessory, *****, accessory > bag, *****, clo..."
1,Fashion Eyewear,3b9666be-1ddc-952f-329e-6e832911892e,January Sale Final Weekend | Up to 30% OFF!,Get 20% OFF using code FRESH20 No Images? Clic...,january sale final weekend up to off using cod...,[78],"[accessory > sunglass, *****]"
2,Betabrand,0e24b7c3-1971-bd90-e271-2050cd50159c,Sasquatch Conquers All!,"[A new shirt celebrating nature's most brutal,...",sasquatch conquers new shirt celebrating most ...,"[65, 38, 8, 74, 51, 22]","[accessory > bag, *****, clothing > underwear ..."
3,Free People,6ec4361a-cd9b-dd16-84cc-7257447f7071,WANTEDð: That Femme Outfit,Whatâs New at Free People Sweet-meets-sensua...,that femme outfit new at free people view in a...,"[2, 50]","[clothing > outerwear > blazer, *****, shoe, *..."
4,BOOM by Cindy Joseph,bee78052-5639-435c-87c4-38ca892ff505,âI felt liberated. ____was my first step to ...,BOOM! by Cindy Joseph: It's about women. It's ...,my first step to feeling by cindy about about ...,[74],"[accessory > watch, *****]"
5,Anthropologie,aab73866-8b24-58e1-7f5b-c50ea44476c5,How to wear indigo.,We're loving the blues. ----------------------...,how to loving the loving the shop new arrival ...,"[50, 12]","[shoe, *****, clothing > pants > denim, *****]"
6,Bluefly,0e448959-a6f9-a7cb-cd6e-7e565b9313ad,(Hint: Diamonds On Sale) Up To 75% Off Fine Je...,"Plus, Extra 20% Off Dresses | View On Web Brow...",diamond on up to off fine extra off dress view...,"[64, 65, 41, 50, 87, 60]","[accessory, *****, accessory > bag, *****, clo..."
7,Endource Limited,0f9420b2-4d5a-2233-1009-845f29eb3b78,A shortcut to Parisian chic,| Bring the fitting room to your home | 663399...,a shortcut to parisian chic bring the fitting ...,[],[]
8,LE CHÃTEAU,90c770fa-88fc-cecc-2c38-774f9e1166fc,"Flash Sale: 1,500 styles at $50 or less!",Seriously. Low. Prices. View in browser Free S...,flash style at or view in browser free shippin...,"[64, 50, 60]","[accessory, *****, shoe, *****, shoe > flat, *..."
9,Alex Monroe,407155a8-b8e2-b1d0-c346-32736b50603a,Final Weekend of our Archive Sale! | Alex talk...,It's the final days of our Archive Sale and a ...,final weekend of our archive alex talk of the ...,"[89, 74, 90]","[accessory > jewelry > earring, *****, accesso..."


In [257]:
# QA
## URL: https://www.mailcharts.com/emails/_GUID_
## Wrong (from file #1: 'emails.csv'): 11, 14, 15, 24, 25

position = 6

print("https://www.mailcharts.com/emails/" + df.iloc[position]["guid"])
print("========")
print(df.iloc[position]["all_text"])
print("========")
print(df.iloc[position]["categories"])

https://www.mailcharts.com/emails/0e448959-a6f9-a7cb-cd6e-7e565b9313ad
diamond on up to off fine extra off dress view on web browser shipping woman men shoe bag accessory jewelry kid home clearance fine style shop now shop now shop now shop now shop now dress extra shop now facebook twitter pinterest instagram polyvore customer service track my order faq jewelry spotlight dress sale end at am est on january discount is reflected in pricing and excludes out of stock additional exclusion select style a marked on product page item doe not for reduced price cannot applied retroactively to purchase made prior to the start of this sale or to purchase made after the end of this payment subject to credit check and down payment may affirm is only available for cart worth affirm loan are made by cross river a new jersey state chartered commercial member see for flat rate shipping fee of will automatically applied at no order minimum international shipping and previous order express and expedited

In [110]:
# next steps: weight things
### how can we weight things (e.g. subject weights more than body)
### items lower on the page are woth less (or if it's a navigation or a one-off mention)
### how can we get a score (and remove anything below X score)


# using google vision to understand the email better (image recognition, OCR, etc)
### use this info to add new categories
# explore using different weights (would this make sense for us?)

In [254]:
# import string
# from nltk.corpus import words
# from nltk.corpus import stopwords
# from nltk.corpus import wordnet
# from nltk.tokenize import word_tokenize

# # Get stopwords_set and punctuation set.
# stopwords = set(stopwords.words('english'))
# punctuation = set(string.punctuation)

# all_words_in_subjects = []

# for i, row in df.iterrows():
#     word_tokens = word_tokenize(row.subject)
#     filtered_sentence = [w.lower() for w in word_tokens if not w in stopwords and not w in punctuation and w.isalpha()]
#     all_words_in_subjects.extend(filtered_sentence)
    
#     word_tokens = word_tokenize(row.full_text)
#     filtered_sentence = [w.lower() for w in word_tokens if not w in stopwords and not w in punctuation and w.isalpha()]
#     all_words_in_subjects.extend(filtered_sentence)

In [259]:
# Playing with Google Vision API

import io
import os

# Imports the Google Cloud client library
from google.cloud import vision
from google.cloud.vision import types

# Instantiates a client
client = vision.ImageAnnotatorClient()

# # The name of the image file to annotate
# file_name = os.path.join(
#     os.path.dirname(__file__),
#     '/Users/carl/Desktop/demo-image.jpg')

# # Loads the image into memory
# with io.open(file_name, 'rb') as image_file:
#     content = image_file.read()

# image = types.Image(content=content)

# # Performs label detection on the image file
# response = client.label_detection(image=image)
# labels = response.label_annotations

# print('Labels:')
# for label in labels:
#     print(label.description)

ImportError: No module named 'google'