In [241]:
# Data: http://share.mailcharts.com/0P092V2m0V2x
# Headers: id, company_name, company_id, subject, sent_at, promotion_type, promotion_value

# Goal 1: When will the next email be?
# -- in [n] seconds
# Goal 2: Will the next email be a discount?
# -- True || False
# Goal 3: How much will they discount by?

In [269]:
from datetime import date, datetime
import pandas as pd
import numpy as np
import re
import scipy

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [262]:
# Load the raw email export; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="./data/capstone.csv")

In [270]:
# Add features

def determine_weekday(n):
    """Return 0 when the day number ``n`` is 5 or 6 (weekend), otherwise 1.

    The caller in this notebook passes ``datetime.weekday()`` values, where
    5 and 6 are Saturday and Sunday.
    """
    weekend_days = (5, 6)
    return 0 if n in weekend_days else 1
    
def determine_am_pm(date):
    """Return 0 for a morning (AM) time and 1 for an afternoon/evening (PM) time.

    The hour is inspected directly rather than comparing ``strftime('%p')``
    with the literal "AM": ``%p`` yields the *locale's* AM/PM marker, so the
    string comparison misclassifies every value on locales whose marker is
    not "AM".

    Parameters
    ----------
    date : datetime-like
        Object exposing an ``hour`` attribute. Objects without one (for
        example ``datetime.date``) are treated as midnight and yield 0.

    Returns
    -------
    int
        0 when the hour is before noon, 1 otherwise.
    """
    return 0 if getattr(date, "hour", 0) < 12 else 1
    
def create_datetime(row):
    """Build a ``datetime`` (time left at midnight) from a row's date parts.

    ``row`` is indexed with the keys "year", "month" and "day".
    """
    year, month, day = (row[key] for key in ("year", "month", "day"))
    return datetime(year, month, day)

def construct_full_date(timestamp):
    """Parse a timestamp string such as "2017-01-01 00:01:32" into a ``datetime``.

    The string is split on spaces, hyphens and colons; the first six pieces
    are read as year, month, day, hour, minute and second.

    Raises
    ------
    ValueError
        If a piece is not an integer or the values do not form a valid date.
    IndexError
        If the string yields fewer than six pieces.
    """
    # Raw-string character class. The previous non-raw pattern contained
    # backslash escapes that are not valid Python string escapes, which emits a
    # warning on current Python and is slated to become an error.
    pieces = re.split(r"[ :-]", timestamp)
    # Named `fields` rather than `date` so the imported datetime.date is not shadowed.
    fields = [int(piece) for piece in pieces]
    return datetime(
        year=fields[0],
        month=fields[1],
        day=fields[2],
        hour=fields[3],
        minute=fields[4],
        second=fields[5],
    )

# Promotion flag, stored as the strings "0"/"1": "0" when promotion_type is the
# literal "[null]" placeholder, "1" otherwise.
df["promo_not_promo"] = np.where(df["promotion_type"] == "[null]", "0", "1")

# Calendar parts sliced out of the "YYYY-MM-DD HH:MM:SS" sent_at string.
df["month"] = df["sent_at"].str.split('-').str.get(1).astype(int)
df["year"] = df["sent_at"].str.split('-').str.get(0).astype(int)
df["day"] = df["sent_at"].str.split('-').str.get(2).str.split(' ').str.get(0).astype(int)

# ISO week number of the send date.
df["weeknum"] = df.apply(lambda x: create_datetime(x).isocalendar()[1], axis=1)

# AM/PM must come from the full timestamp: create_datetime() only carries
# year/month/day (time fixed at midnight), which would flag every row as AM.
df["am_pm"] = df["sent_at"].map(lambda ts: determine_am_pm(construct_full_date(ts)))

# 0 = weekend, 1 = weekday.
df["weekday_weekend"] = df.apply(lambda x: determine_weekday(create_datetime(x).weekday()), axis=1)

In [271]:
# Get company dummies (we will use this later)

# One indicator column per distinct company name; preview the first five rows.
dummy_company = pd.get_dummies(data=df["company_name"])
dummy_company.head(5)

Unnamed: 0,Abercrombie & Fitch,Ace & Everett,Active Ride Shop,Adorama,Altrec.com,Amazon.co.uk,Ames Walker,Andrew Christian,Ann Taylor,Annie Selke,...,Thomas Pink,Timothy's Café,Tommy Bahama,Treatsie,Variety,White Stuff,Wine Library,WoolOvers,ZALORA,boohoo
99,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [265]:
# Gather when the previous email was sent

# Newest first. sent_at is compared as a string; this assumes every value is a
# zero-padded "YYYY-MM-DD HH:MM:SS" so string order equals time order.
df = df.sort_values("sent_at", ascending=False)

# For every row, find the most recent email from the same company that was
# sent strictly earlier (None when there is none).
#
# A single oldest -> newest pass replaces the nested row-by-row scan. The scan
# tested the truth value of an array (an error when a company has several
# earlier emails), and could append more than one value per row, leaving
# `results` longer than the frame.
latest_by_company = {}  # company -> (latest sent_at seen, the sent_at strictly before it)
results = []
for company, sent_at in zip(df["company_name"][::-1], df["sent_at"][::-1]):
    newest, before_newest = latest_by_company.get(company, (None, None))
    if sent_at == newest:
        # Same timestamp as this company's latest email: it is not "earlier",
        # so both rows share the same predecessor.
        previous = before_newest
    else:
        previous = newest
        latest_by_company[company] = (sent_at, newest)
    results.append(previous)

# The pass ran oldest -> newest; flip back to match the frame's row order.
results.reverse()

# Save the results to our dataframe
df["prev_email"] = results

In [267]:
# Get the time delta with the last email

def get_time_delta(a, b):
    """Return the whole number of seconds elapsed from timestamp ``b`` to ``a``.

    Parameters
    ----------
    a : str
        Later timestamp, in the format accepted by ``construct_full_date``.
    b : str or None
        Earlier timestamp, or None/NaN when there is no earlier email.

    Returns
    -------
    int
        0 when ``b`` is missing; otherwise the full elapsed time in seconds,
        including whole days.
    """
    # `b != b` is only true for NaN-like values, which pandas may substitute
    # for None in a column.
    if b is None or b != b:
        return 0
    elapsed = construct_full_date(a) - construct_full_date(b)
    # timedelta.seconds is only the within-day remainder (0..86399) and drops
    # whole days; total_seconds() covers the entire span.
    return int(elapsed.total_seconds())
    
# Seconds between each email and the previous one recorded for that row.
df["prev_email_delta"] = [
    get_time_delta(sent_at, prev_email)
    for sent_at, prev_email in zip(df["sent_at"], df["prev_email"])
]
df

Unnamed: 0,id,company_name,company_id,subject,sent_at,promotion_type,promotion_value,promo_not_promo,month,year,day,weeknum,am_pm,weekday_weekend,prev_email,prev_email_delta
99,1276920,Lethal Performance,4895,Happy New Year from Lethal! Here's a sale...,2017-01-01 08:31:27,[null],[null],0,1,2017,1,52,0,0,,0
98,1276916,Hawes & Curtis,1936,Happy New Year from Hawes & Curtis!,2017-01-01 08:28:34,[null],[null],0,1,2017,1,52,0,0,,0
97,1276914,Original Parts Group,4931,Happy New Year! Free Freight on Everything,2017-01-01 08:25:39,"[""discount_percentage""]",[30],1,1,2017,1,52,0,0,,0
96,1276911,"Julian Whitaker, MD",5836,HAPPY NEW YEAR! There is time to save on our Y...,2017-01-01 08:18:42,"[""discount_percentage""]",[60],1,1,2017,1,52,0,0,,0
95,1276910,Thomas Pink,1942,Whatever your work,2017-01-01 08:16:20,[null],[null],0,1,2017,1,52,0,0,,0
94,1276909,GO Outdoors,1875,Happy New Year! Sale continues - 1000’s of lin...,2017-01-01 08:12:53,"[""discount_percentage""]",[60],1,1,2017,1,52,0,0,,0
93,1276908,Clarks Shoes,1913,Boots for 2017 adventures + SALE continues,2017-01-01 08:12:39,"[""discount_percentage""]",[50],1,1,2017,1,52,0,0,,0
92,1276906,Booster Juice,5729,A Birthday Smoothie,2017-01-01 08:05:42,[null],[null],0,1,2017,1,52,0,0,,0
91,1276904,Farfetch US,5123,Activewear | Start 2017 the right way...,2017-01-01 08:04:57,[null],[null],0,1,2017,1,52,0,0,,0
90,1276903,Dillard's,1849,New Year's Day : Take an add'l 50% off,2017-01-01 08:04:26,"[""discount_percentage""]",[50],1,1,2017,1,52,0,0,,0


In [274]:
# TODO: attach the company dummies to df column-wise, e.g. pd.concat([df, dummy_company], axis=1)

## Homework

- [x] Get time delta
- [x] Merge timedelta into one DF
- [ ] Create a test dataset with n=10,000
- [x] Think about what models we want to run through this

## Models to use
**Question: When will the next email be?**

- Linear regression

**Will the next email be a discount?**

- Random forest (`RandomForestClassifier`, an ensemble of `DecisionTreeClassifier` trees)

**How much will they discount by?**

- SVM