In [2]:
# Connect to the database
import sqlalchemy
from sqlalchemy import create_engine

# Local PostgreSQL database reached through psycopg2; the `ctang` role has no password.
DATABASE_URL = 'postgresql+psycopg2://ctang:@localhost:5432/ctang'
engine = create_engine(DATABASE_URL, encoding='utf-8')

# Open a connection on the engine as soon as it is created.
connection = engine.connect()

from sqlalchemy import Table, Column, Enum, Boolean, Integer, Numeric, Text, Unicode, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref

Base = declarative_base()

class Aisle(Base):
    """Declarative mapping for one row of the ``aisles`` table."""

    __tablename__ = 'aisles'

    # Columns are declared without an explicit name, like the foreign-key
    # columns elsewhere in this file; the attribute name is used instead.
    aisle_id = Column(Integer(), autoincrement=True, primary_key=True)
    aisle = Column(Text())

    def __repr__(self):
        """Return a constructor-style summary of this aisle's columns."""
        template = "Aisle(aisle_id={self.aisle_id}, aisle={self.aisle})"
        return template.format(self=self)

class Department(Base):
    """Declarative mapping for one row of the ``departments`` table."""

    __tablename__ = 'departments'

    # Columns are declared without an explicit name, like the foreign-key
    # columns elsewhere in this file; the attribute name is used instead.
    department_id = Column(Integer(), autoincrement=True, primary_key=True)
    department = Column(Text())

    def __repr__(self):
        """Return a constructor-style summary of this department's columns."""
        template = "Department(department_id={self.department_id}, department={self.department})"
        return template.format(self=self)
    
class Product(Base):
    """Declarative mapping for one row of the ``products`` table.

    A product points at one aisle and one department through foreign keys.
    The ``aisle`` and ``department`` relationships each install a
    ``products`` backref, ordered by ``product_id``, on the target class.
    """

    __tablename__ = 'products'

    product_id = Column(Integer(), autoincrement=True, primary_key=True)
    product_name = Column(Text())
    aisle_id = Column(Integer(), ForeignKey('aisles.aisle_id'))
    department_id = Column(Integer(), ForeignKey('departments.department_id'))

    aisle = relationship('Aisle', backref=backref('products', order_by=product_id))
    department = relationship('Department', backref=backref('products', order_by=product_id))

    def __repr__(self):
        """Return a constructor-style summary built from a unicode template."""
        # The leading u'' literal makes the whole concatenated template unicode.
        template = (
            u"Product(product_id={self.product_id}, "
            "product_name={self.product_name}, "
            "aisle_id={self.aisle_id}, "
            "department_id={self.department_id})"
        )
        return template.format(self=self)

class Order(Base):
    """Declarative mapping for one row of the ``orders`` table."""

    __tablename__ = 'orders'

    order_id = Column(Integer(), autoincrement=True, primary_key=True)
    user_id = Column(Integer())
    # Which evaluation split the order belongs to: 'prior', 'train' or 'test'.
    order_eval_set = Column(Enum('prior', 'train', 'test', name='order_eval_set'))
    order_number = Column(Integer())
    order_dow = Column(Integer())
    order_hour_of_day = Column(Integer())
    # Explicitly nullable.
    days_since_prior = Column(Numeric(), nullable=True)

    def __repr__(self):
        """Return a constructor-style summary of every column of this order."""
        template = (
            'Order(order_id={self.order_id}, '
            'user_id={self.user_id}, '
            'order_eval_set={self.order_eval_set}, '
            'order_number={self.order_number}, '
            'order_dow={self.order_dow}, '
            'order_hour_of_day={self.order_hour_of_day}, '
            'days_since_prior={self.days_since_prior})'
        )
        return template.format(self=self)

class LineItem(Base):
    """One product inside one order, mapped to ``order_products__train``.

    The primary key is the composite ``(order_id, product_id)``. The ``order``
    and ``product`` relationships each install a ``line_items`` backref,
    ordered by ``add_to_cart_order``, on ``Order`` and ``Product``.
    """

    __tablename__ = 'order_products__train'

    order_id = Column(Integer(), ForeignKey('orders.order_id'), primary_key=True)
    product_id = Column(Integer(), ForeignKey('products.product_id'), primary_key=True)
    add_to_cart_order = Column('add_to_cart_order', Integer())
    reordered = Column('reordered', Boolean())

    order = relationship('Order', backref=backref('line_items', order_by=add_to_cart_order))
    # This relationship targets Product, so it is named `product`; it was
    # previously exposed under the misleading name `department`.
    product = relationship('Product', backref=backref('line_items', order_by=add_to_cart_order))

    @property
    def department(self):
        """Alias of ``product`` kept for code written against the old name.

        Works on instances only: it is a plain Python property, not a mapped
        attribute, so use ``LineItem.product`` in query expressions.
        """
        return self.product

    @department.setter
    def department(self, value):
        self.product = value

    def __repr__(self):
        """Return a constructor-style summary of this line item's columns."""
        template = (
            'LineItem(order_id={self.order_id}, '
            'product_id={self.product_id}, '
            'add_to_cart_order={self.add_to_cart_order}, '
            'reordered={self.reordered})'
        )
        return template.format(self=self)

# Create, through `engine`, the tables for every model declared on Base above.
# NOTE(review): this relies on create_all's default handling of tables that
# already exist in the database -- confirm that is the intended behavior here.
Base.metadata.create_all(engine)

In [45]:
# Product 24's name contains a non-ASCII character (the registered-trademark
# sign visible in the output below), which a plain print of the unicode repr
# cannot handle; the repr is therefore encoded to UTF-8 explicitly first.
# Open question from the author: what is the alternative to print in ASCII?
# NOTE(review): `session` is created in a later cell of this file (In [3]), so
# this cell only runs after that one has been executed.
product = (
    session.query(Product)
    .filter(Product.product_id == 24)
    .first()
)
encoded_repr = product.__repr__().encode('utf-8')
print(encoded_repr)

Product(product_id=24, product_name=Tri-Vi-Sol® Vitamins A-C-and D Supplement Drops for Infants, aisle_id=47, department_id=11)


In [3]:
from sqlalchemy.orm import sessionmaker
# Session factory bound to `engine`, and the single `session` instance that
# the query cells in this file use.
Session = sessionmaker(bind=engine)
session = Session()

from sqlalchemy.sql import func, desc, distinct

# Top 10 departments with the most product listings:
# join departments to products, count the joined rows per department name,
# and keep the 10 largest counts.
top_departments = (
    session.query(Department.department, func.count().label('product_count'))
    .join(Product)
    .group_by(Department.department)
    .order_by(desc('product_count'))
    .limit(10)
)
for record in top_departments:
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("%20s | %s" % (record.department, record.product_count))

       personal care | 6563
              snacks | 6264
              pantry | 5371
           beverages | 4365
              frozen | 4007
          dairy eggs | 3449
           household | 3085
        canned goods | 2092
     dry goods pasta | 1858
             produce | 1684


In [29]:
# Top N most ordered items (N is fixed at 10 here):
# join products to their LineItem rows, count the rows per product,
# and keep the largest counts.
most_ordered = (
    session.query(Product.product_name, func.count().label('num_orders'))
    .join(LineItem)
    .group_by(Product.product_id)
    .order_by(desc('num_orders'))
    .limit(10)
)
for record in most_ordered:
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("%50s | %s" % (record.product_name, record.num_orders))

                                            Banana | 18726
                            Bag of Organic Bananas | 15480
                              Organic Strawberries | 10894
                              Organic Baby Spinach | 9784
                                       Large Lemon | 8135
                                   Organic Avocado | 7409
                              Organic Hass Avocado | 7293
                                      Strawberries | 6494
                                             Limes | 6033
                               Organic Raspberries | 5546


# Predicting Days Until Next Order Based on the Current Order

Let's build a machine learning model to predict $d = $ days until the next order given only the contents of the current order. We'll represent an order with a feature vector of 0s and 1s. The index of an entry in the vector will correspond to a `product_id`. If the entry is 0, the current order doesn't contain the product. If the entry is 1, the current order contains the product. For example, to represent an order by a customer for `1% Chocolate Milk` and `Kale Apple Greens`, the feature vector would look like

$
order =
\begin{bmatrix}
1 \\
0 \\
0 \\
\vdots \\
1 \\
\vdots \\
0 \\
\end{bmatrix}
\quad
\begin{array}{l}
\text{1% Chocolate Milk} \\
\text{1% Low Fat Cottage Cheese} \\
\text{Acacia Fiber Organic Powder} \\
\vdots \\
\text{Kale Apple Greens} \\
\vdots \\
\text{Zero Calorie Lemon Lime Twist Soda} \\
\end{array}
$

To represent the entire set of orders used for training our ML model, we'll use a $3421083 \times 49688$ matrix

$
orders_{train} =
\overbrace{
\begin{bmatrix}
(order_1)^T \\
(order_2)^T \\
(order_3)^T \\
\vdots \\
(order_{3,421,083})^T \\
\end{bmatrix}
}^{49,688}
$

There are 3,421,083 orders, hence the number of rows. There are 49,688 products, hence the number of columns (or features). Our goal is to learn the weights vector, $\theta \in \mathbb{R}^{49,688 \times 1}$. Each element of this vector is the weight that the corresponding product has in determining how many days later the next order will arrive.

Notice that $orders_{train}$ will consist mostly of 0s because the average order contains about 10 out of 49,688 products. This means $orders_{train}$ is a sparse matrix, and perhaps later we can explore taking advantage of this structure to reduce the number of computations to produce $\theta$.

Another thing to notice is that even though we have other information attached to an order such as day of the week, hour of the day, etc., this ML model will assume that those "features" don't matter. In the real world, they probably do, but for now, let's ignore them.

TODO: Write about the label vector, $y$.

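The model's cost function, where $m$ is the number of training orders, $x^{(i)}$ and $y^{(i)}$ are the feature vector and label of the $i$-th training order, and $h_\theta(x^{(i)})$ is the model's prediction for it, will be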

$
\begin{align}
J(\theta) = \frac{1}{2} \sum_{i = 1}^m \left( h_\theta (x^{(i)}) - y^{(i)}  \right)^2
\end{align}
$

To build the training set, first, let's build one feature vector for an order with its corresponding label.

```
X_tr = orders_train = []
Y_tr = labels_train = []
for user in orders.user_id:
    if not user.has_more_than_one_order:
        continue
    for order, next_order in successive pairs of user.orders:
        if next_order is None:  # all orders except the latest one
            continue
        order_vector = vector of zeroes.
        for product_id in order.products:
            order_vector[product_id - 1] = 1 # use 0-based indexing, hence, subtract 1 from product_id for index
        label = next_order.days_since_prior
        X_tr.append(order_vector)
        Y_tr.append(label)
```

In [4]:
# num_users = session.query(Order.user_id).distinct().count() # 206,209

# order_by(Order.order_number) might not be necessary, but explicitly state it

def preprocess(tup):
    """Normalize one ``(order_id, days_since_prior)`` pair.

    Args:
        tup: indexable whose item 0 is passed through unchanged and whose
            item 1 is either ``None`` or a value accepted by ``int()``.

    Returns:
        A 2-tuple ``(tup[0], days)`` where ``days`` is ``-1`` when item 1 is
        ``None`` and ``int(tup[1])`` otherwise.
    """
    first, raw_days = tup[0], tup[1]
    # -1 is the sentinel for "no prior order interval recorded".
    return first, (-1 if raw_days is None else int(raw_days))

def n_user_ids(N):
    """Return up to ``N`` distinct user ids that have a 'train' order.

    The ids are sorted ascending before the limit is applied. Without an
    ORDER BY the database is free to return any ``N`` matching users, so
    repeated runs could sample different users.

    Args:
        N: maximum number of user ids to return.

    Returns:
        A list of user ids (the first column of each result row).
    """
    query = (
        session.query(distinct(Order.user_id))
        .filter(Order.order_eval_set == 'train')
        # order_by must be applied before limit so the LIMIT acts on sorted rows.
        .order_by(Order.user_id)
        .limit(N)
    )
    return [record[0] for record in query]

def orders(user_id):
    """Return one user's orders as a list, sorted by ``order_number``.

    Args:
        user_id: value compared against ``Order.user_id``.

    Returns:
        A list with one ``preprocess``-ed ``(order_id, days_since_prior)``
        pair per order of the user.
    """
    order_info_for_user = (
        session.query(Order.order_id, Order.days_since_prior)
        .filter(Order.user_id == user_id)
        .order_by(Order.order_number)
        .all()
    )
    # Build a real list rather than calling map(): callers iterate the result
    # more than once, which a lazy Python 3 map object would not support.
    return [preprocess(row) for row in order_info_for_user]

def order_products(order_id):
    """Return the ``product_id`` rows of the line items belonging to ``order_id``.

    Returns:
        The list produced by ``.all()``: one single-column row per matching
        ``LineItem``.

    NOTE(review): the recorded output below shows an empty list for every
    order of the sampled user; presumably those orders have no rows in the
    table ``LineItem`` is mapped to -- confirm.
    """
    product_id_rows = session.query(LineItem.product_id)
    rows_for_order = product_id_rows.filter(LineItem.order_id == order_id)
    return rows_for_order.all()

# For one sampled user, walk the orders in sequence and show each
# (current order id, next order id) pair with the current order's products.
for user in n_user_ids(1):
    order_id_and_days_since_prior = orders(user)
    order_ids = [tup[0] for tup in order_id_and_days_since_prior]
    days_since_prior = [tup[1] for tup in order_id_and_days_since_prior]

    # A user with a single order has no successor order, so nothing to pair up.
    if len(order_ids) > 1:
        for curr_oid, next_oid in zip(order_ids, order_ids[1:]):
            print("%s %s" % (curr_oid, next_oid))
            print(order_products(curr_oid))

# print order_products(1187899)
# for record in session.query(Order.user_id).distinct().limit(2):
#     order_info_for_user = orders(record.user_id)
#     for (idx, curr_order) in enumerate(order_info_for_user):
#         if (idx + 1 >= len(order_info_for_user)):
#             continue # curr_order is the user's latest order
#         next_order = order_info_for_user[idx + 1]
#         days_until_next_purchase = next_order[1]
#         for product in order.products:
#             # set feature vector
#     print '\n'

2539329 2398795
[]
2398795 473747
[]
473747 2254736
[]
2254736 431534
[]
431534 3367565
[]
3367565 550135
[]
550135 3108588
[]
3108588 2295261
[]
2295261 2550362
[]
2550362 1187899
[]


In [5]:
# Show 10 distinct user ids that have a 'train' order, one result row per line.
train_user_rows = (
    session.query(distinct(Order.user_id))
    .filter(Order.order_eval_set == 'train')
    .limit(10)
)
for record in train_user_rows:
    print(record)

(1,)
(2,)
(5,)
(7,)
(8,)
(9,)
(10,)
(13,)
(14,)
(17,)
