In [1]:
import dill
import pandas as pd
import numpy

In [2]:
# Load the raw order rows. The date column is selected by name rather than by
# position, so date parsing keeps working if the CSV's column order changes.
df = pd.read_csv(
    "orders.csv",
    parse_dates=["created_at_date"],
)

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(621212, 6)

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [5]:
# (rows, columns) again, to see how many rows the dropna() removed.
df.shape

(621211, 6)

In [6]:
# Cast the two count/id columns to integers.
int_columns = ["order_item_id", "num_items"]
df[int_columns] = df[int_columns].astype(int)

In [7]:
# Per-column dtypes, to confirm the integer cast and the parsed date column.
df.dtypes

customer_id                object
order_id                    int64
order_item_id               int32
num_items                   int32
revenue                   float64
created_at_date    datetime64[ns]
dtype: object

In [8]:
# Show any rows that still have a missing num_items (expected to be none).
df.loc[df["num_items"].isna(), ["order_id", "num_items"]]

Unnamed: 0,order_id,num_items


In [9]:
# Deserialize the pre-trained model from disk.
# NOTE(review): dill.load, like pickle.load, can execute arbitrary code stored
# in the file; only load model files that come from a trusted source.
with open("model.dill", "rb") as model_file:
    model = dill.load(model_file)

In [10]:
# Smoke-test the loaded model on two hand-written rows of six values each.
sample_rows = numpy.array([
    [3, 92.6, 109.3, 2, 12, 26],
    [2, 10.4, 43.5, 3, 26, 5],
])
model.predict(sample_rows)

array([ 244.9,   89.9])

In [11]:
# Plain sum of the first sample row, for comparison with the model's
# prediction for that same row.
sum([3,92.6,109.3,2,12,26])

244.89999999999998

In [12]:
import dis

In [13]:
# Disassemble the model's predict method to inspect its bytecode.
dis.dis(model.predict)

  7           0 LOAD_GLOBAL              0 (isinstance)
              2 LOAD_FAST                1 (x)
              4 LOAD_GLOBAL              1 (numpy)
              6 LOAD_ATTR                2 (ndarray)
              8 LOAD_GLOBAL              1 (numpy)
             10 LOAD_ATTR                3 (generic)
             12 BUILD_TUPLE              2
             14 CALL_FUNCTION            2
             16 POP_JUMP_IF_TRUE        26

  8          18 LOAD_GLOBAL              4 (Exception)
             20 LOAD_CONST               1 ('Input must be a numpy array')
             22 CALL_FUNCTION            1
             24 RAISE_VARARGS            1

  9     >>   26 LOAD_FAST                1 (x)
             28 LOAD_ATTR                5 (shape)
             30 LOAD_CONST               2 (0)
             32 BINARY_SUBSCR
             34 LOAD_CONST               3 (1)
             36 COMPARE_OP               0 (<)
             38 POP_JUMP_IF_FALSE       48

 10          40 LOAD_GLOBAL  

In [14]:
# Largest num_items value found on any single row, per customer.
max_items_per_customer = df.groupby("customer_id")["num_items"].max()

In [15]:
# Largest revenue value found on any single row, per customer.
max_revenue_per_customer = df.groupby("customer_id")["revenue"].max()

In [16]:
# Revenue summed over all of a customer's rows.
total_revenue_per_customer = df.groupby("customer_id")["revenue"].sum()

In [17]:
# Number of rows per customer (count() tallies non-null order_id entries).
# NOTE(review): if a single order can span several rows (the data also has an
# order_item_id column), this counts rows rather than distinct orders, and
# nunique() would be needed instead. Confirm against the data definition.
total_number_of_orders_per_customer = df.groupby("customer_id")["order_id"].count()

In [18]:
# Most recent created_at_date per customer.
last_order_date_per_customer = df.groupby("customer_id")["created_at_date"].max()

In [19]:
from datetime import datetime

# Fixed reference date ("today") against which order recency is measured.
current_date = datetime(year=2017, month=10, day=17)

In [20]:
# Whole days between each customer's latest order and the reference date.
# Uses vectorized timedelta arithmetic instead of calling a Python lambda once
# per customer; the resulting values, index and series name are the same.
days_since_last_order_per_customer = (current_date - last_order_date_per_customer).dt.days

In [21]:
# Collect every created_at_date of a customer into a single tuple.
order_dates_per_customer = (
    df.groupby("customer_id")["created_at_date"]
    .agg(lambda dates: tuple(dates))
)

In [22]:
def get_longest_interval(order_dates):
    """Return the largest gap, in whole days, between consecutive order dates.

    The dates are sorted first, so the input may be in any order.

    Parameters
    ----------
    order_dates : sequence of datetime-like
        Order timestamps of one customer. Must support ``len()`` and sorting,
        and subtracting two items must yield an object with a ``.days``
        attribute.

    Returns
    -------
    int or float
        The maximum ``.days`` difference between neighbouring dates, or
        ``numpy.nan`` when fewer than two dates are given.
    """
    if len(order_dates) < 2:
        return numpy.nan

    chronological = sorted(order_dates)
    # Pair each date with its successor and measure the gap in whole days.
    gaps_in_days = [
        (later - earlier).days
        for earlier, later in zip(chronological, chronological[1:])
    ]
    return max(gaps_in_days)

In [23]:
# Longest gap in days between consecutive orders, per customer. Customers with
# only one order date come back as NaN (see get_longest_interval).
longest_interval_per_customer = order_dates_per_customer.apply(get_longest_interval)

In [24]:
# Assemble the per-customer feature table: one column per feature series,
# aligned on the shared customer_id index. Column order follows the mapping.
feature_series = {
    "max_items": max_items_per_customer,
    "max_rev": max_revenue_per_customer,
    "total_rev": total_revenue_per_customer,
    "total_num_orders": total_number_of_orders_per_customer,
    "days_since_last_order": days_since_last_order_per_customer,
    "longest_interval": longest_interval_per_customer,
}
result = pd.concat(
    list(feature_series.values()),
    axis=1,
    keys=list(feature_series.keys()),
)

In [25]:
# Mean of the longest interval over customers that have one (NaNs are skipped).
avg_longest_interval = result["longest_interval"].mean()

In [26]:
# Turns off pandas' chained-assignment warning (SettingWithCopyWarning) for the
# rest of the session.
# NOTE(review): this hides genuine chained-assignment problems everywhere in
# the notebook; assigning through .loc avoids the warning without silencing it.
pd.options.mode.chained_assignment = None

In [27]:
# Customers with a single order have no interval between orders. Fill theirs
# with days-since-last-order plus the average longest interval. The write is
# one .loc assignment on `result` itself: chained indexing
# (result.longest_interval[mask] = ...) can write to a temporary object and is
# a no-op under pandas Copy-on-Write.
missing_interval = result["longest_interval"].isna()
result.loc[missing_interval, "longest_interval"] = (
    result.loc[missing_interval, "days_since_last_order"] + avg_longest_interval
)

In [28]:
# Feed the model exactly the six feature columns, in the order the table was
# assembled. Using result.values would also pass predicted_clv as a seventh
# feature whenever this cell is re-run after the column exists.
feature_columns = [
    "max_items",
    "max_rev",
    "total_rev",
    "total_num_orders",
    "days_since_last_order",
    "longest_interval",
]
result["predicted_clv"] = model.predict(result[feature_columns].values)

In [29]:
# Spot-check the prediction for one customer with a single row/column lookup.
result.loc["000011265b8a3727c4cc77b494134aca", "predicted_clv"]

124.78989459555484

In [30]:
# Write the feature table plus predictions to disk. The customer_id index is
# written as the first CSV column (to_csv writes the index by default).
result.to_csv("predicted_clv.csv")