In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Ali Data

In [3]:
# Read the array stored in data/dv_count2.npy and print its shape.
# NOTE(review): allow_pickle=True lets np.load unpickle embedded Python objects,
# which can execute arbitrary code -- only use it on files from a trusted source.
dv_path = 'data/dv_count2.npy'
dv = np.load(dv_path, allow_pickle=True)
print(dv.shape)

(9916, 97, 12)


In [4]:
# Display the array's (truncated) repr for a quick look at the raw values.
dv

array([[['217617880515', '20230910', '70', ..., '14', '0', '46'],
        ['217617880515', '20230808', '0', ..., '0', '0', '0'],
        ['217617880515', '20230731', '0', ..., '0', '0', '0'],
        ...,
        ['217617880515', '20230629', '0', ..., '0', '0', '0'],
        ['217617880515', '20230730', '0', ..., '0', '0', '0'],
        ['217617880515', '20230712', '0', ..., '0', '0', '0']],

       [['218304415675', '20230617', '0', ..., '0', '0', '27'],
        ['218304415675', '20230729', '0', ..., '0', '0', '12'],
        ['218304415675', '20230907', '0', ..., '0', '0', '13'],
        ...,
        ['218304415675', '20230827', '0', ..., '0', '0', '27'],
        ['218304415675', '20230810', '0', ..., '0', '0', '21'],
        ['218304415675', '20230917', '1', ..., '0', '0', '40']],

       [['218598380456', '20230623', '0', ..., '0', '0', '4279'],
        ['218598380456', '20230728', '0', ..., '0', '0', '11767'],
        ['218598380456', '20230729', '3', ..., '2', '0', '10870'],
     

In [6]:
# Build (features, label) samples, using 'consume_uv_1d' (the last column) to
# define blockbusters: a sample is labelled 1 when the total over the next
# COMPARE_DAYS days exceeds GROWTH_THRESHOLD times the total over the previous
# COMPARE_DAYS days, and 0 otherwise.
HISTORY_DAYS = 30             # length of the feature window kept for each sample
COMPARE_DAYS = 3              # days summed on each side of the split day
FIRST_DAY, LAST_DAY = 30, 89  # split days evaluated: range(FIRST_DAY, LAST_DAY)
GROWTH_THRESHOLD = 2.21       # growth ratio above which a sample counts as a blockbuster

num_content = dv.shape[0]
x_list, y_list = [], []
for j in range(num_content):
    content = dv[j, :, :]
    # Order rows by column 1 (the date string), drop the first two columns
    # ('content_id', 'visite_time') and convert to float once per content
    # item instead of on every inner iteration -> (time_step, fea_dim).
    fea = content[np.argsort(content[:, 1])][:, 2:].astype(np.float64)
    for day in range(FIRST_DAY, LAST_DAY):
        y_past = fea[day - COMPARE_DAYS:day, -1]
        y_head = fea[day:day + COMPARE_DAYS, -1]
        past_total = np.sum(y_past)
        # No activity in the look-back window: treat growth as 0 rather than
        # dividing by zero.
        growth = 0 if past_total == 0 else np.sum(y_head) / past_total
        tr_fea = fea[day - HISTORY_DAYS:day, :].T  # -> (fea_dim, K)
        y_inc = 0 if growth <= GROWTH_THRESHOLD else 1
        x_list.append(tr_fea)
        y_list.append(y_inc)

x_list, y_list = np.array(x_list), np.array(y_list)  # (585044, 10, 30), (585044,)

In [7]:
# Partition the samples by label: *1 = explosive, *0 = not explosive.
t1_idx = np.nonzero(y_list)
t0_idx = np.nonzero(1 - y_list)
x1_list = x_list[t1_idx]
x0_list = x_list[t0_idx]
print(x1_list.shape, x0_list.shape)

(16413, 10, 30) (568631, 10, 30)


## Analyze the Basic Patterns of Blockbusters

In [12]:
# Per-feature summary pooled over all samples and all days of the 30-day window,
# comparing y=0 (not explosive) with y=1 (explosive). Each printed line shows
# mean (std) for both groups.
# The 10 feature columns, in the order they appear along axis 1:
feature_names = ['click_uv_1d', 'consume_uv_1d_valid', 'favor_uv_1d', 'comment_uv_1d',
                 'share_uv_1d', 'collect_uv_1d', 'attention_uv_1d', 'lead_shop_uv_1d',
                 'cart_uv_1d', 'consume_uv_1d']

for k, name in enumerate(feature_names):
    v0, v1 = x0_list[:, k, :], x1_list[:, k, :]
    # Pad "<name>:" to 20 characters so the y0/y1 columns line up across rows.
    print("Mean of {:<20} y0: {:.3f} ({:.3f})  y1: {:.3f} ({:.3f})".format(
        name + ":", np.mean(v0), np.std(v0), np.mean(v1), np.std(v1)))

# Finding: in the 30 days before take-off, blockbusters actually have lower
# click, consume-valid, collect, attention, lead_shop and cart counts;
# only favor, comment and share are higher.

Mean of click_uv_1d:         y0: 17.002 (94.132)  y1: 11.495 (123.418)
Mean of consume_uv_1d_valid: y0: 42.211 (534.310)  y1: 34.317 (631.915)
Mean of favor_uv_1d:         y0: 0.372 (11.304)  y1: 0.887 (18.965)
Mean of comment_uv_1d:       y0: 0.006 (0.164)  y1: 0.008 (0.302)
Mean of share_uv_1d:         y0: 0.055 (0.740)  y1: 0.056 (1.310)
Mean of collect_uv_1d:       y0: 0.138 (1.219)  y1: 0.126 (1.829)
Mean of attention_uv_1d:     y0: 0.063 (0.813)  y1: 0.055 (1.299)
Mean of lead_shop_uv_1d:     y0: 6.409 (28.884)  y1: 2.816 (13.789)
Mean of cart_uv_1d:          y0: 0.695 (3.233)  y1: 0.274 (1.635)
Mean of consume_uv_1d:       y0: 72.307 (1008.339)  y1: 61.410 (1176.040)
