In [1]:
import sys
import numpy as np

from utils.dataprocessor import DataProcessor
from utils.dataset import Dataset

from params import PREDICT_AMOUNT

In [2]:
def get_most_popular(df):
    return df["article_id"].value_counts()[:PREDICT_AMOUNT].keys()


def map_score(pred, gt):
    print("scoring")
    total_score = 0
    for u in range(len(pred)):
        score = 0.0
        num_hits = 0.0
        for i, p in enumerate(pred[u]):
            if p in gt[u] and p not in pred[u][:i]:
                num_hits += 1.0
                score += num_hits / (i + 1.0)
        total_score = total_score + (score / min(len(gt[u]), PREDICT_AMOUNT))
    return total_score / len(pred)


def get_test_gt(test_customer_ids, test_df):
    test_gts = []
    for customer_id in test_customer_ids:
        gt = test_df[test_df["customer_id"] == customer_id]["article_id"].values
        test_gts.append(gt)
    return test_gts


def evaluate_popular(most_popular, test_customer_ids, test_gts):
    pred = predict_popular(most_popular, test_customer_ids)
    score = map_score(pred, test_gts)
    print(f"score is {score}")


def predict_popular(most_popular_articles, customer_ids):
    return [most_popular_articles for _ in range(len(customer_ids))]

In [3]:
from params import OG_DATA_PATH, PROCESSED_DATA_PATH, PROCESSED_DATA_OUT_PATH

dataprocessor = DataProcessor(OG_DATA_PATH, PROCESSED_DATA_PATH, PROCESSED_DATA_OUT_PATH)

#%% get train/test_set

train_df = Dataset(dataprocessor.get_train_df())
test_df = Dataset(dataprocessor.get_test_df())
article_df = Dataset(dataprocessor.get_article_df())
customer_df = Dataset(dataprocessor.get_customer_df())

customer data fast load
article data fast load
transaction data fast load


In [4]:
train_customer_ids = train_df["customer_id"].unique()

In [5]:
#%% get most popular articles

test_customer_ids = test_df["customer_id"].unique()
test_gts = get_test_gt(test_customer_ids, test_df)

In [6]:
most_popular = get_most_popular(train_df)

In [7]:
# evaluate_popular(most_popular, test_customer_ids, test_gts)  # score = 0.003

In [8]:
# def find_last_purchase(customer_ids, train_df):
#     last_bought = np.empty((len(customer_ids)))
#     for i in range(len(customer_ids)):
#         cid = customer_ids[i]
#         c_df = train_df[train_df["customer_id"] == cid]
#         c_df = c_df.sort_values(by="t_dat", ascending=False)
#         last_bought[i] = c_df["article_id"].values[0]
#     return last_bought

# #%% get most similar article
# last_purchase_article_id = find_last_purchase(train_customer_ids, train_df)

In [15]:
import time

def get_similar_articles(aid, train_df, skip_customer_id, article_count=105542):
    # most popular articles among customers that bought article A
    start = time.time()
    customers = train_df[train_df["article_id"] == aid]["customer_id"].unique()
    print(len(customers))

    a_count = np.zeros((article_count))
    for cid in customers:
        # print(cid)
        # start_loop = time.time()
        if not cid == skip_customer_id:
            a_ids = train_df[train_df["customer_id"] == cid]["article_id"].unique()
            a_count[a_ids] = a_count[a_ids] + 1
        # print("loop time=", time.time()-start_loop)

    print("Total_t=", time.time()-start)
    return a_count


def get_similar_articles2(aid, train_df, skip_customer_id, article_count=105542):
    # most popular articles among customers that bought article A
    customers = train_df[train_df["article_id"] == aid]["customer_id"].unique()
    customers = customers[customers != skip_customer_id]
    transaction = train_df[train_df["customer_id"].isin(customers)]
    a_counts = transaction.groupby(["article_id"]).size()
    return a_counts

def predict_similar(train_df, customer_ids, most_popular, article_count=105542):
    preds = np.zeros((len(customer_ids), PREDICT_AMOUNT))
    for i in range(len(customer_ids)):
        print(f"{i}|{len(customer_ids)}")
        cid = customer_ids[i]
        aids = train_df[train_df["customer_id"] == cid]["article_id"].unique()
        if len(aids)>0:
            a_count = np.zeros((article_count))
            for aid in aids:
                # counts = get_similar_articles(aid, train_df,skip_customer_id=cid)
                counts = get_similar_articles2(aid, train_df,skip_customer_id=cid)
                a_count[counts.keys()] = a_count[counts.keys()] + counts.values

            ind = np.argpartition(-a_count, kth=PREDICT_AMOUNT)[:PREDICT_AMOUNT]
            preds[i] = ind
        else:
            preds[i] = most_popular
    return preds
        

def evaluate_similar(train_df, test_customer_ids, test_gts, most_popular):
    pred = predict_similar(train_df, test_customer_ids, most_popular)
    print(pred)
    score = map_score(pred, test_gts)
    print(f"score is {score}")


small_test_customer_ids = test_customer_ids[:50]
small_test_gts = test_gts[:50]
evaluate_similar(train_df, small_test_customer_ids, small_test_gts, most_popular)  # score = 0.0077

0|50
1|50
2|50
3|50
4|50
5|50
6|50
7|50
8|50
9|50
10|50
11|50
12|50
13|50
14|50
15|50
16|50
17|50
18|50
19|50
20|50
21|50
22|50
23|50
24|50
25|50
26|50
27|50
28|50
29|50
30|50
31|50
32|50
33|50
34|50
35|50
36|50
37|50
38|50
39|50
40|50
41|50
42|50
43|50
44|50
45|50
46|50
47|50
48|50
49|50
[[1.4690e+03 5.9458e+04 5.1504e+04 2.4559e+04 2.0850e+04 5.1505e+04
  5.3892e+04 3.5920e+03 4.7295e+04 2.2640e+04 8.9397e+04 3.7110e+03]
 [2.4836e+04 5.9100e+04 2.7438e+04 6.0342e+04 1.2744e+04 5.3892e+04
  4.5270e+04 6.0343e+04 2.4837e+04 1.2724e+04 3.2700e+02 8.7648e+04]
 [5.3893e+04 5.3892e+04 5.3894e+04 1.0285e+04 5.3896e+04 2.2360e+03
  1.4253e+04 1.4240e+04 5.3902e+04 1.7155e+04 1.2585e+04 3.0910e+03]
 [5.3892e+04 6.7522e+04 5.3893e+04 4.6351e+04 6.7051e+04 5.3894e+04
  1.4076e+04 2.4837e+04 5.3902e+04 6.8263e+04 5.7977e+04 2.2360e+03]
 [1.7044e+04 1.7043e+04 3.8616e+04 6.5545e+04 2.2340e+03 2.2250e+03
  5.3892e+04 5.3893e+04 2.2300e+03 2.4837e+04 2.4956e+04 1.3894e+04]
 [4.4440e+03 4.4497e+04 1