# Auction
* 500 Buyers
* \# Painint Train / Test: 2000 / 1000
* Choose top-10
* Evaluation Metric: Rate of choosing the top-K (Recall)


# Problems
* Negative values in price????????????????????????????????????????????
* Number of testing paining are NOT 1000 ????????????????????????????????????????????

In [1]:
NUM_BUYER = 500
NUM_TRAIN = 2000
NUM_TEST = 1000 - 1

import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture


def read_price(f_name):    
    with open(f_name, 'r') as f:
        data = np.array([list(map(float, line.strip().split(' '))) for line in f])
    assert len(data) == NUM_BUYER
    assert data.shape[1] == (NUM_TRAIN if 'train' in f_name else NUM_TEST)
    return data

    
def read_category(f_name):    
    with open(f_name, 'r') as f:
        data = np.array([int(e.strip()) for e in f.readlines()])
    assert len(data) == NUM_TRAIN if 'train' in f_name else NUM_TEST
    return data


def scale_to_origin(train, test):
    grand_min = min(np.min(train), np.min(test))
    if grand_min >= 0:
        print(f"Min price: {grand_min} is already above 0")
    train -= grand_min
    test -= grand_min
    assert min(np.min(train), np.min(test)) == 0
    return train, test


def get_win_buyer(data):
    return np.argmax(data, axis=0)


def boradcast_const_pred(pred, k, num_test=NUM_TEST):
    return np.tile(pred, reps=num_test).reshape(num_test, -1)


def get_mean_pred(train, k, num_test):
    return boradcast_const_pred(train.mean(1).argsort()[::-1][:k], k, num_test)


def recall(pred, ans):
    assert len(pred) == len(ans)
    return np.array([a in topk for a, topk in zip(ans, pred)]).mean()


def get_user_features(train, n_components=128, method='NMF'):
    mf_model = eval(method)(n_components=n_components)
    user_features = mf_model.fit_transform(train)
    return user_features


def get_cls_pred(train, user_features, k, n_pick_per_cls):
    n_clusters = k // n_pick_per_cls
    cls_model = KMeans(n_clusters=n_clusters, n_jobs=-1).fit(user_features)

    bucket = [[] for _ in range(cls_model.n_clusters)]
    pred = []
    for u_id, c_id in enumerate(cls_model.predict(user_features)):
        bucket[c_id].append(u_id) 
    for user_ids in bucket:
        user_ids = np.array(user_ids)
        pred.extend(user_ids[train[user_ids].mean(1).argsort()[-n_pick_per_cls:]])
    return pred


def get_gmm_pred(train, user_features, k, n_pick_per_cls):
    n_components = k // n_pick_per_cls
    gmm = GaussianMixture(n_components=n_components).fit(user_features)
    prob = gmm.predict_proba(user_features)
    
    bucket = [[] for _ in range(gmm.n_components)]
    pred = []
    for u_id, c_id in enumerate(gmm.predict(user_features)):
        bucket[c_id].append(u_id) 
    for user_ids in bucket:
        user_ids = np.array(user_ids)
        pred.extend(user_ids[train[user_ids].mean(1).argsort()[-n_pick_per_cls:]])
    return pred


def result(pred_dict, test, k=10):
#     assert all([len(pred[0]) == k for _, pred in pred_dict.items()])
    ans = get_win_buyer(test)
    pred_random_test = np.random.randint(k, size=(len(test[0]), k))

    recall_random_test = recall(pred_random_test, ans)
    recall_random_theory = k / len(test)

    print(f"Recall@{k}\n"
          f"Random:\t\t\t{100 * recall_random_test:.6f}%\n"
          f"Random (theory):\t{100 * recall_random_theory:.6f}%\n" +
          f"".join([f"{name}\t\t\t{100 * recall(boradcast_const_pred(pred, k, len(test[0])), ans):.6f}%\n" for name, pred in pred_dict.items()]))

# Basic

In [2]:
train = read_price('Auction/basic-train.txt')
test = read_price('Auction/basic-test.txt')
train, test = scale_to_origin(train, test)

In [3]:
user_features = get_user_features(train, n_components=64, method='NMF')

In [4]:
n_pick_per_cls = 2
for k in [1, 3, 5, 10, 20, 30]:
    k *= n_pick_per_cls
    result({'Mean': get_mean_pred(train, k, len(test[1])),
            'Cls': get_cls_pred(train, user_features, k, n_pick_per_cls),
            'Gmm': get_gmm_pred(train, user_features, k, n_pick_per_cls)
           },
           test,
           k=k)

Recall@2
Random:			0.000000%
Random (theory):	0.400000%
Mean			5.405405%
Cls			5.405405%
Gmm			5.405405%

Recall@6
Random:			1.101101%
Random (theory):	1.200000%
Mean			10.210210%
Cls			10.810811%
Gmm			9.209209%

Recall@10
Random:			1.501502%
Random (theory):	2.000000%
Mean			15.315315%
Cls			13.313313%
Gmm			14.114114%

Recall@20
Random:			2.702703%
Random (theory):	4.000000%
Mean			22.622623%
Cls			18.018018%
Gmm			20.720721%

Recall@40
Random:			6.106106%
Random (theory):	8.000000%
Mean			33.733734%
Cls			29.429429%
Gmm			31.131131%

Recall@60
Random:			7.007007%
Random (theory):	12.000000%
Mean			42.642643%
Cls			36.336336%
Gmm			32.532533%



# Advance

In [5]:
train = read_price('Auction/advanced-train.txt')
test = read_price('Auction/advanced-test.txt')
cat_train = read_category('Auction/advanced-train-category.txt')
cat_test = read_category('Auction/advanced-test-category.txt')

train, test = scale_to_origin(train, test)

train = {k: train[:, cat_train==k] for k in [1, 2, 3]}
# test = {k: test[:, cat_test==k] for k in [1, 2, 3]}

display(pandas.Series(cat_train).value_counts())
display(pandas.Series(cat_test).value_counts())

1    677
3    669
2    654
dtype: int64

3    342
2    334
1    323
dtype: int64

In [6]:
method = 'NMF'
n_components = 32
user_features = {k: get_user_features(mat, n_components, method) for k, mat in train.items()}

In [7]:
def combine_mean(k):
    pred = np.empty((NUM_TEST, k))
    pred[cat_test==1] = get_mean_pred(train[1], k, sum(cat_test==1))
    pred[cat_test==2] = get_mean_pred(train[2], k, sum(cat_test==2))
    pred[cat_test==3] = get_mean_pred(train[3], k, sum(cat_test==3))
    return pred

def combine_cls(k, n_pick_per_cls):
    pred = np.empty((NUM_TEST, k))
    for part in [1, 2, 3]:
        pred[cat_test==part] = get_cls_pred(train[part], user_features[part], k, n_pick_per_cls)
    return pred

def combine_gmm(k, n_pick_per_cls):
    pred = np.empty((NUM_TEST, k))
    for part in [1, 2, 3]:
        pred[cat_test==part] = get_gmm_pred(train[part], user_features[part], k, n_pick_per_cls)
    return pred

n_pick_per_cls = 2
for k in [1, 3, 4, 5, 10, 20]:
    k *= n_pick_per_cls
    result({'Mean': combine_mean(k),
            'Cls': combine_cls(k, n_pick_per_cls),
            'Gmm': combine_gmm(k, n_pick_per_cls)
           },
           test,
           k=k)

Recall@2
Random:			0.400400%
Random (theory):	0.400000%
Mean			4.104104%
Cls			4.104104%
Gmm			4.104104%

Recall@6
Random:			0.200200%
Random (theory):	1.200000%
Mean			9.109109%
Cls			8.808809%
Gmm			8.408408%

Recall@8
Random:			1.101101%
Random (theory):	1.600000%
Mean			11.311311%
Cls			10.710711%
Gmm			11.011011%

Recall@10
Random:			0.700701%
Random (theory):	2.000000%
Mean			13.113113%
Cls			12.112112%
Gmm			11.911912%

Recall@20
Random:			2.302302%
Random (theory):	4.000000%
Mean			20.120120%
Cls			19.719720%
Gmm			20.120120%

Recall@40
Random:			4.304304%
Random (theory):	8.000000%
Mean			30.630631%
Cls			29.029029%
Gmm			28.528529%

