# [02_BTB0.63523evaluation](https://www.kaggle.com/clustifier/btb-0-63523-evaluation/code)


### 1 understand apk & mapk
- [code](https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py)

```
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single pair of lists.

    Parameters
    ----------
    actual : list
             The relevant items; their order is irrelevant.
    predicted : list
                Ranked predictions, best first; only the first ``k`` are
                scored.
    k : int, optional
        Cut-off applied to ``predicted``.

    Returns
    -------
    score : double
            Sum of the precision at every first-occurrence hit within the
            top ``k`` predictions, divided by ``min(len(actual), k)``.
            0.0 when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # A repeated prediction is only credited the first time it appears.
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several pairs of lists.

    Parameters
    ----------
    actual : list
             One list of relevant items per query
             (order inside each list is irrelevant).
    predicted : list
                One ranked list of predictions per query
                (order inside each list matters).
    k : int, optional
        Cut-off forwarded to ``apk`` for every query.

    Returns
    -------
    score : double
            ``np.mean`` of the per-query ``apk`` scores. Queries are paired
            with ``zip``, so surplus entries in the longer input are ignored.
    """
    per_query = []
    for truth, ranking in zip(actual, predicted):
        per_query.append(apk(truth, ranking, k))
    return np.mean(per_query)
```

- summary

```
    1. If the actual list is empty, the pip-installed `ml_metrics.apk` returns 1.0, while the GitHub code above returns 0.0;
    2. If the actual list has more than one element, the order of the actual list doesn't matter; the final value is the sum of the precision at each hit position in the predicted list, divided by min(len(actual), k);
    3. If the actual list is not empty, the order of the predicted list matters;
    4. If a max number k is given, the predicted list is truncated to its first k elements.
```    

In [87]:
from ml_metrics import apk

# Probe apk on hand-picked inputs, grouped by the property each group shows.
probes = (
    ("zero elements", (
        ([], []),
        ([], [1, 2]),
    )),
    ("several elements", (
        ([1, 2], [1]),
        ([1, 2], [10, 1, 2]),
        ([2, 1], [10, 1, 2]),
        ([1, 2, 3, 4], [10, 1, 2, 3, 4]),
        ([1, 2, 3, 4], [1, 2, 3, 4]),
    )),
    ("order matter", (
        ([1], [1, 2, 3, 4]),
        ([2], [1, 2, 3, 4]),
        ([3], [1, 2, 3, 4]),
        ([4], [1, 2, 3, 4]),
    )),
)
for heading, pairs in probes:
    print(heading)
    for truth, guess in pairs:
        print(apk(truth, guess))

# The only hit sits at rank 4, beyond the k=3 cut-off.
print("max number")
print(apk([4], [1, 2, 3, 4], k=3))

zero elements
1.0
1.0
several elements
0.5
0.5833333333333333
0.5833333333333333
0.6791666666666667
1.0
order matter
1.0
0.5
0.3333333333333333
0.25
max number
0.0


In [86]:
# try to rewrite the apk function
def apk(actual, predicted, k=10):
    """
    Average precision at k, scoring an empty ``actual`` as 1.0.

    Parameters
    ----------
    actual : list
             The relevant items (order doesn't matter)
    predicted : list
                Ranked predictions; only the first ``k`` are scored
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            1.0 when ``actual`` is empty; otherwise the summed precision at
            each first-occurrence hit divided by ``min(len(actual), k)``
    """
    if len(predicted) > k:
        predicted = predicted[:k]
    if not actual:
        return 1.0

    precisions = []
    for pos, candidate in enumerate(predicted):
        # Duplicated predictions are only credited at their first position.
        if candidate in actual and candidate not in predicted[:pos]:
            precisions.append((len(precisions) + 1.0) / (pos + 1))
    return sum(precisions, 0.0) / min(len(actual), k)


def apk(actual, predicted, k=10):
    """
    Compute the average precision at k between two lists of items.

    Parameters
    ----------
    actual : list
             Elements that should be predicted (order doesn't matter)
    predicted : list
                Predicted elements, ranked (order does matter)
    k : int, optional
        The maximum number of predicted elements taken into account

    Returns
    -------
    score : double
            The average precision at k over the input lists;
            0.0 if ``actual`` is empty
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    running_hits = 0.0
    total = 0.0
    position = 0
    for guess in predicted:
        position += 1
        # Count a guess only if it is relevant and has not been made earlier.
        if guess in actual and guess not in predicted[:position - 1]:
            running_hits += 1.0
            total += running_hits / float(position)

    return total / min(len(actual), k) if actual else 0.0

            
# Re-run the same probes against the apk defined in this cell.
ranking = [1, 2, 3, 4]

print("zero elements")
for guess in ([], [1, 2]):
    print(apk([], guess))

print("several elements")
for truth, guess in (
    ([1, 2], [1]),
    ([1, 2], [10, 1, 2]),
    ([2, 1], [10, 1, 2]),
    ([1, 2, 3, 4], [10, 1, 2, 3, 4]),
    ([1, 2, 3, 4], [1, 2, 3, 4]),
):
    print(apk(truth, guess))

# The single relevant item moves down the ranking one place at a time.
print("order matter")
for target in (1, 2, 3, 4):
    print(apk([target], ranking))

print("max number")
print(apk([4], ranking, k=3))

zero elements
0.0
0.0
several elements
0.5
0.5833333333333333
0.5833333333333333
0.6791666666666667
1.0
order matter
1.0
0.5
0.3333333333333333
0.25
max number
0.0


## Original Code 

In [None]:
import pandas as pd
import numpy as np 

# Constant added to the denominator in get_prob (anokas' regularization idea).
reg = 12
# When True, part of the training displays is held out for local scoring.
# NOTE(review): this name shadows the builtin eval() for the rest of the session.
eval = True

train = pd.read_csv("../../data/unzip_data/clicks_train.csv")

if eval:
    # Draw a tenth of the distinct display_ids, without replacement.
    ids = train.display_id.unique()
    ids = np.random.choice(ids, size=len(ids)//10, replace=False)

    # Rows of the drawn displays go to valid, all other rows stay in train.
    held_out = train.display_id.isin(ids)
    valid = train[held_out]
    train = train[~held_out]

    print (valid.shape, train.shape)

# Per ad: number of clicked rows, and number of rows overall.
cnt = train.loc[train.clicked == 1, 'ad_id'].value_counts()
cntall = train.ad_id.value_counts()
del train

def get_prob(k):
    """Return the regularized click rate of ad ``k``.

    The rate is ``cnt[k] / (cntall[k] + reg)``; ads that have no entry in
    ``cnt`` score 0.
    """
    if k in cnt:
        clicks = cnt[k]
        shown = float(cntall[k])
        return clicks / (shown + reg)
    return 0

def srt(x):
    """Reorder a space-separated string of ad ids by descending ``get_prob``.

    Ids with equal scores keep their original relative order.
    """
    ranked = [int(token) for token in x.split()]
    ranked.sort(key=get_prob, reverse=True)
    return " ".join(str(ad) for ad in ranked)
   
if eval:
    from ml_metrics import mapk

    # Candidate ads of every held-out display; groupby sorts by display_id.
    candidates = valid.groupby('display_id').ad_id.apply(list)
    # Clicked ads keyed by display_id. Truth and predictions are paired by key
    # so the score does not depend on valid's row order matching the groupby
    # order, nor on every display having exactly one clicked row.
    clicked = valid[valid.clicked == 1].groupby('display_id').ad_id.apply(list).to_dict()

    y = [clicked.get(display_id, []) for display_id in candidates.index]
    p = [sorted(x, key=get_prob, reverse=True) for x in candidates]

    print (mapk(y, p, k=12))
else:
    # Rank the ads of every test display and write the submission file.
    subm = pd.read_csv("../../data/unzip_data/sample_submission.csv")
    subm['ad_id'] = subm.ad_id.apply(srt)
    subm.to_csv("subm_reg_1.csv", index=False)

##  Rewrite the code

- The `sorted`-based ranking approach is not very efficient.

In [122]:
import numpy as np
import pandas as pd
# Flag left over from the original script; nothing in the visible code reads it.
vaild = True
# Smoothing constant used by get_prob.
L = 12
# NROW=-1

train = pd.read_csv("../../data/unzip_data/clicks_train.csv")
# Overall fraction of clicked rows.
M = train.clicked.mean()
# Per ad: number of rows overall, and number of clicked rows.
cntall = train.ad_id.value_counts()
cnt = train.loc[train.clicked == 1, 'ad_id'].value_counts()

def get_prob(k):
    """Return the smoothed click rate of ad ``k``.

    The estimate is ``(clicks + L * M) / (views + L)``: the observed rate
    shrunk towards the global click rate ``M`` with a weight of ``L``
    pseudo-views. Ads that never appear in ``cntall`` get exactly ``M``.

    The previous formula, ``(L + clicks) / (views + L)``, shrank towards 1.0
    instead of ``M``, so an ad seen once and clicked once scored 1.0 and
    outranked ads with many views. Ads shown but never clicked now use the
    same formula (scoring below ``M``) instead of falling back to ``M``.
    """
    if k not in cntall:
        return M
    # cnt only has entries for ads with at least one click.
    clicks = cnt.get(k, 0)
    return (clicks + L * M) / (cntall[k] + L)

def srt(x):
    """Return the ad ids in ``x`` (space-separated) ordered by descending
    ``get_prob``, joined back into a space-separated string."""
    by_score = sorted((int(tok) for tok in x.split()), key=get_prob, reverse=True)
    return " ".join([str(ad_id) for ad_id in by_score])
        
# Rank the ads of every test display by smoothed click rate and save the result.
subm = pd.read_csv("../../data/unzip_data/sample_submission.csv")
subm['ad_id'] = subm.ad_id.apply(srt)
# The file name has no placeholder, so the former .format(L) call was a no-op;
# the name is kept because the submit command below uploads subm_reg.csv.
subm.to_csv("subm_reg.csv", index=False)


In [123]:
%%bash
kaggle competitions submit -c outbrain-click-prediction -f subm_reg.csv -m "BTB0.63523evaluation https://www.kaggle.com/clustifier/btb-0-63523-evaluation/code"

Successfully submitted to Outbrain Click Prediction