In [1]:
!pip install pyarrow
!pip install fastparquet



In [2]:
import numpy as np, pandas as pd, datetime as dt
import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
from collections import defaultdict

def iter_to_str(iterable):
    """Join the elements of ``iterable`` into one space-separated string.

    Every element is converted with ``str`` and prefixed with a single "0".
    An empty iterable yields an empty string.
    """
    prefixed = ["0" + str(element) for element in iterable]
    return " ".join(prefixed)

def apk(actual, predicted, k=12):
    """Average precision at ``k`` for a single ranked prediction list.

    Args:
        actual: collection of relevant items.
        predicted: ranked sequence of predicted items; only the first ``k`` count.
        k: cut-off rank.

    Returns:
        The sum of precision values at each rank where a new relevant item
        appears, divided by ``min(len(actual), k)``. Returns 0.0 when
        ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        if candidate not in actual:
            continue
        # A repeated prediction must not be rewarded a second time.
        if candidate in top_k[:rank - 1]:
            continue
        hit_count += 1.0
        precision_sum += hit_count / rank

    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12, return_apks=False):
    """Mean average precision at ``k`` over paired truth and prediction lists.

    Pairs whose truth entry is empty are left out of the computation entirely.

    Args:
        actual: sequence of relevant-item collections, one per row.
        predicted: sequence of ranked prediction lists, same length as ``actual``.
        k: cut-off rank handed to ``apk``.
        return_apks: when true, return the list of per-row scores instead of
            their mean.

    Returns:
        The list of per-row AP@k values, or their ``np.mean``.
    """
    assert len(actual) == len(predicted)

    per_row_scores = []
    for truth, ranking in zip(actual, predicted):
        if len(truth) > 0:
            per_row_scores.append(apk(truth, ranking, k))

    return per_row_scores if return_apks else np.mean(per_row_scores)

def blend(dt, w=None, k=12):
    """Fuse several ranked prediction strings into one ranking of at most ``k`` items.

    An item at 0-based rank ``n`` of ranking ``i`` earns ``w[i] / (n + 1)``.
    Scores are summed over all rankings and the items are returned in
    descending score order; ties keep first-seen order.

    Args:
        dt: iterable of whitespace-separated prediction strings, one per model.
            May be a list, a tuple, or a pandas Series such as the row passed
            by ``DataFrame.apply(axis=1)``.
        w: one weight per ranking. ``None`` or an empty sequence means equal
            weights of 1. Rankings with a negative weight are skipped. Only
            the first ``len(w)`` rankings are used.
        k: maximum number of items in the result.

    Returns:
        The top ``k`` items joined by single spaces.

    Raises:
        IndexError: if ``w`` has more entries than ``dt``.
    """
    # Materialise the values once so that indexing below is purely positional.
    # Indexing a label-indexed Series with an integer relies on a positional
    # fallback that pandas has deprecated.
    rankings = list(dt)
    if w is None or len(w) == 0:
        w = [1] * len(rankings)

    preds = [rankings[i].split() for i in range(len(w))]

    scores = {}
    for weight, items in zip(w, preds):
        if weight < 0:
            continue
        for position, item in enumerate(items):
            scores[item] = scores.get(item, 0) + weight / (position + 1)

    # sorted() is stable, so equal scores stay in insertion order.
    ordered = sorted(scores, key=lambda item: -scores[item])
    return ' '.join(ordered[:k])

def prune(pred, ok_set, k=12):
    """Filter a whitespace-separated prediction string against an allow-list.

    A token is kept when its integer value is contained in ``ok_set`` and the
    same token has not been kept before. The first ``k`` kept tokens are
    returned joined by single spaces, in their original order.
    """
    kept = []
    for token in pred.split():
        if int(token) not in ok_set:
            continue
        if token in kept:
            continue
        kept.append(token)
    return " ".join(kept[:k])

def validation(actual, predicted, grouping, score=0, index=-1, ignore=False, figsize=(12, 6)):
    """Print the fill rate and, unless ``ignore`` is set, score and plot MAP@12.

    Args:
        actual: per-row collections of relevant items.
        predicted: per-row ranked prediction lists.
            NOTE(review): both are indexed with a boolean mask built from
            ``grouping`` below, which plain Python lists do not support;
            presumably these are pandas Series or numpy object arrays -
            confirm with callers.
        grouping: pandas Series with one group label per row.
        score: DataFrame collecting one row of results per call. Any int
            (the default is 0) is a sentinel meaning "start a new, empty
            frame with one column per group".
        index: row label to write into ``score``; -1 means the next free
            position (``score.shape[0]``).
        ignore: when true, only the fill rate is printed and None is returned.
        figsize: size of the two-panel matplotlib figure.

    Returns:
        The updated ``score`` DataFrame, or None when ``ignore`` is true.
    """
    
    # Count how many rows have each prediction-list length.
    vc = pd.Series(predicted).apply(len).value_counts()
    # Fill rate in percent: a row with k < 12 predictions leaves (12 - k) / 12 of
    # its slots empty; rows with 12 or more count as full. The denominator is
    # len(actual), so this assumes actual and predicted have the same length.
    print("Fill Rate = ", round(1 - sum(vc[k] * (12 - k) / 12 for k in (set(range(12)) & set(vc.index))) / len(actual), 3) * 100)
    
    
    if ignore: return
    # Per-row AP values; mapk leaves out rows whose actual entry is empty.
    ap12 = mapk(actual, predicted, return_apks=True)
    map12 = round(np.mean(ap12), 6)
    if isinstance(score, int): score = pd.DataFrame({g:[] for g in sorted(grouping.unique().tolist())})
    if index == -1 : index = score.shape[0]
    score.loc[index, "All"] = map12
    plt.figure(figsize=figsize)
    # Left panel: histogram of the per-row AP values.
    # NOTE(review): log_scale=(0, 10) presumably keeps x linear and puts the
    # count axis on a base-10 log scale - confirm against the seaborn docs.
    plt.subplot(1, 2, 1); sns.histplot(data=ap12, log_scale=(0, 10), bins=20); plt.title(f"MAP@12 : {map12}")
    for g in grouping.unique():
        # MAP@12 restricted to the rows of group g; map12 is reused as a scratch name here.
        map12 = round(mapk(actual[grouping == g], predicted[grouping == g]), 6)
        score.loc[index, g] = map12
    # Right panel: horizontal bars for this run's row, groups in reversed order followed by "All".
    plt.subplot(1, 2, 2); score[[g for g in grouping.unique()[::-1]] + ['All']].loc[index].plot.barh(); plt.title(f"MAP@12 of Groups")
    # Same fill-rate computation as above, this time stored in the score row.
    vc = pd.Series(predicted).apply(len).value_counts()
    score.loc[index, "Fill"] = round(1 - sum(vc[k] * (12 - k) / 12 for k in (set(range(12)) & set(vc.index))) / len(actual), 3) * 100
    # `display` is not imported in the visible code; presumably the IPython
    # built-in available inside notebooks - confirm before using elsewhere.
    display(score)
    return score

In [15]:
# Base frame for the ensemble: later cells attach each model's predictions as an
# extra column and overwrite 'prediction' with the blended result.
sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [16]:
# Attach this model's predictions by customer_id rather than by row position, so
# a file whose rows are ordered differently from the sample submission cannot
# pair a customer with someone else's predictions. Customers missing from the
# file get an empty prediction string.
# Assumes the file has the customer_id / prediction columns of a submission CSV.
sub0 = pd.read_csv("../input/hm-ensemble/submission_uucf0245.csv")
sub['sub0'] = sub['customer_id'].map(
    sub0.drop_duplicates('customer_id').set_index('customer_id')['prediction']
).fillna("")
del sub0

In [17]:
# Attach this model's predictions by customer_id rather than by row position, so
# a file whose rows are ordered differently from the sample submission cannot
# pair a customer with someone else's predictions. Customers missing from the
# file get an empty prediction string.
# Assumes the file has the customer_id / prediction columns of a submission CSV.
sub1 = pd.read_csv("../input/hm-ensemble/submission-blend0249.csv")
sub['sub1'] = sub['customer_id'].map(
    sub1.drop_duplicates('customer_id').set_index('customer_id')['prediction']
).fillna("")
del sub1

In [18]:
# Attach this model's predictions by customer_id rather than by row position, so
# a file whose rows are ordered differently from the sample submission cannot
# pair a customer with someone else's predictions. Customers missing from the
# file get an empty prediction string.
# Assumes the file has the customer_id / prediction columns of a submission CSV.
sub2 = pd.read_csv("../input/hm-ensemble/submission0243.csv")
sub['sub2'] = sub['customer_id'].map(
    sub2.drop_duplicates('customer_id').set_index('customer_id')['prediction']
).fillna("")
del sub2

In [23]:
# Disabled: sub3 is only used by the commented-out five-model variant of the blend.
# sub3 = pd.read_csv("../input/hm-ensemble/submission-mtmt0903-0241.csv")
# sub['sub3'] = sub3['prediction'].fillna("")
# del sub3

In [24]:
# Disabled: sub4 is only used by the commented-out five-model variant of the blend.
# sub4 = pd.read_csv("../input/hm-ensemble/submission_aruaru0239.csv")
# sub['sub4'] = sub4['prediction'].fillna("")
# del sub4

In [25]:
# Preview the first rows to check the attached per-model prediction columns.
sub.head(3)

Unnamed: 0,customer_id,prediction,sub0,sub1,sub2,sub3,sub4
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0751471001 0918890002 09...,0568601043 0568601006 0918890002 0919273002 07...,0568601043 0568601006 0751471001 0924243002 05...,0568601043 0568601006 0918890002 0907188001 09...,0568601043 0568601006 0924243001 0751471001 04...,0568601043 0568601006 0751471001 0656719005 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0739590027 0448509014 0673677002 07...,0826211002 0673677002 0448509014 0658298001 07...,0739590027 0826211002 0811835004 0732842008 07...,0826211002 0448509014 0781613006 0706016001 07...,0826211002 0924243001 0924243002 0739590027 08...,0826211002 0924243001 0739590027 0800436010 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321008 0762846027 0924243001 08...,0794321007 0794321008 0762846027 0751471001 08...,0794321007 0852643003 0924243001 0852643001 08...,0794321007 0794321008 0762846027 0751471001 08...,0794321007 0852643001 0852643003 0924243001 09...,0794321007 0794321011 0866731001 0852643001 08...


In [29]:
# Three-model blend. The author's note records 0.0255 for this setup
# (presumably the leaderboard MAP@12 - confirm).
targets = ['sub0', 'sub1', 'sub2']
weights = [1,1.1,0.8]
assert len(targets) == len(weights), "need exactly one weight per blended column"

# Five-model variant kept for reference; the author's note ("0.0255↓") suggests
# it scored lower than the three-model blend.
# targets = ['sub0', 'sub1', 'sub2', 'sub3', 'sub4' ]
# weights = [1,1.1,0.8,0.7,0.7]

# Hand blend() one plain tuple per customer instead of using DataFrame.apply(axis=1):
# apply builds a label-indexed Series for every row, which is slow on a frame this
# size and makes the integer indexing inside blend() depend on pandas' deprecated
# positional fallback.
sub['prediction'] = [
    blend(row, w=weights, k=12)
    for row in zip(*(sub[column] for column in targets))
]

In [27]:
# Preview the first rows again to compare the blended 'prediction' with the inputs.
sub.head(3)

Unnamed: 0,customer_id,prediction,sub0,sub1,sub2,sub3,sub4
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0751471001 0918890002 09...,0568601043 0568601006 0918890002 0919273002 07...,0568601043 0568601006 0751471001 0924243002 05...,0568601043 0568601006 0918890002 0907188001 09...,0568601043 0568601006 0924243001 0751471001 04...,0568601043 0568601006 0751471001 0656719005 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0739590027 0448509014 0924243001 07...,0826211002 0673677002 0448509014 0658298001 07...,0739590027 0826211002 0811835004 0732842008 07...,0826211002 0448509014 0781613006 0706016001 07...,0826211002 0924243001 0924243002 0739590027 08...,0826211002 0924243001 0739590027 0800436010 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321008 0852643003 0924243001 08...,0794321007 0794321008 0762846027 0751471001 08...,0794321007 0852643003 0924243001 0852643001 08...,0794321007 0794321008 0762846027 0751471001 08...,0794321007 0852643001 0852643003 0924243001 09...,0794321007 0794321011 0866731001 0852643001 08...


In [28]:
# Write only the two submission columns; index=False keeps the row index out of the CSV.
sub[['customer_id', 'prediction']].to_csv('submission_ensamble.csv', index=False)