In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting the data 
import seaborn as sns # Advanced data plotting on top of matplotlib
import os


from PIL import Image
import pickle

In [2]:
customer_test_df = pd.read_csv("../input/fork-of-h-m-recsys-eda-and-split/customer_test.csv")
customer_test_df_open_transactions = pd.read_csv("../input/fork-of-h-m-recsys-eda-and-split/customer_test_df_open_transactions.csv")
customer_test_df_hiden_transactions = pd.read_csv("../input/fork-of-h-m-recsys-eda-and-split/customer_test_df_hiden_transactions.csv")


In [3]:
# text preds
with open ('../input/h-m-recsys-nlp-description-similarity/text_preds', 'rb') as fp:
    nlp_preds = pickle.load(fp)

# image preds
with open ('../input/h-m-recsys-using-image-similarity/img_preds', 'rb') as fp:
    img_preds = pickle.load(fp)
    
# lightfm preds
with open ('../input/h-m-recsys-lightfm/lightfm_preds', 'rb') as fp:
    lightfm_preds = pickle.load(fp)
    
# lightfm + item features preds
with open ('../input/fork-of-h-m-recsys-lightfm-with-clusterization/lightfm_plus_preds', 'rb') as fp:
    lightfm_if_preds = pickle.load(fp)

# lightfm + item features + user featues preds
with open ('../input/fork-of-h-m-recsys-lightfm-with-clusterization/lightfm_plus_plus_preds', 'rb') as fp:
    lightfm_if_uf_preds = pickle.load(fp)


In [4]:
# hiden preds

ground_truth_list = []

# go through all customers from test list
for k, customer in enumerate(customer_test_df.customer_id):
    ground_truth_list.append(np.array(
        customer_test_df_hiden_transactions[customer_test_df_hiden_transactions.customer_id == customer].article_id))
    
print(len(ground_truth_list))

500


In [5]:
transactions_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [6]:
# 100 most popular items
most_popular_100 = transactions_df.groupby("article_id")["customer_id"].count().sort_values(ascending=False).head(100).index.values
most_popular_100[:12]

array([706016001, 706016002, 372860001, 610776002, 759871002, 464297007,
       372860002, 610776001, 399223001, 706016003, 720125001, 156231001])

In [7]:
def print_img(open_ids):
    img_list_open = []
    for id in open_ids:    
        image_path = "../input/h-and-m-personalized-fashion-recommendations/images/0{}/0{}.jpg".format(str(id)[:2], id)
        try:
            img = Image.open(image_path)
        except FileNotFoundError:
            continue            
        img_list_open.append(img)
    
    plt.figure(figsize=(2*len(img_list_open),5))
    for j in range(len(img_list_open)):
        plt.subplot(1, len(img_list_open), j+1)
        plt.axis('off')
        plt.imshow(img_list_open[j])

In [8]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

def jaccard(actual, predicted):
    return np.mean([jaccard_similarity(a,b) for a,b in zip(actual, predicted)])

def dice_similarity(list1, list2):
    intersection = 2 * len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) 
    return float(intersection) / union

def dice(actual, predicted):
    return np.mean([dice_similarity(a,b) for a,b in zip(actual, predicted)])

def intersept(list1, list2):
    return np.sum([len(list(set(a).intersection(b))) for a,b in zip(list1, list2)])

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    #if not actual:
    #    return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])



def metrics(ground_truth, mp_preds):
    mp_interception = intersept(ground_truth_list, mp_preds)
    mp_jaccard_similarity = jaccard(ground_truth_list, mp_preds)
    mp_dice = dice(ground_truth_list, mp_preds)    
    mp_maok12 = mapk(ground_truth_list, mp_preds, 12)

    print("Interceptions: {} from {} ({:.4}%)".format(mp_interception, len(ground_truth) * 12, mp_interception*100/(len(ground_truth) * 12)) )
    print("Jaccard: {:.4}".format(mp_jaccard_similarity))
    print("Dice: {:.4}".format(mp_dice))
    print("Mean Average Precision @ 12: {:.4}".format(mp_maok12))
    
    return [mp_interception, mp_interception*100/(len(ground_truth) * 12), mp_jaccard_similarity, mp_dice, mp_maok12]


**100 most popular items results**

In [9]:
# 100 most popular items results
mp_preds = [most_popular_100 for i in range(500)]

print("100 most popular items results\n")
mp100_metrics = metrics(ground_truth_list, mp_preds)

100 most popular items results

Interceptions: 117 from 6000 (1.95%)
Jaccard: 0.00216
Dice: 0.004265
Mean Average Precision @ 12: 0.0004845


In [10]:
mp_preds_12 = [most_popular_100[:12] for i in range(500)]

print("12 most popular items results\n")
mp12_metrics = metrics(ground_truth_list, mp_preds_12)

12 most popular items results

Interceptions: 18 from 6000 (0.3%)
Jaccard: 0.001771
Dice: 0.003336
Mean Average Precision @ 12: 0.0004845


**Description Similarity method**

In [11]:
# 100 most popular items results

nlp_metrics =  metrics(ground_truth_list, nlp_preds)

Interceptions: 542 from 6000 (9.033%)
Jaccard: 0.01378
Dice: 0.02662
Mean Average Precision @ 12: 0.006861


In [12]:
nlp_preds_12 = [item[:12] for item in nlp_preds]

nlp12_metrics = metrics(ground_truth_list, nlp_preds_12)

Interceptions: 136 from 6000 (2.267%)
Jaccard: 0.01411
Dice: 0.02628
Mean Average Precision @ 12: 0.006861


**Lightfm**

In [13]:
print("Light FM without user/item features")
lfm_metrics = metrics(ground_truth_list, lightfm_preds)

print("\nLight FM with item features")
lfm_if_metrics = metrics(ground_truth_list, lightfm_if_preds)

print("\nLight FM with item and user features")
lfm_if_uf_metrics = metrics(ground_truth_list, lightfm_if_uf_preds)


Light FM without user/item features
Interceptions: 234 from 6000 (3.9%)
Jaccard: 0.01463
Dice: 0.02763
Mean Average Precision @ 12: 0.01053

Light FM with item features
Interceptions: 209 from 6000 (3.483%)
Jaccard: 0.01306
Dice: 0.02464
Mean Average Precision @ 12: 0.008217

Light FM with item and user features
Interceptions: 176 from 6000 (2.933%)
Jaccard: 0.01092
Dice: 0.02072
Mean Average Precision @ 12: 0.006615


In [14]:
lightfm_preds12 = [pred[:12] for pred in lightfm_preds]
lightfm_if_preds12 = [pred[:12] for pred in lightfm_if_preds]
lightfm_if_uf_preds12 = [pred[:12] for pred in lightfm_if_uf_preds]

print("Light FM top12 without user/item features")
lfm12_metrics = metrics(ground_truth_list, lightfm_preds12)

print("\nLight FM top12 with item features")
lfm12_if_metrics = metrics(ground_truth_list, lightfm_if_preds12)

print("\nLight FM top12 with item and user features")
lfm12_if_uf_metrics = metrics(ground_truth_list, lightfm_if_uf_preds12)

Light FM top12 without user/item features
Interceptions: 163 from 6000 (2.717%)
Jaccard: 0.01635
Dice: 0.03009
Mean Average Precision @ 12: 0.01053

Light FM top12 with item features
Interceptions: 125 from 6000 (2.083%)
Jaccard: 0.01243
Dice: 0.02284
Mean Average Precision @ 12: 0.008217

Light FM top12 with item and user features
Interceptions: 106 from 6000 (1.767%)
Jaccard: 0.01032
Dice: 0.01936
Mean Average Precision @ 12: 0.006615


**Image similarity method**

In [15]:
print("Image similarity method results\n")
imgall_metrics = metrics(ground_truth_list, img_preds)

Image similarity method results

Interceptions: 39 from 6000 (0.65%)
Jaccard: 0.00103
Dice: 0.002003
Mean Average Precision @ 12: 0.0006458


In [16]:
img_preds_first12 = [pred[:12] for pred in img_preds]

print("Image similarity method results\n")
img12_metrics = metrics(ground_truth_list, img_preds_first12)

Image similarity method results

Interceptions: 8 from 6000 (0.1333%)
Jaccard: 0.0009293
Dice: 0.001707
Mean Average Precision @ 12: 0.0006458


**All results**

In [17]:
all_preds = [np.concatenate(item) for item in zip(mp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds)]
#all_preds_top12 = [pd.value_counts(item).head(12).index.values for item in all_preds]
#pd.value_counts(all_preds[3]).head(12) 

In [18]:
all_preds_metrics = metrics(ground_truth_list, all_preds)
#print()
#all_preds_top12_metrics = metrics(ground_truth_list, all_preds_top12)

Interceptions: 960 from 6000 (16.0%)
Jaccard: 0.006441
Dice: 0.01274
Mean Average Precision @ 12: 0.0004845


In [19]:
# Метрики по общму количесвту кандидатов

pd.DataFrame([mp100_metrics, 
              nlp_metrics, 
              lfm_metrics,
              lfm_if_metrics, 
              lfm_if_uf_metrics,
              imgall_metrics, 
              all_preds_metrics,
              ],
             index = ["Top 100 most popular",
                      "Description similarity 12 for every item", 
                      "LightFM 24", 
                      "LightFM+ItemF 24", 
                      "LightFM+ItemF+UserF 24", 
                      "Image similarity 12 for every item", 
                      "All predictions"
                      ],
            columns = ["True Predicted", "Recall %", "Jaccard", "Dice", "MAP@12"])

Unnamed: 0,True Predicted,Recall %,Jaccard,Dice,MAP@12
Top 100 most popular,117,1.95,0.00216,0.004265,0.000484
Description similarity 12 for every item,542,9.033333,0.013783,0.026618,0.006861
LightFM 24,234,3.9,0.014634,0.027632,0.010527
LightFM+ItemF 24,209,3.483333,0.013059,0.024642,0.008217
LightFM+ItemF+UserF 24,176,2.933333,0.010919,0.020723,0.006615
Image similarity 12 for every item,39,0.65,0.00103,0.002003,0.000646
All predictions,960,16.0,0.006441,0.01274,0.000484


In [20]:
all12_preds = [np.concatenate(item) for item in zip(mp_preds_12, nlp_preds_12, img_preds_first12, lightfm_preds12, lightfm_if_preds12, lightfm_if_uf_preds12)]
all12_metrics = metrics(ground_truth_list, all12_preds)

Interceptions: 412 from 6000 (6.867%)
Jaccard: 0.01162
Dice: 0.02255
Mean Average Precision @ 12: 0.0004845


In [21]:
# Метрики для алгоритмов stand alone

pd.DataFrame([mp12_metrics,
              nlp12_metrics,
              lfm12_metrics,
              lfm12_if_metrics,
              lfm12_if_uf_metrics,
              img12_metrics,
              #all12_metrics
              ],
             index = ["Top 12 most popular",
                      "Description similarity first 12",
                      "LightFM 12",
                      "LightFM+ItemF 12",
                      "LightFM+ItemF+UserF 12",
                      "Image similarity first 12",
                      #"All 12 preds"
                      ],
            columns = ["True Predicted", "Recall %", "Jaccard", "Dice", "MAP@12"])

Unnamed: 0,True Predicted,Recall %,Jaccard,Dice,MAP@12
Top 12 most popular,18,0.3,0.001771,0.003336,0.000484
Description similarity first 12,136,2.266667,0.014106,0.026276,0.006861
LightFM 12,163,2.716667,0.016354,0.03009,0.010527
LightFM+ItemF 12,125,2.083333,0.012428,0.022835,0.008217
LightFM+ItemF+UserF 12,106,1.766667,0.010321,0.019365,0.006615
Image similarity first 12,8,0.133333,0.000929,0.001707,0.000646


In [22]:
# Подбор наилучшего соотношения весов для источников кандидатов
blends = []

blends.append([mp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds])
blends.append([mp_preds, nlp_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds])
blends.append([mp_preds, nlp_preds,  nlp_preds, lightfm_preds, lightfm_preds, lightfm_if_uf_preds])
blends.append([ mp_preds, nlp_preds,  nlp_preds, lightfm_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds])
blends.append([ mp_preds,  mp_preds, nlp_preds,  nlp_preds, lightfm_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds])
blends.append([ mp_preds,  mp_preds, nlp_preds, img_preds,  nlp_preds, lightfm_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds])
blends.append([ mp_preds,  mp_preds, nlp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_preds, lightfm_if_preds, lightfm_if_preds])
blends.append([ mp_preds,  nlp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_preds, lightfm_if_preds,])
blends.append([ nlp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_preds])
blends.append([ mp_preds,  nlp_preds, nlp_preds, lightfm_preds, lightfm_if_preds, lightfm_if_preds])
blends.append([ mp_preds,  nlp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_preds])


all_metrics = []
for blend in blends:
    print()
    all_preds = [np.concatenate(item) for item in zip(*blend)]
    all_preds_top12 = [pd.value_counts(item).head(12).index.values for item in all_preds]
    cur_metrics = metrics(ground_truth_list, all_preds_top12)
    all_metrics.append(cur_metrics)
    
#pd.value_counts(all_preds[3]).head(12) 


Interceptions: 166 from 6000 (2.767%)
Jaccard: 0.01666
Dice: 0.03067
Mean Average Precision @ 12: 0.009706

Interceptions: 157 from 6000 (2.617%)
Jaccard: 0.01542
Dice: 0.02867
Mean Average Precision @ 12: 0.008613

Interceptions: 170 from 6000 (2.833%)
Jaccard: 0.01694
Dice: 0.03144
Mean Average Precision @ 12: 0.01132

Interceptions: 174 from 6000 (2.9%)
Jaccard: 0.01731
Dice: 0.03215
Mean Average Precision @ 12: 0.01119

Interceptions: 176 from 6000 (2.933%)
Jaccard: 0.01761
Dice: 0.03261
Mean Average Precision @ 12: 0.01051

Interceptions: 179 from 6000 (2.983%)
Jaccard: 0.01793
Dice: 0.03324
Mean Average Precision @ 12: 0.0105

Interceptions: 186 from 6000 (3.1%)
Jaccard: 0.01897
Dice: 0.03491
Mean Average Precision @ 12: 0.01067

Interceptions: 189 from 6000 (3.15%)
Jaccard: 0.01894
Dice: 0.03499
Mean Average Precision @ 12: 0.01244

Interceptions: 160 from 6000 (2.667%)
Jaccard: 0.01615
Dice: 0.02967
Mean Average Precision @ 12: 0.01351

Interceptions: 188 from 6000 (3.133%)
Ja

In [23]:
# Тест метода с сохранением ранжрования всех претендентов 

all_pred_list = []
for user in zip(*[nlp_preds_12, lightfm_preds12]):
    #print(user)
    i = 0
    user_preds = []
    while(True):
        cur_pred = [pred[i] for pred in user]
        #print(cur_pred) 
        #print(len(cur_pred), len(np.unique(cur_pred))) 
        user_preds += list(np.unique(cur_pred))
        #print(user_preds) 
        user_preds = list(np.unique(user_preds))
        if len(user_preds) >= 12:
            break
        i+=1
        
    #print(user_preds)
            
    all_pred_list.append(user_preds)
    
cur_metrics = metrics(ground_truth_list, all_pred_list)

Interceptions: 174 from 6000 (2.9%)
Jaccard: 0.01754
Dice: 0.03263
Mean Average Precision @ 12: 0.006925


In [24]:
# Данные для итоговой таблицы

# Смесь в равных пропроциях
first_mix = [mp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_uf_preds]
first_mix_preds = [np.concatenate(item) for item in zip(*first_mix)]
first_mix_predss_top12 = [pd.value_counts(item).head(12).index.values for item in first_mix_preds]
first_mix_predss_top12_metrics = metrics(ground_truth_list, first_mix_predss_top12)

# Лучший бленд 
# Most Popular + 2 x NLP Similarity + CV Similarity + LIGHTFM + 2 x LIGHTFM+ItemFeatures 
best_mix = [mp_preds,  nlp_preds, nlp_preds, img_preds, lightfm_preds, lightfm_if_preds, lightfm_if_preds]
best_mix_preds = [np.concatenate(item) for item in zip(*best_mix)]
best_mix_predss_top12 = [pd.value_counts(item).head(12).index.values for item in best_mix_preds]
best_mix_predss_top12_metrics = metrics(ground_truth_list, best_mix_predss_top12)

Interceptions: 166 from 6000 (2.767%)
Jaccard: 0.01666
Dice: 0.03067
Mean Average Precision @ 12: 0.009706
Interceptions: 199 from 6000 (3.317%)
Jaccard: 0.02005
Dice: 0.03697
Mean Average Precision @ 12: 0.01328


In [25]:
pd.DataFrame([mp12_metrics,
              nlp12_metrics,
              lfm12_metrics,
              lfm12_if_metrics,
              lfm12_if_uf_metrics,
              img12_metrics,
              first_mix_predss_top12_metrics,
              best_mix_predss_top12_metrics
              ],
             index = ["Top 12 most popular",
                      "Description similarity first 12",
                      "LightFM 12",
                      "LightFM+ItemF 12",
                      "LightFM+ItemF+UserF 12",
                      "Image similarity first 12",
                      "Микс с равынми весами",
                      "Лучшее соотношение весов"
                      ],
            columns = ["True Predicted", "Recall %", "Jaccard", "Dice", "MAP@12"])

Unnamed: 0,True Predicted,Recall %,Jaccard,Dice,MAP@12
Top 12 most popular,18,0.3,0.001771,0.003336,0.000484
Description similarity first 12,136,2.266667,0.014106,0.026276,0.006861
LightFM 12,163,2.716667,0.016354,0.03009,0.010527
LightFM+ItemF 12,125,2.083333,0.012428,0.022835,0.008217
LightFM+ItemF+UserF 12,106,1.766667,0.010321,0.019365,0.006615
Image similarity first 12,8,0.133333,0.000929,0.001707,0.000646
Микс с равынми весами,166,2.766667,0.01666,0.030668,0.009706
Лучшее соотношение весов,199,3.316667,0.02005,0.036969,0.013281
