In [100]:
# Import python libraries
import re
import time
import importlib

import numpy as np
import pandas as pd
import dask.dataframe as dd

from dask import delayed
from collections import defaultdict
from matplotlib import pyplot as plt
from typing import Any, Dict, List, Set, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


# Import project modules
import kdd_processor
importlib.reload(kdd_processor)
from kdd_processor import handle_data


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [101]:
# Housekeeping ==> THIS GENERALLY STAYS THE SAME
SEED, NUM_RECOMMENDATIONS = 183, 10
task, task1_locales = 'task1', ['UK', 'DE', 'JP']
NUM_CORES, NUM_THREADS = 8, 16
data_path, output_path = '../../data/', '../../outputs/'
train_path, test_path = data_path + 'train/', data_path + 'test/'
output_file = output_path + task + '_predictions.parquet'
project_files = ['products_train.csv', 'sessions_train.csv', f'sessions_test_{task}.csv']
prod_dtypes = {
    'id': 'object',
    'locale': 'object',
    'title': 'object',
    'price': 'float64',
    'brand': 'object',
    'color': 'object',
    'size': 'object',
    'model': 'object',
    'material': 'object',
    'author': 'object',
    'desc': 'object'
}
sess_dtypes = {
    'session_id': 'int32'
}

In [102]:
# PARTITION SWITCH ==> CHANGE AS NEEDED
# Specify partitions, ids and sampling fractions here
num_partitions = 5000              # num. of subsets: None will automatically discard partition_ids below
partition_ids = {                   # partition id(s): 'all', or any one of {integer = n, range = range(0, 6), list = [5, 19]} 
    'products_train': [0, 1, 2, 3, 4, 5],
    'sessions_train': [0, 1, 2, 3, 4, 5],
    'sessions_test': 'all'
}
fraction = 1                       # fraction of each partition to sample from, 1 will not sample


# LOAD DATA HERE. This generally stays the same
products_train, sessions_train, sessions_test = handle_data(
    project_files,
    [train_path, train_path, test_path],
    task,
    task1_locales,
    num_partitions,
    partition_ids,
    fraction,
    SEED,
    prod_dtypes,
    sess_dtypes
)

Processed products file found at: ../../data/train/. Loading it now...
Processed sessions file found at: ../../data/train/. Loading it now...
Processed sessions test file found at: ../../data/test/. Loading it now...


### Dask
We will be using Dask, as it is a great tool for parallel computing.

Below are some examples. CAUTION: Dask dframes take time to compute(), uncomment as required

Resources: https://www.youtube.com/@Dask-dev/videos

In [103]:
# # # CAUTION: Dask dframes take time to compute, uncomment as required
# products_train.head(2)

In [104]:
# sessions_train.head(2)

In [105]:
# sessions_test.head(2)

In [106]:
# products_train.compute().info()

In [107]:
# sessions_train.compute().info()

In [108]:
# sessions_test.compute().info()

In [109]:
# products_train.compute().shape

In [110]:
# sessions_train.compute().shape

In [111]:
# sessions_test.compute().shape

Retrieving details for a sample product

In [112]:
# # Details for a product
# view_product = 'B06XKPB3GT'
# products_train[products_train['id'] == view_product].compute()

#### Identify product involvements
We will be identifying products that are in both products_train and sessions_train. This will be USEFUL while getting recommendations later on.

In [113]:
# Function to analyze dataframes
def analyze_dataframes(
    products_train: dd.DataFrame, 
    sessions_train: dd.DataFrame
) -> Tuple[pd.Series, pd.Series, Dict[str, int], Dict[str, List[int]]]:
    
    # 1. Identify unique product ids in products_train
    prod_in_pt = products_train['id'].unique().compute()

    # 2. Identify unique product ids in sessions_train['prev_items']
    sessions_train = sessions_train.assign(prev_items=sessions_train['prev_items'].str.split(','))
    prod_in_st = sessions_train['prev_items'].explode().unique().compute()

    # 3. Identify rows where each unique id in prod_in_pt can be found in products_train
    products_train = products_train.reset_index().rename(columns={'index': 'row'})
    prod_in_pt_rows = products_train[products_train['id'].isin(prod_in_pt)][['row', 'id']].compute().set_index('id')['row'].to_dict()

    # 4. Identify rows where each unique id in prod_in_st occurs in sessions_train['prev_items'] and count occurrences
    exploded_sessions_train = sessions_train.explode('prev_items').reset_index().rename(columns={'index': 'row'})
    unique_pairs = exploded_sessions_train[['row', 'prev_items']].drop_duplicates()
    prod_in_st_occs = unique_pairs[unique_pairs['prev_items'].isin(prod_in_st)].groupby('prev_items')['row'].apply(list, meta=('row', 'f8')).compute().to_dict()
    
    return prod_in_pt, prod_in_st, prod_in_pt_rows, prod_in_st_occs

# Function to view common products
def view_common_products(
    products_train: dd.DataFrame,
    sessions_train: dd.DataFrame,
    prod_pt: pd.Series,
    prod_st: pd.Series,
    prod_pt_rows: Dict[str, int],
    prod_st_occs: Dict[str, List[int]]
) -> pd.DataFrame:

    # Calculate common product ids
    common_ids = set(prod_pt) & set(prod_st)

    # Function to calculate partition number
    def get_partition(row_index, partition_size):
        return row_index // partition_size

    app_rows = []

    # Calculate partition sizes
    partition_size_products = len(products_train) // len(partition_ids['products_train'])
    partition_size_sessions = len(sessions_train) // len(partition_ids['sessions_train'])

    for common_id in common_ids:
        count_in_sessions = len(prod_st_occs[common_id])
        row_in_products = prod_pt_rows[common_id] + 1
        app_data = prod_st_occs[common_id]
        partition_numbers = [get_partition(x, partition_size_sessions) for x in app_data]

        app_rows.append([common_id, None, row_in_products, ",".join(map(str, partition_numbers)), ",".join(map(str, [x + 1 for x in app_data])), count_in_sessions])

    # Create a DataFrame to store the results
    app_df = pd.DataFrame(app_rows, columns=['id', 'part_prod', 'row_prod', 'part_sess', 'row_sess', 'count_sess'])
    app_df['part_prod'] = app_df['row_prod'].apply(lambda x: get_partition(x, partition_size_products))
    app_df = app_df[['id', 'part_prod', 'row_prod', 'part_sess', 'row_sess', 'count_sess']]
    app_df = app_df.sort_values(by=['row_prod']).reset_index(drop=True)
    
    return app_df

# Start timer
start_time = time.time()

# Analyze dataframes
prod_pt, prod_st, prod_pt_rows, prod_st_occs = analyze_dataframes(products_train, sessions_train)

# Intermediate timer
inter_time = time.time()
print(f'Extracting product occurrences took: {inter_time - start_time} seconds')

# View common products
common_products = view_common_products(products_train, sessions_train, prod_pt, prod_st, prod_pt_rows, prod_st_occs)

# End timer
end_time = time.time()
print(f'Retrieving common products took {end_time - inter_time} seconds')
print(f'{len(common_products)} common products in products_train and sessions_train, for partitions: {partition_ids["products_train"]}, {partition_ids["sessions_train"]}')

common_products

Extracting product occurrences took: 5.819148778915405 seconds
Retrieving common products took 2.6650331020355225 seconds
48 common products in products_train and sessions_train, for partitions: [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]


Unnamed: 0,id,part_prod,row_prod,part_sess,row_sess,count_sess
0,B06XKPB3GT,0,13,0,209,1
1,B07PNR4KM2,0,38,4,3216,1
2,B08ND18XT6,0,107,3,2257,1
3,B01M10T4BK,0,216,3,2264,1
4,B005HIMQPW,0,220,1,784,1
5,B096ZVBH6B,0,248,1,1323,1
6,B09PDH7LLC,0,266,5,3574,1
7,B0752MLDLH,0,277,5,3605,1
8,B09NCBGS7R,1,330,14,13793040,2
9,B09N8JNLPJ,1,332,1,969,1


In [114]:
common_products[common_products['id'] == 'B07QPV9Z7X']

Unnamed: 0,id,part_prod,row_prod,part_sess,row_sess,count_sess


## Item-based Collaborative Filtering
Item-based collaborative filtering is a recommendation technique that uses the similarities between items to recommend similar products to users. For our case, item-based collaborative filtering can be used to recommend products based on their similarity in terms of features like product title, description, brand, or price. The intuition is that if a user engaged with a particular item, they are more likely to engage with similar items.

<img src='../../img/cf.png' width=600>

**Model 1**

This is an item-based collaborative filtering model that combines product features to create a content-based representation of each product. The model computes item-item similarity using the cosine similarity of the TF-IDF vectors of the combined features. The resulting similarity matrix is then used to create a dictionary of item similarities, which can be used to recommend similar items to a given item.

In [115]:
# def item_based_cf_model_1(
#     products_train: dd.DataFrame, 
#     sessions_train: dd.DataFrame, 
#     hyperparams: Dict[str, Any], 
#     comb_features: List[str]
# ) -> Dict[str, Dict[str, float]]:

#     # Combine features from comb_features
#     def combine_features(row: pd.Series, comb_features: List[str]) -> str:
#         combined_features = ""
#         for feature in comb_features:
#             if feature in row and not pd.isnull(row[feature]):
#                 combined_features += " " + row[feature]
#         return combined_features.strip()

#     # Compute item features
#     products_train["combined_features"] = products_train.apply(combine_features, axis=1, meta=('combined_features', 'object'), comb_features=comb_features)
#     item_features = products_train[["id", "combined_features"]].compute()

#     # Compute item-item similarity matrix using the TF-IDF vectorizer
#     vectorizer = TfidfVectorizer(
#         min_df=hyperparams.get("min_df", 2),
#         max_df=hyperparams.get("max_df", 0.8),
#         ngram_range=hyperparams.get("ngram_range", (1, 3))
#     )
#     combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
#     similarity_matrix = cosine_similarity(combined_features_vectors)

#     # Create item-item similarity dictionary
#     item_similarity_dict = defaultdict(dict)
#     for i in range(len(item_features)):
#         for j in range(len(item_features)):
#             item_id_1 = item_features.iloc[i]["id"]
#             item_id_2 = item_features.iloc[j]["id"]
#             item_similarity_dict[item_id_1][item_id_2] = similarity_matrix[i, j]

#     return item_similarity_dict

**Model 2**

Model 2 is similar to Model 1 in that it combines product features to create a content-based representation of each product and computes item-item similarity using cosine similarity of the TF-IDF vectors. However, Model 2 introduces a key difference: it sorts the recommendations by similarity score and keeps only the top N most similar items for each item. This approach makes the recommendations more focused, keeping only the most relevant similar items for each given item.

In [116]:
# def item_based_cf_model_2(
#     products_train: dd.DataFrame, 
#     sessions_train: dd.DataFrame, 
#     hyperparams: Dict[str, Any], 
#     top_n: int
# ) -> Dict[str, Dict[str, float]]:

#     # Combine features from hyperparams['feat_combine']
#     def combine_features(row: pd.Series) -> str:
#         comb_features = hyperparams['feat_combine']
#         combined_features = ""
#         for feature in comb_features:
#             if feature in row and not pd.isnull(row[feature]):
#                 combined_features += " " + row[feature]
#         return combined_features.strip()

#     # Compute item features
#     products_train["combined_features"] = products_train.apply(combine_features, axis=1, meta=('combined_features', 'object'))
#     item_features = products_train[["id", "combined_features"]].compute()

#     # Compute item-item similarity matrix using the TF-IDF vectorizer
#     vectorizer = TfidfVectorizer(
#         min_df=hyperparams.get("min_df", 2),
#         max_df=hyperparams.get("max_df", 0.8),
#         ngram_range=hyperparams.get("ngram_range", (1, 3))
#     )
#     combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
#     similarity_matrix = cosine_similarity(combined_features_vectors)

#     # Create item-item similarity dictionary
#     item_similarity_dict = defaultdict(dict)
#     for i in range(len(item_features)):
#         for j in range(len(item_features)):
#             item_id_1 = item_features.iloc[i]["id"]
#             item_id_2 = item_features.iloc[j]["id"]
#             item_similarity_dict[item_id_1][item_id_2] = similarity_matrix[i, j]

#         # Sort the recommendations by similarity score and take the top_n most similar items
#         sorted_recommendations = sorted(item_similarity_dict[item_id_1].items(), key=lambda x: x[1], reverse=True)[:top_n + 1]
#         item_similarity_dict[item_id_1] = dict(sorted_recommendations[1:])  # Exclude the first item (product itself)

#     return item_similarity_dict

**Model 3**

Model 3 builds upon Model 2 by adding flexibility in selecting the items to be considered for recommendations. Instead of using all the items, Model 3 allows the user to choose which items to include in the recommendation process: all items, only items from the products dataset, or only items from the sessions dataset. This flexibility enables the user to tailor the recommendations to specific use-cases or business requirements. The remaining steps of combining features, computing item-item similarity, and keeping the top N most similar items for each item remain the same as in Model 2.

In [117]:
def item_based_cf_model_3(
    products_train: dd.DataFrame, 
    sessions_train: dd.DataFrame, 
    hyperparams: Dict[str, Any], 
    top_n: int
) -> Dict[str, Dict[str, float]]:

    # Get unique products based on the 'incl_prod' parameter
    def get_unique_products(
        products_train: dd.DataFrame, 
        sessions_train: dd.DataFrame, 
        incl_prod: str
    ) -> Set[str]:
        sessions_train_pd = sessions_train.compute()
        if incl_prod == 'all':
            core_item_set = set(products_train['id'].unique()) | set(pd.Series(sessions_train_pd['prev_items'].str.split(',').sum()).unique())
        elif incl_prod == 'prod_only':
            core_item_set = set(products_train['id'].unique())
        elif incl_prod == 'sess_only':
            core_item_set = set(pd.Series(sessions_train_pd['prev_items'].str.split(',').sum()).unique())
        else:
            raise ValueError("Invalid value for 'incl_prod'. Choose from 'all', 'prod_only', or 'sess_only'.")
        return core_item_set

    # Combine features from hyperparams['feat_combine']
    def combine_features(row: pd.Series) -> str:
        comb_features = hyperparams['feat_combine']
        combined_features = ""
        for feature in comb_features:
            if feature in row and not pd.isnull(row[feature]):
                combined_features += " " + row[feature]
        return combined_features.strip()

    core_item_set = get_unique_products(products_train, sessions_train, hyperparams['incl_prod'])
    products_train = products_train[products_train['id'].isin(core_item_set)]

    # Compute item features
    products_train["combined_features"] = products_train.apply(combine_features, axis=1, meta=('combined_features', 'object'))
    item_features = products_train[["id", "combined_features"]].compute()

    if len(item_features) == 0:
        print("Error: item_features is empty.")
        return {}

    # Compute item-item similarity matrix using the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        min_df=hyperparams.get("min_df", 2),
        max_df=hyperparams.get("max_df", 0.8),
        ngram_range=hyperparams.get("ngram_range", (1, 3))
    )
    combined_features_vectors = vectorizer.fit_transform(item_features["combined_features"])
    similarity_matrix = cosine_similarity(combined_features_vectors)

    # Create item-item similarity dictionary
    item_similarity_dict = defaultdict(dict)
    for i in range(len(item_features)):
        for j in range(len(item_features)):
            item_id_1 = item_features.iloc[i]["id"]
            item_id_2 = item_features.iloc[j]["id"]
            item_similarity_dict[item_id_1][item_id_2] = similarity_matrix[i, j]

        # Sort the recommendations by similarity score and take the top_n most similar items
        sorted_recommendations = sorted(item_similarity_dict[item_id_1].items(), key=lambda x: x[1], reverse=True)[:top_n + 1]
        item_similarity_dict[item_id_1] = dict(sorted_recommendations[1:])  # Exclude the first item (product itself)

    return item_similarity_dict

Helper function to retrieve top n recommendations across models

In [118]:
# Function to quickly retrieve top n recommendations for a given product
def view_recs(item_similarity_dict, prod_to_rec, n):
    # Check if prod_to_rec is in the item_similarity_dict
    if prod_to_rec not in item_similarity_dict:
        print(f'{prod_to_rec} not found in the training data')
        return

    # Get the top n similar items
    top_n_similar_items = sorted(item_similarity_dict[prod_to_rec].items(), key=lambda x: x[1], reverse=True)[:n]

    # Create a DataFrame to display the results
    df_recommendations = pd.DataFrame(top_n_similar_items, columns=["related_products", "score"])

    return df_recommendations

Model Training & Results

In [119]:
# # Hyperparameters for Model 1
# feat_combine = ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc']
# ibcf1_hyperparams = {
#     'min_df': 5,                # Minimum number of documents a word must be present in to be kept
#     'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
#     'ngram_range': (1, 3)       # (min_n, max_n) the higher the n, the more computationally expensive
#     }

# # Specify product to get recommendations for
# prod_to_rec = 'B005HIMQPW'

# # Train model
# start_time = time.time()
# ibcf1_recos = item_based_cf_model_1(products_train, sessions_train, ibcf1_hyperparams, feat_combine)
# end_time = time.time()
# print(f'{end_time - start_time} seconds')

# # View recommendations
# ibcf1_recos_df = view_recs(ibcf1_recos, prod_to_rec, NUM_RECOMMENDATIONS)
# print(f'Recommendations for {prod_to_rec}:')
# ibcf1_recos_df

222.8710389137268 seconds
Recommendations for B005HIMQPW:


Unnamed: 0,related_products,score
0,B005HIMQPW,1.0
1,B00HYZ493I,0.394435
2,B00967TBS0,0.389379
3,B008IGBJNK,0.318553
4,B01L7456JW,0.295305
5,B07V5HXBHD,0.278779
6,B0084D3RK0,0.271018
7,B0B8F4485Z,0.267133
8,B073ZJQLNR,0.265018
9,B004WO6GPI,0.263464


In [120]:
# # Hyperparameters for Model 2
# ibcf2_hyperparams = {
#     'min_df': 5,                # Minimum number of documents a word must be present in to be kept
#     'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
#     'ngram_range': (1, 3),      # (min_n, max_n) the higher the n, the more computationally expensive
#     'feat_combine': ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc']      # Combine this list of features
# }

# # Specify product to get recommendations for
# prod_to_rec = 'B005HIMQPW'

# # Train model
# start_time = time.time()
# ibcf2_recos = item_based_cf_model_2(products_train, sessions_train, ibcf2_hyperparams, NUM_RECOMMENDATIONS)
# end_time = time.time()
# print(f'{end_time - start_time} seconds')

# # View recommendations
# ibcf2_recos_df = view_recs(ibcf2_recos, prod_to_rec, NUM_RECOMMENDATIONS)
# print(f'Recommendations for {prod_to_rec}:')
# ibcf2_recos_df

224.22265648841858 seconds
Recommendations for B005HIMQPW:


Unnamed: 0,related_products,score
0,B00HYZ493I,0.394435
1,B00967TBS0,0.389379
2,B008IGBJNK,0.318553
3,B01L7456JW,0.295305
4,B07V5HXBHD,0.278779
5,B0084D3RK0,0.271018
6,B0B8F4485Z,0.267133
7,B073ZJQLNR,0.265018
8,B004WO6GPI,0.263464
9,B0BHWPR939,0.259679


In [121]:
# Hyperparameters for Model 3 (variant 1: incl_prod = 'all')
ibcf3_hyperparams = {
    'min_df': 5,                # Minimum number of documents a word must be present in to be kept
    'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
    'ngram_range': (1, 3),      # (min_n, max_n) the higher the n, the more computationally expensive
    'feat_combine': ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc'],     # Combine this list of features
    'incl_prod': 'all'          # Options: 'all', 'prod_only', 'sess_only'
}

# Specify product to get recommendations for
prod_to_rec = 'B005HIMQPW'

# Train model
start_time = time.time()
ibcf3_recos = item_based_cf_model_3(products_train, sessions_train, ibcf3_hyperparams, NUM_RECOMMENDATIONS)
end_time = time.time()
print(f'{end_time - start_time} seconds')

# View recommendations
ibcf3_recos_df = view_recs(ibcf3_recos, prod_to_rec, NUM_RECOMMENDATIONS)
print(f'Recommendations for {prod_to_rec}:')
ibcf3_recos_df

232.90454077720642 seconds
Recommendations for B005HIMQPW:


Unnamed: 0,related_products,score
0,B00HYZ493I,0.394435
1,B00967TBS0,0.389379
2,B008IGBJNK,0.318553
3,B01L7456JW,0.295305
4,B07V5HXBHD,0.278779
5,B0084D3RK0,0.271018
6,B0B8F4485Z,0.267133
7,B073ZJQLNR,0.265018
8,B004WO6GPI,0.263464
9,B0BHWPR939,0.259679


In [122]:
# Hyperparams for Model 3 (variant 2: incl_prod = 'prod_only')
ibcf3_hyperparams = {
    'min_df': 5,                # Minimum number of documents a word must be present in to be kept
    'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
    'ngram_range': (1, 3),      # (min_n, max_n) the higher the n, the more computationally expensive
    'feat_combine': ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc'],     # Combine this list of features
    'incl_prod': 'prod_only'    # Switch to include interacted products: 'all', 'prod_only', 'sess_only'
}

# Specify product to get recommendations for
prod_to_rec = 'B005HIMQPW'

# Train model
start_time = time.time()
ibcf3_recos = item_based_cf_model_3(products_train, sessions_train, ibcf3_hyperparams, NUM_RECOMMENDATIONS)
end_time = time.time()
print(f'{end_time - start_time} seconds')

# View recommendations
ibcf3_recos_df = view_recs(ibcf3_recos, prod_to_rec, NUM_RECOMMENDATIONS)
print(f'Recommendations for {prod_to_rec}:')
ibcf3_recos_df

236.1403248310089 seconds
Recommendations for B005HIMQPW:


Unnamed: 0,related_products,score
0,B00HYZ493I,0.394435
1,B00967TBS0,0.389379
2,B008IGBJNK,0.318553
3,B01L7456JW,0.295305
4,B07V5HXBHD,0.278779
5,B0084D3RK0,0.271018
6,B0B8F4485Z,0.267133
7,B073ZJQLNR,0.265018
8,B004WO6GPI,0.263464
9,B0BHWPR939,0.259679


In [123]:
# Hyperparams for Model 3 (variant 3: incl_prod = 'sess_only') ====> HAS SOME ISSUES, NEED TO FIX
ibcf3_hyperparams = {
    'min_df': 5,                # Minimum number of documents a word must be present in to be kept
    'max_df': 0.8,              # Maximum % of documents a word can be present in to be kept
    'ngram_range': (1, 3),      # (min_n, max_n) the higher the n, the more computationally expensive
    'feat_combine': ['title', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc'],     # Combine this list of features
    'incl_prod': 'sess_only'    # Switch to include interacted products: 'all', 'prod_only', 'sess_only'
}

# Specify product to get recommendations for
prod_to_rec = 'B005HIMQPW'

# Train model
start_time = time.time()
ibcf3_recos = item_based_cf_model_3(products_train, sessions_train, ibcf3_hyperparams, NUM_RECOMMENDATIONS)
end_time = time.time()
print(f'{end_time - start_time} seconds')

# View recommendations
ibcf3_recos_df = view_recs(ibcf3_recos, prod_to_rec, NUM_RECOMMENDATIONS)
print(f'Recommendations for {prod_to_rec}:')
ibcf3_recos_df

3.257551908493042 seconds
Recommendations for B005HIMQPW:


Unnamed: 0,related_products,score
0,B096FNNDKD,0.738522
1,B09P8Z2QZX,0.71945
2,B07G446MMQ,0.586945
3,B09SG75YCD,0.497236
4,B09NCBGS7R,0.46957
5,B09GYWR3DH,0.406965
6,B07DFH35L8,0.364
7,B002TLLKW0,0.287767
8,B076VDWDZ2,0.163638
9,B088H32WBG,0.163279


In [126]:
# products_train[products_train['id'] == 'B005HIMQPW'].compute()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc,combined_features
219,B005HIMQPW,DE,12 x glühlampe glühbirne kerze 40w e14 klar 40...,7.99,hillfield,klar weiss,10 stück 1er pack,35 l8hy wfdn,n a,n a,durchmesser ca 32 mm länge ca 97 mm hillfield ...,12 x glühlampe glühbirne kerze 40w e14 klar 40...


In [127]:
# products_train[products_train['id'] == 'B096FNNDKD'].compute()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc,combined_features
1554,B096FNNDKD,DE,kulturen komplex probioglow 21 kulturen 30 mil...,49.95,yumiva,n a,30 stück 1er pack,n a,n a,n a,bakterienkulturen probioglow enthält 21 gensta...,kulturen komplex probioglow 21 kulturen 30 mil...


In [129]:
# products_train[products_train['id'] == 'B09P8Z2QZX'].compute()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc,combined_features
770,B09P8Z2QZX,DE,durex pleasure me kondome mit rippen und noppe...,16.43,durex,n a,40 stück 1er pack,n a,latex,n a,optimale passform dank reservoir und anatomisc...,durex pleasure me kondome mit rippen und noppe...


In [130]:
# products_train[products_train['id'] == 'B07G446MMQ'].compute()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc,combined_features
1421,B07G446MMQ,DE,nature love msm 2000mg mit vitamin c 365 labor...,19.95,nature love,n a,365 stück 1er pack,n a,n a,n a,echt naturbelassen unsere tabletten zeigen ein...,nature love msm 2000mg mit vitamin c 365 labor...
