# LAB 2 : User similarity, Item Similarity 

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/biodatlab/xlab-recommendation/blob/notebook/02_similarity.ipynb)

* Dataset: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations
* Objective: Use similarity measures (Jaccard similarity and cosine similarity) and a distance measure (Euclidean distance) to find similar users and similar items
* Libraries : Pandas, Scipy, Numpy
* Use 'transactions_train.csv' to find user similarity and use 'articles.csv' to find item similarity

## User Similarity

### Read the dataset for finding user similarity

In [None]:
! pip install pandas
! pip install scipy
! pip install numpy
! pip install sklearn
! pip install gdown

In [None]:
# import essential library
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os.path as op
import os
import gdown

In [None]:
# Fetch the shared Google Drive folder that holds the lab's CSV files.
# NOTE(review): with no `output` argument gdown may place the files in a
# sub-folder named after the Drive folder -- confirm the CSVs end up where the
# later read_csv cells look for them (the current working directory).
url = "https://drive.google.com/drive/folders/1FfPaLQB-qnbNCPZakmRQs4kDHdRfJ7If?usp=drive_link"
gdown.download_folder(url=url, use_cookies=False)

In [None]:
# Load the transaction log used for the user-similarity part of the lab.
# The file is looked up in the current working directory.
file_name = "transactions.csv"
transactions_path = os.path.join(os.getcwd(), file_name)
transactions_df = pd.read_csv(filepath_or_buffer=transactions_path)

In [None]:
# Preview the first five rows of the transaction log.
transactions_df.head(n=5)

### Select data for finding user similarity

In [None]:
# Keep only the customer/article pair of each transaction, ordered by customer
# id and re-indexed from zero.
selected_user_df = transactions_df.loc[:, ["customer_id", "article_id"]].sort_values(
    by="customer_id", ignore_index=True
)

In [None]:
# Preview the first five rows of the selected columns.
selected_user_df.head(n=5)

In [None]:
# Report how many distinct customers appear in selected_user_df ...
print(f"unique customer_id : {len(selected_user_df['customer_id'].unique())}")
# ... and how many distinct articles.
print(f"unique article_id : {len(selected_user_df['article_id'].unique())}")

### Select customers with more than 500 transactions

In [None]:
# Count transactions per customer; repeated purchases of the same article
# still count at this point because duplicates are dropped only afterwards.
customer_counts = selected_user_df["customer_id"].value_counts()

# Ids of customers with more than 500 transactions.
frequent_customer_ids = customer_counts.loc[customer_counts > 500].index

# Keep those customers' rows, then collapse repeated (customer, article) pairs.
is_frequent_customer = selected_user_df["customer_id"].isin(frequent_customer_ids)
filtered_df = selected_user_df[is_frequent_customer].drop_duplicates(
    subset=["customer_id", "article_id"], ignore_index=True
)

In [None]:
# Report how many distinct customers remain after filtering ...
print(f"unique customer_id : {len(filtered_df['customer_id'].unique())}")
# ... and how many distinct articles they bought.
print(f"unique article_id : {len(filtered_df['article_id'].unique())}")

In [None]:
# Convert both id columns to the pandas "category" dtype so that each id gets
# an integer code (read back through `.cat.codes`).
for id_column in ("customer_id", "article_id"):
    filtered_df[id_column] = filtered_df[id_column].astype("category")

# Flag every remaining (customer, article) pair as one interaction.
filtered_df["interact"] = 1

In [None]:
# Preview the first five rows of the filtered, encoded frame.
filtered_df.head(n=5)

### Create user-item matrix by using sparse matrix

In [None]:
# Build the sparse user-item matrix in COO-style (data, (row, col)) form:
# rows are customer category codes, columns are article category codes, and
# the stored value is the "interact" flag.
customer_codes = filtered_df["customer_id"].cat.codes
article_codes = filtered_df["article_id"].cat.codes
X_user = csr_matrix((filtered_df["interact"], (customer_codes, article_codes)))

### Find nearest neighbors of the user

In [None]:
# Number of nearest neighbours to return per query.
k_user = 2
# Cosine distance is used here; the metric can be swapped as needed.
neighbors_model_user = NearestNeighbors(metric="cosine", n_neighbors=k_user)

# Fit the model on the sparse user-item matrix.
neighbors_model_user.fit(X_user)

In [None]:
# Find the nearest neighbours of one customer, identified by its category code
# (which is also its row position in X_user).
row_index_user = 0  # change to the category code of the customer to query

# Slice the row directly out of the sparse matrix. This gives a 1-row sparse
# matrix, which kneighbors accepts, and avoids materialising the entire
# user-item matrix in dense form just to read a single row.
query_point_user = X_user[row_index_user]

# The query row is part of the fitted data, so it is expected to appear among
# its own neighbours.
distances_user, indices_user = neighbors_model_user.kneighbors(query_point_user)

print(distances_user, indices_user)

In [None]:
# Translate the neighbour row positions (category codes) back into the
# original customer ids.
customer_categories = filtered_df["customer_id"].cat.categories
nearest_customer_id = pd.Categorical.from_codes(
    codes=indices_user[0], categories=customer_categories
)
nearest_customer_id

## Item Similarity

### Read the dataset for finding item similarity

In [None]:
# Load the article metadata used for the item-similarity part of the lab.
# The file is looked up in the current working directory.
file_name = "articles.csv"
article_path = os.path.join(os.getcwd(), file_name)
article_df = pd.read_csv(filepath_or_buffer=article_path)

In [None]:
# Preview the first five rows of the article metadata.
article_df.head(n=5)

### Select data for finding item similarity

In [None]:
# Article attribute columns used as features for item similarity.
selected_item_column = [
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
]
# Restrict the article table to those feature columns.
selected_item_df = article_df.loc[:, selected_item_column]

In [None]:
# Preview the first five rows of the selected item features.
selected_item_df.head(n=5)

### Apply one-hot encoding

In [None]:
# One-hot encode every selected attribute column, so each distinct code
# becomes its own indicator column.
dummies_df = pd.get_dummies(data=selected_item_df, columns=selected_item_column)
dummies_df.head(n=5)

### Find nearest neighbor of the item

In [None]:
# Number of nearest neighbours to return per query.
k_article = 5
# Cosine distance is used here; the metric can be swapped as needed.
neighbor_model_article = NearestNeighbors(metric="cosine", n_neighbors=k_article)

# Fit the model on the one-hot encoded article features.
neighbor_model_article.fit(dummies_df)

In [None]:
# Find the nearest neighbours of one article, identified by its row position
# in dummies_df (and therefore in article_df, which has the same row order).
row_index_article = 100  # change to the row position of the article to query

# Select the row with a list indexer so the query stays a one-row DataFrame.
# The model was fitted on a DataFrame, so keeping the column names lets
# scikit-learn match features by name instead of warning about a bare array.
query_point_article = dummies_df.iloc[[row_index_article]]
distances_article, indices_article = neighbor_model_article.kneighbors(
    query_point_article
)

# Map the neighbour row positions back to article ids and show the result.
nearest_article_id = article_df.iloc[indices_article[0]]["article_id"]
print(f"Distances: {distances_article}")
print(f"Nearest Neighbors: {nearest_article_id}")