#### Required Libraries

##### Polars

Polars is a blazingly fast DataFrame library implemented in Rust and runs on Python.
```bash
!pip install polars
```

##### Polars Distance

Polars distance is a separate package that provides additional distance functions for Polars. In our case, we will use it to calculate the Jaccard similarity between two lists.

```bash
!pip install polars-distance
```

`NOTE:` after installing Polars, you will need to restart the kernel.

In [3]:
import polars as pl
import polars_distance as pld

In [4]:
# Maximum length of strings that are displayed when you print a DataFrame
_ = pl.Config.set_fmt_str_lengths(100)

In [5]:
df_reviews = pl.read_ndjson("./Musical_Instruments_5.json")
df_reviews_meta = pl.read_ndjson("./meta_Musical_Instruments.json")

In [6]:
df_reviews = df_reviews.join(df_reviews_meta, on="asin", how="left").select(
    ["reviewerID", "asin", "title", "overall"]
)

In [7]:
# get the first 3000 unique reviewers
df_unique_reviewers_3000 = df_reviews["reviewerID"].unique().limit(3000)

In [8]:
# remove rows where reviewerID is not in first_3k_unique_reviewers
df_reviews = df_reviews.filter(pl.col("reviewerID").is_in(df_unique_reviewers_3000))

In [9]:
# Remove duplicates based on 'reviewerID' and 'asin'
df_reviews = df_reviews.unique(subset=["reviewerID", "asin"])

In [10]:
def discretize_rating(rating: int) -> str:
    """
    Discretize the rating into negative, positive, and average

    """

    polarity = "A"  # average

    if rating < 3:
        polarity = "N"  # negative
    elif rating > 3:
        polarity = "P"  # positive

    return polarity

In [11]:
df_reviews = df_reviews.with_columns(
    pl.col("overall")
    .map_elements(discretize_rating, return_dtype=pl.String)
    .alias("overall_discretized")
)

In [12]:
# Group by reviewerID and count the number of reviews, filter for more than 5 reviews and join back to the original DataFrame to filter it
df_reviews = (
    df_reviews.group_by("reviewerID")
    .agg(pl.count("reviewerID").alias("review_count"))
    .filter(pl.col("review_count") > 5)
    .join(df_reviews, on="reviewerID", how="inner")
)

In [13]:
# Create a list of products reviewed by each reviewer
df_reviewers_asin_list = df_reviews.group_by("reviewerID").agg(
    pl.col("asin").alias("asins")
)

# Create a cross join of all pairs of reviewers and filter out pairs where the reviewerID is the same
df_reviewer_asin_pairs = df_reviewers_asin_list.join(
    df_reviewers_asin_list, how="cross", suffix="_2"
).filter(pl.col("reviewerID") != pl.col("reviewerID_2"))

In [14]:
# We convert the pairs dataframe to a lazy dataframe in order calculate the Jaccard similarity simultaneously for all pairs
df_reviewer_asin_pairs = df_reviewer_asin_pairs.lazy()

In [17]:
# Calculate the Jaccard similarity between the lists of products reviewed by each pair of reviewers and collect the results
df_jaccard_sim = (
    df_reviewer_asin_pairs.with_columns(
        pld.col("asins").dist_list.jaccard_index("asins_2").alias("dist")
    )
    .filter(pl.col("reviewerID") != pl.col("reviewerID_2"))
    .collect()
)

df_jaccard_sim.head()

reviewerID,asins,reviewerID_2,asins_2,dist
str,list[str],str,list[str],f64
"""A1G0HYMR02WM2W""","[""B01C9KYUG8"", ""B0002OP7VQ"", … ""B019OO4IY6""]","""A1ZV6MJC2GKFPD""","[""B0002CZTI4"", ""B00CK2DM7Y"", … ""B00CL0I9LO""]",0.0
"""A1G0HYMR02WM2W""","[""B01C9KYUG8"", ""B0002OP7VQ"", … ""B019OO4IY6""]","""AEKOR8DBS4M1W""","[""B000LPWHSM"", ""B01E6T56EA"", … ""B00JYG7LIU""]",0.0
"""A1G0HYMR02WM2W""","[""B01C9KYUG8"", ""B0002OP7VQ"", … ""B019OO4IY6""]","""A11506QZDZPYGS""","[""B000PO30QM"", ""B00975F4WS"", … ""B00NARHNCS""]",0.0
"""A1G0HYMR02WM2W""","[""B01C9KYUG8"", ""B0002OP7VQ"", … ""B019OO4IY6""]","""A2WOF13694F0AC""","[""B00721Z05A"", ""B00SMKSHGY"", … ""B000JLU26W""]",0.0
"""A1G0HYMR02WM2W""","[""B01C9KYUG8"", ""B0002OP7VQ"", … ""B019OO4IY6""]","""A3QBYM820VKKVK""","[""B0006LOBA8"", ""B010GPFUI0"", … ""B0002E3CK4""]",0.0


In [18]:
def recommend(
    reviewer: str,
    df_jaccard_sim: pl.DataFrame,
    df_reviews: pl.DataFrame,
    jaccard_threshold: float,
    rec_num: int,
    verbose: bool = False,
):
    """
    Recommend products to a reviewer based on Jaccard similarity
    :param reviewer:
    :param df_jaccard_sim:
    :param df_reviews:
    :param jaccard_threshold:
    :param rec_num:
    :param verbose:
    :return:
    """

    # Filter the jaccard_sim dataframe to get neighbors above the similarity threshold
    neighbors = df_jaccard_sim.filter(
        (pl.col("reviewerID") == reviewer) & (pl.col("dist") >= jaccard_threshold)
    ).select(
        [pl.col("reviewerID_2").alias("neighbor"), pl.col("dist").alias("similarity")]
    )

    if neighbors.is_empty():
        if verbose:
            print("No neighbors found. Try lowering the threshold.")
        return pl.DataFrame()

    if verbose:
        print(f"Neighbors found: {neighbors.height}")

    # Join the neighbors dataframe with the reviews dataframe to get the reviews of neighbors
    neighbor_reviews = neighbors.join(
        df_reviews, left_on="neighbor", right_on="reviewerID"
    )

    # Create a scoring system based on the overall discretized ratings
    scores = {"P": 2, "A": -1, "N": -2}
    neighbor_reviews = neighbor_reviews.with_columns(
        (
            pl.col("overall_discretized").replace(scores, default=None)
            * pl.col("similarity")
        ).alias("weighted_score")
    )

    # Aggregate scores for each product
    product_scores = (
        neighbor_reviews.group_by("asin")
        .agg(pl.sum("weighted_score").alias("total_score"))
        .sort(["total_score"], descending=True)
    )

    # Filter out the products the reviewer has already reviewed
    reviewer_asin_products = (
        df_reviews.filter(pl.col("reviewerID") == reviewer)
        .select("asin")
        .to_series()
        .to_list()
    )

    recommended_products = product_scores.filter(
        ~pl.col("asin").is_in(reviewer_asin_products)
    )

    # Get the top N recommended products
    top_recommended = recommended_products.head(rec_num)

    if verbose:
        # Print recommendations
        for row in top_recommended.rows():
            product_title = (
                df_reviews.filter(pl.col("asin") == row[0])
                .select("title")
                .to_series()[0]
            )
            print(
                f"Recommended: {product_title} (Score: {row[1]})"
            )  # assuming 'total_score' is the second column

    return top_recommended

In [19]:
# reviewer who has more than 0.1 similarity and more than 5 neighbors
active_reviewer_id = (
    df_jaccard_sim.filter((pl.col("dist") > 0.1))
    .group_by("reviewerID")
    .agg(pl.count("reviewerID").alias("neighbors_count"))
    .sort("neighbors_count", descending=True)
    .filter(pl.col("neighbors_count") > 5)[0]["reviewerID"][0]
)

In [20]:
# recommend products for the active reviewer
recommend(active_reviewer_id, df_jaccard_sim, df_reviews, 0.2, 5, True)

Neighbors found: 158
Recommended: D'Addario EJ17-B25 Phosphor Bronze Acoustic Guitar Strings, Medium, 13-56, 25 Bulk Sets (Score: 9.0797782197163)
Recommended: D'Addario EJ26 Phosphor Bronze Acoustic Guitar Strings, Custom Light, 11-52 (Score: 9.0797782197163)
Recommended: DAddario Accessories Pro-Winder Guitar String Winder and Cutter  All-In-1 Restringing Tool  Includes Clippers, Bridge Pin Puller, Peg Winder  Designed to Fit Most Guitars (Score: 5.062893642305407)
Recommended: Fender 351 Shape Classic Picks (12 Pack) for electric guitar, acoustic guitar, mandolin, and bass (Score: 3.6002886002886)
Recommended: Elixir Strings Electric Guitar Strings w NANOWEB Coating, Light (.010-.046) (Score: 3.39010989010989)


asin,total_score
str,f64
"""B0007Y09VO""",9.079778
"""B0002H0A3S""",9.079778
"""B0002E1G5C""",5.062894
"""B0002E2SA4""",3.600289
"""B0002E1O3G""",3.39011
