In [4]:
import os
import numpy as np
import polars as pl

# Both corpus files are tab-separated despite their .csv extension
queries = pl.read_csv("../Corpus/query.csv", separator="\t")
product_info = pl.read_csv("../Corpus/product.csv", separator="\t")

queries.head()

query_id,query,query_class
i64,str,str
0,"""salon chair""","""Massage Chairs"""
1,"""smart coffee table""","""Coffee & Cocktail Tables"""
2,"""dinosaur""","""Kids Wall Décor"""
3,"""turquoise pillows""","""Accent Pillows"""
4,"""chair and a half recliner""","""Recliners"""


In [5]:
# Preview the first five product rows
product_info.head(5)

product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
i64,str,str,str,str,str,f64,f64,f64
0,"""solid wood platform bed""","""Beds""","""Furniture / Bedroom Furniture …","""good , deep sleep can be quite…","""overallwidth-sidetoside:64.7|d…",15.0,4.5,15.0
1,"""all-clad 7 qt . slow cooker""","""Slow Cookers""","""Kitchen & Tabletop / Small Kit…","""create delicious slow-cooked m…","""capacityquarts:7|producttype :…",100.0,2.0,98.0
2,"""all-clad electrics 6.5 qt . sl…","""Slow Cookers""","""Kitchen & Tabletop / Small Kit…","""prepare home-cooked meals on a…","""features : keep warm setting|c…",208.0,3.0,181.0
3,"""all-clad all professional tool…","""Slicers, Peelers And Graters""","""Browse By Brand / All-Clad""","""this original stainless tool w…","""overallwidth-sidetoside:3.5|wa…",69.0,4.5,42.0
4,"""baldwin prestige alcott passag…","""Door Knobs""","""Home Improvement / Doors & Doo…","""the hardware has a rich herita…","""compatibledoorthickness:1.375 …",70.0,5.0,42.0


In [7]:
def concatenate_info(df: pl.DataFrame, query: bool) -> pl.DataFrame:
  """
  Builds one text column that joins the descriptive columns of every row.

  The joined text has the form "[CLS] <value> | <value> | ... [SEP]".
  Each source column is cast to a string and nulls are replaced by empty
  strings before joining.

  Params:
    df (pl.DataFrame): DataFrame holding either queries or products
    query (bool): If True, join "query" and "query_class" into "query_info",
      else join "product_name", "product_class", "product_description" and
      "product_features" into "prod_info"

  Returns:
    pl.DataFrame: df with the joined text stored in the column named above
  """
  if query:
    target = "query_info"
    fields = ["query", "query_class"]
  else:
    target = "prod_info"
    fields = [
      "product_name",
      "product_class",
      "product_description",
      "product_features",
    ]

  # Interleave the column expressions with the " | " separator
  pieces = [pl.lit("[CLS] ")]
  for position, field in enumerate(fields):
    if position > 0:
      pieces.append(pl.lit(" | "))
    pieces.append(pl.col(field).cast(pl.Utf8).fill_null(""))
  pieces.append(pl.lit(" [SEP]"))

  return df.with_columns(pl.concat_str(pieces).alias(target))

# Attach the joined text column to each table ("prod_info" / "query_info")
product_info = concatenate_info(product_info, query=False)
queries = concatenate_info(queries, query=True)

In [9]:
# Preview the first five queries, now including the query_info column
queries.head(5)

query_id,query,query_class,query_info
i64,str,str,str
0,"""salon chair""","""Massage Chairs""","""[CLS] salon chair | Massage Ch…"
1,"""smart coffee table""","""Coffee & Cocktail Tables""","""[CLS] smart coffee table | Cof…"
2,"""dinosaur""","""Kids Wall Décor""","""[CLS] dinosaur | Kids Wall Déc…"
3,"""turquoise pillows""","""Accent Pillows""","""[CLS] turquoise pillows | Acce…"
4,"""chair and a half recliner""","""Recliners""","""[CLS] chair and a half recline…"


In [10]:
def create_documents(df: pl.DataFrame, filepath: str, query: bool) -> None:
  """
  Writes one text column of DF to a .txt file.

  Every row is written as its own entry followed by a blank line, so
  consecutive entries are separated by an empty line in the output file.
  Null cells are written as empty entries.

  Params:
    df (pl.DataFrame): DataFrame containing the content of a corpus
    filepath (str): Path to the .txt file (overwritten if it already exists)
    query (bool): If True, write the "query_info" column, else the "prod_info" column

  Returns:
    None

  Raises:
    ValueError: If the column to write is not present in df
  """

  col_to_write = "query_info" if query else "prod_info"

  if col_to_write not in df.columns:
    raise ValueError(f"Column {col_to_write} not found in DataFrame")

  with open(filepath, "w", encoding="utf-8") as f:
    for text in df[col_to_write]:
      # A null cell iterates as None; concatenating it with a string would
      # raise TypeError halfway through and leave a truncated file behind
      entry = "" if text is None else text
      f.write(entry + "\n\n")

# Export both corpora as text files next to the source CSVs
create_documents(product_info, "../Corpus/prod_corpus.txt", query=False)
create_documents(queries, "../Corpus/query_corpus.txt", query=True)