## Imports

In [1]:
import weaviate
from weaviate.classes.config import DataType, Property, Configure

In [2]:
# Open a client connection using weaviate's local-connection helper; the
# `client` object is reused by the schema and collection cells further down.
client = weaviate.connect_to_local()
# Print the readiness flag reported by the client before doing any work.
print(client.is_ready())

True


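## Fields in the dataset
`id` (named `productId` in the cleaned CSV loaded below), `gender`, `masterCategory`, `subCategory`, `articleType`, `baseColour`, `season`, `year`, `usage`, `productDisplayName`, `description`, `averageRating`, `numberOfRatings`, `price`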

## Defining the collection / class / schema

In [3]:
# Print whether a collection named "CleanedProducts" already exists.
print(client.collections.exists("CleanedProducts"))

True


In [32]:
# Recreate the "CleanedProducts" collection: delete it when it already exists
# so that re-running this cell always starts from the schema defined here.
if client.collections.exists("CleanedProducts"):
    client.collections.delete("CleanedProducts")


client.collections.create(
    name="CleanedProducts",
    properties=[
        # Text properties; the property name is vectorized together with the value.
        Property(name="productDisplayName", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="season", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="description", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="gender", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="masterCategory", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="subCategory", data_type=DataType.TEXT, vectorize_property_name=True),
        # Declared as text rather than a number.
        Property(name="price", data_type=DataType.TEXT),
        Property(name="numberOfRatings", data_type=DataType.INT),
        Property(name="averageRating", data_type=DataType.NUMBER),
        Property(name="productId", data_type=DataType.INT),
        Property(name="articleType", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="baseColour", data_type=DataType.TEXT, vectorize_property_name=True),
        Property(name="year", data_type=DataType.INT),
        Property(name="usage", data_type=DataType.TEXT, vectorize_property_name=True),
    ],
    vectorizer_config=[
        # Three named vectors, all produced by the text2vec_transformers
        # vectorizer; they differ only in which properties feed the embedding.
        Configure.NamedVectors.text2vec_transformers(
            name="name_master_sub_art_col_use_seas_gender",
            source_properties=[
                "productDisplayName", "masterCategory", "subCategory", "articleType",
                "baseColour", "usage", "season", "gender",
            ],
        ),
        Configure.NamedVectors.text2vec_transformers(
            name="name_master_sub_col",
            source_properties=["productDisplayName", "masterCategory", "subCategory", "baseColour"],
        ),
        Configure.NamedVectors.text2vec_transformers(
            name="name_color_seas",
            source_properties=["productDisplayName", "baseColour", "season"],
        ),
    ],
    # `create` has no `rerank` keyword; reranking is enabled through
    # `reranker_config`. NOTE(review): this assumes the reranker-transformers
    # module is enabled on the Weaviate instance -- confirm, or swap in the
    # reranker that is actually deployed.
    reranker_config=Configure.Reranker.transformers(),
)

<weaviate.collections.collection.Collection at 0x7053a1db4200>

## Importing and cleaning the data

In [33]:
import pandas as pd

# Ask for the CSV location at run time and load it into a DataFrame.
csv_path = input("Enter the path to the csv file: ")
df = pd.read_csv(csv_path)

# Keep only the product fields used by the collection, in this order.
selected_columns = [
    'productId', 'gender', 'masterCategory', 'subCategory', 'articleType',
    'baseColour', 'season', 'year', 'usage', 'productDisplayName',
    'description', 'averageRating', 'numberOfRatings', 'price',
]
df = df[selected_columns]

# Report the number of missing values in each remaining column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Turn the frame into a list with one dict per row (column name -> value)
# and preview the first five records.
df_dict = df.to_dict(orient='records')
print(df_dict[:5])


productId             0
gender                0
masterCategory        0
subCategory           0
articleType           0
baseColour            0
season                0
year                  0
usage                 0
productDisplayName    0
description           0
averageRating         0
numberOfRatings       0
price                 0
dtype: int64
[{'productId': 15970, 'gender': 'Men', 'masterCategory': 'Apparel', 'subCategory': 'Topwear', 'articleType': 'Shirts', 'baseColour': 'Navy Blue', 'season': 'Fall', 'year': 2011.0, 'usage': 'Casual', 'productDisplayName': 'Turtle Check Men Navy Blue Shirt', 'description': 'Turtle Check Men Navy Blue Shirt is a Navy Blue Shirts for Men in the Fall season.', 'averageRating': 3.2, 'numberOfRatings': 909, 'price': '₹ 999'}, {'productId': 39386, 'gender': 'Men', 'masterCategory': 'Apparel', 'subCategory': 'Bottomwear', 'articleType': 'Jeans', 'baseColour': 'Blue', 'season': 'Summer', 'year': 2012.0, 'usage': 'Casual', 'productDisplayName': 'Peter En

In [34]:
# Handle to the "CleanedProducts" collection; the insert and query cells
# below go through this object.
collection = client.collections.get("CleanedProducts")

In [35]:
# Insert the records in batches (1000 per request by default); the dataset has
# about 31000 rows.
def insert_data(data, batch_size=1000, target=None):
    """Insert ``data`` into a Weaviate collection in fixed-size batches.

    Args:
        data: Sliceable sequence of objects to insert (here: one property
            dict per row).
        batch_size: Maximum number of objects passed to each ``insert_many``
            call.
        target: Collection to insert into. When omitted, the notebook-level
            ``collection`` variable is used.

    Returns:
        None. Progress and the final status are printed.

    Exceptions raised while inserting are caught and printed, so the call
    never raises. The success message is printed only when every batch
    completed without an exception and without per-object errors.
    """
    dest = collection if target is None else target
    total = len(data)
    failed = 0
    try:
        for start in range(0, total, batch_size):
            # Clamp the end index so the last (short) batch is reported correctly.
            stop = min(start + batch_size, total)
            print(f"Inserting data from {start} to {stop}")
            result = dest.data.insert_many(data[start:stop])
            # insert_many reports per-object failures on its return value
            # instead of raising, so they have to be checked explicitly.
            errors = getattr(result, "errors", None) or {}
            if errors:
                failed += len(errors)
                first = next(iter(errors.values()))
                print(
                    f"{len(errors)} object(s) failed in batch {start}-{stop}; "
                    f"first error: {getattr(first, 'message', first)}"
                )
    except Exception as e:
        print(e)
        print("Data insertion aborted.")
    else:
        if failed:
            print(f"Data inserted with {failed} failed object(s).")
        else:
            print("Data Inserted Successfully!")

# Load every record produced from the CSV (df_dict) into the collection.
insert_data(df_dict)


Inserting data from 0 to 1000
Inserting data from 1000 to 2000
Inserting data from 2000 to 3000
Inserting data from 3000 to 4000
Inserting data from 4000 to 5000
Inserting data from 5000 to 6000
Inserting data from 6000 to 7000
Inserting data from 7000 to 8000
Inserting data from 8000 to 9000
Inserting data from 9000 to 10000
Inserting data from 10000 to 11000
Inserting data from 11000 to 12000
Inserting data from 12000 to 13000
Inserting data from 13000 to 14000
Inserting data from 14000 to 15000
Inserting data from 15000 to 16000
Inserting data from 16000 to 17000
Inserting data from 17000 to 18000
Inserting data from 18000 to 19000
Inserting data from 19000 to 20000
Inserting data from 20000 to 21000
Inserting data from 21000 to 22000
Inserting data from 22000 to 23000
Inserting data from 23000 to 24000
Inserting data from 24000 to 25000
Inserting data from 25000 to 26000
Inserting data from 26000 to 27000
Inserting data from 27000 to 28000
Inserting data from 28000 to 29000
Inserti

In [36]:
from weaviate.classes.query import MetadataQuery

# Text search for "black dress" against the named vector
# "name_master_sub_art_col_use_seas_gender", capped at 10 results, with the
# distance of each hit requested as metadata.
response = collection.query.near_text(
    query = "black dress",
    limit = 10,
    target_vector = "name_master_sub_art_col_use_seas_gender",
    return_metadata=MetadataQuery(distance=True)
)