In [1]:
import src.constants as constants
import os
from typing import List

# Ingestion

## Set the global service context

In [2]:
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding

# Models used for the global service context: chat LLM at temperature 0 plus the OpenAI embedder.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0, max_tokens=512)
embed_model = OpenAIEmbedding()  # default is text-embedding-ada-002

# Bundle both models and register the bundle as the global service context.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)
set_global_service_context(service_context)

## Load documents

In [5]:
from llama_index import download_loader

# Obtain the `MarkdownReader` loader class through `download_loader` and instantiate it.
MarkdownReader = download_loader("MarkdownReader")
loader = MarkdownReader()

# Load every configured file and tag each resulting document with its file name.
documents: List = []
for doc_path in constants.DOCUMENT_PATHS:
    source = os.path.basename(doc_path)
    docs = loader.load_data(file=doc_path)
    for doc in docs:
        doc.metadata = {"source": source}
        # "source" is bookkeeping only: not wanted in the synthesis prompt for now ...
        doc.excluded_llm_metadata_keys = ["source"]
        # ... and not wanted in the text that gets embedded either.
        doc.excluded_embed_metadata_keys = ["source"]
    documents += docs

In [6]:
documents

[Document(id_='ab0a9626-b34d-4067-96c9-05c84f166a61', embedding=None, metadata={'source': 'how-to-ecommerce-analytics.md'}, excluded_embed_metadata_keys=['source'], excluded_llm_metadata_keys=['source'], relationships={}, hash='b4aff25cf1a7c6479a024e65c02f1c5457412c69ab5e69fbf5c0433fd8a39617', text='\n\nEcommerce Analytics: How To Analyze Data For Your Business (2023)\nOf all the available tactics to grow a business, ecommerce analytics is taking the lead.\xa0\n\nWhile most companies praise ecommerce data as key, the truth is that\xa080% of marketing executives\xa0struggle to make data-driven decisions, despite all the information at their fingertips.\xa0\n\nThis guide will get you comfortable with the language of analytics and help you start tracking the right\xa0ecommerce metrics. That way, you can understand customers’ actions, better serve them, and increase sales.\xa0\n\nWhat is ecommerce analytics?\n----------------------------\n\nEcommerce analytics is the process of discovering

In [7]:
len(documents)

96

In [9]:
documents[0:3]

[Document(id_='ab0a9626-b34d-4067-96c9-05c84f166a61', embedding=None, metadata={'source': 'how-to-ecommerce-analytics.md'}, excluded_embed_metadata_keys=['source'], excluded_llm_metadata_keys=['source'], relationships={}, hash='b4aff25cf1a7c6479a024e65c02f1c5457412c69ab5e69fbf5c0433fd8a39617', text='\n\nEcommerce Analytics: How To Analyze Data For Your Business (2023)\nOf all the available tactics to grow a business, ecommerce analytics is taking the lead.\xa0\n\nWhile most companies praise ecommerce data as key, the truth is that\xa080% of marketing executives\xa0struggle to make data-driven decisions, despite all the information at their fingertips.\xa0\n\nThis guide will get you comfortable with the language of analytics and help you start tracking the right\xa0ecommerce metrics. That way, you can understand customers’ actions, better serve them, and increase sales.\xa0\n\nWhat is ecommerce analytics?\n----------------------------\n\nEcommerce analytics is the process of discovering

### Custom Loader

`MarkdownReader` already applies a chunking strategy based on the Markdown structure: it splits each file at its headers.

In [28]:
"""Custom Markdown Reader.

A parser for md files.

"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast

from llama_index.readers.base import BaseReader
from llama_index.schema import Document

class MarkdownReader(BaseReader):
    """Markdown parser.

    Extract text from markdown files.
    The file is split at its headers and one ``Document`` is returned per
    section. Lines inside fenced code blocks are never treated as headers, so
    a code block always stays within a single section.

    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            remove_hyperlinks: Replace ``[text](url)`` links with ``text``
                before splitting.
            remove_images: Drop image embeds before splitting.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Convert markdown text to a list of ``(header, text)`` tuples.

        Each tuple holds a header (without its leading ``#`` marker) and the
        text found under it. Text that appears before the first header, or in
        a file without any header, is returned with ``None`` as its header.
        Sections whose text is empty or whitespace-only are skipped.

        Args:
            markdown_text: Raw markdown content.

        Returns:
            The sections in document order, with ``<...>`` tags removed from
            the section text.
        """
        markdown_tups: List[Tuple[Optional[str], str]] = []

        current_header: Optional[str] = None
        current_text = ""
        inside_code_block = False

        for line in markdown_text.split("\n"):
            if line.startswith("```"):  # ! I DO NOT want to split code blocks
                inside_code_block = not inside_code_block

            if not inside_code_block and re.match(r"^#+\s", line):
                # A new section starts: flush the previous one. This also
                # flushes text preceding the first header (header is None),
                # which used to be silently discarded.
                if current_text.strip() != "":
                    markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"

        if current_text.strip() != "":
            markdown_tups.append((current_header, current_text))

        cleaned: List[Tuple[Optional[str], str]] = []
        for header, text in markdown_tups:
            if header is not None:
                # Strip only the leading marker so that '#' characters inside
                # the header text itself (e.g. "C#") survive.
                header = re.sub(r"^#+\s+", "", header).strip()
            cleaned.append((header, re.sub(r"<.*?>", "", text)))
        return cleaned

    def remove_images(self, content: str) -> str:
        """Remove image embeds from markdown content.

        Handles both wiki-style embeds (``![[file]]``) and standard markdown
        images (``![alt](url)``).
        """
        # Non-greedy so that two embeds on one line do not swallow the text between them.
        content = re.sub(r"!\[\[.*?\]\]", "", content)
        content = re.sub(r"!\[[^\]\n]*\]\([^)\n]*\)", "", content)
        return content

    def remove_hyperlinks(self, content: str) -> str:
        """Replace markdown links ``[text](url)`` with just ``text``.

        Image embeds (``![alt](url)``) are left untouched; they are the
        responsibility of ``remove_images``.
        """
        # The link text may not contain ']' so a match cannot start at an
        # unrelated earlier '[' on the same line.
        pattern = r"(?<!!)\[([^\]\n]*)\]\(([^)\n]*)\)"
        return re.sub(pattern, r"\1", content)

    def parse_tups(
        self, filepath: Path, content: Optional[str] = None, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into tuples.

        If content is provided, use that instead of reading from file.

        Args:
            filepath: Markdown file to read when ``content`` is None.
            content: Markdown text to parse instead of reading ``filepath``.
            errors: Decoding error handler passed to ``open`` when the file
                is read as UTF-8.

        Returns:
            ``(header, text)`` tuples as produced by ``markdown_to_tups``.
        """
        if content is None:
            with open(filepath, "r", encoding="utf-8", errors=errors) as f:
                content = f.read()
        # Images go first: stripping links first would turn ``![alt](url)``
        # into ``!alt`` before the image pass could see it.
        if self._remove_images:
            content = self.remove_images(content)
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        return self.markdown_to_tups(content)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        content: Optional[str] = None,
    ) -> List[Document]:
        """Parse file into a list of Documents, one per markdown section.

        If content is provided, use that instead of reading from file.

        Args:
            file: Markdown file to read when ``content`` is None.
            extra_info: Metadata attached to every returned Document.
            content: Markdown text to parse instead of reading ``file``.

        Returns:
            One Document per section. When a section has a header, the header
            is kept as the first line of the Document text.
        """
        tups = self.parse_tups(file, content=content)
        return [
            Document(
                text=f"\n\n{header}\n{value}" if header else value,
                # Give each Document its own dict so they never share metadata state.
                metadata=dict(extra_info) if extra_info else {},
            )
            for header, value in tups
        ]

In [27]:
# Try the reader on the first configured document; the last expression displays the resulting Documents.
reader = MarkdownReader()
reader.load_data(constants.DOCUMENT_PATHS[0])

[Document(id_='9191e427-18d5-491a-86a2-dec2d6b34173', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='b4aff25cf1a7c6479a024e65c02f1c5457412c69ab5e69fbf5c0433fd8a39617', text='\n\nEcommerce Analytics: How To Analyze Data For Your Business (2023)\nOf all the available tactics to grow a business, ecommerce analytics is taking the lead.\xa0\n\nWhile most companies praise ecommerce data as key, the truth is that\xa080% of marketing executives\xa0struggle to make data-driven decisions, despite all the information at their fingertips.\xa0\n\nThis guide will get you comfortable with the language of analytics and help you start tracking the right\xa0ecommerce metrics. That way, you can understand customers’ actions, better serve them, and increase sales.\xa0\n\nWhat is ecommerce analytics?\n----------------------------\n\nEcommerce analytics is the process of discovering, interpreting, and communicating data patterns related t

Open question: could we combine the Markdown-aware splitting and the chunk-size limit inside the reader?

## Parse documents

A `Document` can be divided into `Nodes` or chunks. We can use the `SimpleNodeParser` class, but we can also create nodes manually.

In [12]:
from llama_index.text_splitter import TokenTextSplitter

# Splitter used by the node parser: chunk_size 256 with an overlap of 20
# (presumably both counted in tokens), splitting on single spaces.
text_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=20, separator=" ")

### Metadata Extraction

We can use an LLM to extract metadata for each node.

In [13]:
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    TitleExtractor,
    SummaryExtractor,
    KeywordExtractor
)


# Extractors that add metadata to each node. In the node content printed further down
# this metadata appears as `document_title`, `section_summary` and `excerpt_keywords`.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5),  # NOTE(review): presumably the title is derived from 5 nodes - confirm
        SummaryExtractor(),
        KeywordExtractor(keywords=5),  # presumably 5 keywords per node - confirm
    ]
)

```<metadata 1 - title>
<metadata 2 - summary>

<text chunk>```

->

embedding

--

query -> emb -> find similar emb ->


----


doc
    - tags: analytics, ecommerce
    - ..

```<metadata 1 - title>
<metadata 2 - summary>

<text chunk>```

emb
--

query -> extraction -> query + tags -> semantic search + metadata filtering
                                    -> document filtering -> semantic search on those docs/chunks

eval data for retrieval

.md -> create questions/query -> relevant chunks -> eval dataset

-> hit rate: doc
-> OSS: metrics based on cosine similarity

### Parser

In [14]:
from llama_index.node_parser import SimpleNodeParser

# Combine the text splitter and the metadata extractors into a single node parser.
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
    include_prev_next_rel=True,
    include_metadata=True,
)

# Slow step: in the recorded run, extracting summaries for the 152 nodes took about 3 minutes.
nodes = node_parser.get_nodes_from_documents(documents=documents)

  from .autonotebook import tqdm as notebook_tqdm
Extracting summaries: 100%|██████████| 152/152 [02:59<00:00,  1.18s/it]


In [17]:
len(nodes)

152

In [18]:
print(nodes[0].get_content(metadata_mode="all"))

[Excerpt from document]
source: how-to-ecommerce-analytics.md
document_title: Mastering Ecommerce Analytics: A Comprehensive Guide to Data-Driven Decision Making for Business Growth and Success
section_summary: This section introduces the topic of ecommerce analytics and its importance in making data-driven decisions for business growth. It highlights the struggle that many marketing executives face in utilizing data effectively. The section defines ecommerce analytics as the process of discovering, interpreting, and communicating data patterns related to online business. It emphasizes the role of analytics in measuring customer behavior, performance trends, and ROI. The section also mentions the use of analytics by marketers to show ROI for campaigns and make better decisions to increase sales and reduce costs. It briefly mentions the role of ecommerce analytics in centralizing and managing data.
excerpt_keywords: ecommerce analytics, data-driven decisions, customer behavior, performa

In [19]:
print(nodes[100].get_content(metadata_mode="all"))

[Excerpt from document]
source: how-to-customer-segmentation.md
document_title: Mastering Ecommerce Analytics: A Comprehensive Guide to Data-Driven Decision Making for Business Growth and Success
section_summary: The key topics of this section are the four types of customer segmentation: demographic segmentation, psychographic segmentation, behavioral segmentation, and geographic segmentation. The key entities mentioned are the characteristics used for demographic segmentation (age, gender, income, occupation, education level, marital status, and location), the factors considered for psychographic segmentation (lifestyle, interests, values, and attitudes), the factors used for behavioral segmentation (purchase history, usage patterns, brand loyalty, and response to marketing campaigns), and the factors considered for geographic segmentation (location, such as country, region, city, or neighborhood).
excerpt_keywords: customer segmentation, demographic segmentation, psychographic segmen

## [Storage](https://gpt-index.readthedocs.io/en/latest/core_modules/data_modules/storage/root.html)

- Document stores: where ingested documents (i.e., Node objects) are stored,
- Index stores: where index metadata are stored,
- Vector stores: where embedding vectors are stored.
- Graph stores: where knowledge graphs are stored (i.e. for KnowledgeGraphIndex).

## Vectorstore

We are going to use Weaviate as our vector store to store embeddings and metadata.

In [15]:
import os
import weaviate
from llama_index.vector_stores import WeaviateVectorStore

# The key is read from the environment (None when unset) and sent to Weaviate as a header.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Client for the Weaviate instance at localhost:8080.
WEAVIATE_CLIENT = weaviate.Client(
    url="http://localhost:8080",
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY},
)

# Vector store using that client and the "ShopifyHowToGuides" index name.
VECTOR_STORE = WeaviateVectorStore(
    index_name="ShopifyHowToGuides",
    weaviate_client=WEAVIATE_CLIENT,
)

In [16]:
type(VECTOR_STORE)

llama_index.vector_stores.weaviate.WeaviateVectorStore

In [3]:
print(VECTOR_STORE)

stores_text=True is_embedding_query=True index_name='ShopifyHowToGuides' url=None text_key='text' auth_config={} client_kwargs={}


## Construct the index

In [20]:
from llama_index.storage.storage_context import StorageContext
from llama_index import VectorStoreIndex

# Build the index from the parsed nodes, using the Weaviate vector store as storage.
# NOTE(review): re-running this cell presumably inserts the nodes again - the duplicated
# sources in the query output further down hint at that; confirm before re-running.
storage_context = StorageContext.from_defaults(vector_store=VECTOR_STORE)
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

_____

# Querying

In [21]:
# load index
index = VectorStoreIndex.from_vector_store(VECTOR_STORE)

## High-Level API

In [22]:
query_engine = index.as_query_engine()

/Users/bernardo/Library/Caches/pypoetry/virtualenvs/shopify-csr-chatbot-62SOqhE9-py3.11/lib/python3.11/site-packages/pydantic/_internal/_config.py:219: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/


In [23]:
response = query_engine.query("Can you use analytics in ecommerce?")

In [24]:
response

Response(response="Yes, analytics can be used in ecommerce. Analytics platforms allow businesses to uncover trends and patterns in their data, helping them understand how their business is performing in the present and future. By using marketing analytics, businesses can condense data and make it visible in a short amount of time, gaining insights such as the number of website visitors, visitor actions, popular pages during busy shopping seasons, and the devices used by visitors. Ecommerce analytics tools, such as Shopify's reporting and analytics, can be used to improve a store's performance by providing pre-built dashboards and reports to spot trends, capitalize on opportunities, and make more informed decisions.", source_nodes=[NodeWithScore(node=TextNode(id_='9b1d08aa-12fc-48e3-8251-069a505be71f', embedding=[-0.017925793, 0.008153698, 0.024234796, -0.04087136, -0.007886252, 0.019434467, -0.023137577, -0.019557904, -0.010382422, -0.04238003, 0.01830982, 0.02359018, -0.028363077, -0.

In [25]:
print(response)

Yes, analytics can be used in ecommerce. Analytics platforms allow businesses to uncover trends and patterns in their data, helping them understand how their business is performing in the present and future. By using marketing analytics, businesses can condense data and make it visible in a short amount of time, gaining insights such as the number of website visitors, visitor actions, popular pages during busy shopping seasons, and the devices used by visitors. Ecommerce analytics tools, such as Shopify's reporting and analytics, can be used to improve a store's performance by providing pre-built dashboards and reports to spot trends, capitalize on opportunities, and make more informed decisions.


In [26]:
print(response.get_formatted_sources())

> Source (Doc id: 9b1d08aa-12fc-48e3-8251-069a505be71f): Uncover trends

Modern ecommerce data analytics platforms treat your data as an interconnected sy...

> Source (Doc id: e9c03c2d-b392-4e5b-8b7f-9068ef6f2bea): data to focus solely on a successful winter jacket sales week, ignoring the overall decline throu...


> Note: metadata gets concatenated with the chunk.

[chat engine](https://gpt-index.readthedocs.io/en/latest/core_modules/query_modules/chat_engines/usage_pattern.html)

- Can you use analytics in ecommerce?
- How do you use analytics to boost business growth?

In [27]:
# Stateful analogue of the query engine.
chat_engine = index.as_chat_engine(verbose=True, chat_mode="condense_question")

In [29]:
# Interactive chat: answer one question per turn, print the answer and its source nodes,
# then ask whether to stop. Any non-empty reply to the second prompt ends the loop.
end: str = ""
while True:
    response = chat_engine.chat(input("User:"))
    print(response)
    print("Sources:")
    print(response.source_nodes)
    print("*" * 30)
    end = input("Input something to stop")
    if end:
        break

Querying with: How can analytics be used to boost business growth in ecommerce?
Analytics can be used to boost business growth in ecommerce by providing valuable insights into customer behavior, performance trends, and return on investment (ROI). By analyzing data patterns related to online business, marketers can make data-driven decisions to increase sales, reduce costs, and make improvements to their business. Analytics can help marketers show the ROI for their campaigns, identify trends and opportunities, and make more informed decisions to drive business growth. Additionally, ecommerce analytics can centralize and manage data, allowing businesses to track the right metrics and better understand their customers' actions, ultimately leading to better service and increased sales.
Sources:
[NodeWithScore(node=TextNode(id_='ad4e8d3c-555e-4c9c-85ab-01f592488784', embedding=[-0.013433273, -0.010810208, 0.013790963, -0.043691255, -0.020348627, 0.021196486, -0.019646492, -0.012134988, -0.0

In [66]:
chat_engine.reset()

## Lower-level API: Customization

In [30]:
from llama_index import VectorStoreIndex, get_response_synthesizer

from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor, KeywordNodePostprocessor

In [33]:
# Retriever over the index with similarity_top_k=5, using the "default" query mode.
# Available modes: default, hybrid, sparse, text search.
retriever = VectorIndexRetriever(
    index=index, vector_store_query_mode="default", similarity_top_k=5
)

In [44]:
retriever.retrieve("Ecommerce and analytics")[4].node.relationships

{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5b15416d-dc93-4ebe-aae5-a41171cf8736', node_type=None, metadata={'source': 'how-to-ecommerce-analytics.md'}, hash='a5d3522c2e842e3457a5e1113942915fb06a1f892c443df38990fcb39fa1b84e'),
 <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='b9135f52-1827-4b65-9c86-f69e54b374e9', node_type=None, metadata={'source': 'how-to-ecommerce-analytics.md'}, hash='ba7bb4275d17782db78d812b54d230d605cb086f7007e7dbf4df2b5a75f15bf7')}

In [46]:
response_synthesizer = get_response_synthesizer()

In [47]:
# Query engine wired from the retriever, the response synthesizer,
# and a similarity post-processor with a cutoff of 0.8.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
    response_synthesizer=response_synthesizer,
)

In [48]:
response = query_engine.query("Can you use analytics in ecommerce?")

In [49]:
print(response)

Yes, analytics can be used in ecommerce. Modern ecommerce data analytics platforms allow businesses to uncover trends and patterns in their data, helping them understand how their business is performing in the present and future. Analytics can condense data and make it visible in a short amount of time, providing insights such as the number of website visitors, visitor actions, popular pages during busy shopping seasons, and the devices used by visitors. By leveraging ecommerce analytics, businesses can make more informed decisions, spot trends, capitalize on opportunities, and improve their bottom line.


In [50]:
print(response.get_formatted_sources())

> Source (Doc id: 9b1d08aa-12fc-48e3-8251-069a505be71f): Uncover trends

Modern ecommerce data analytics platforms treat your data as an interconnected sy...

> Source (Doc id: e9c03c2d-b392-4e5b-8b7f-9068ef6f2bea): data to focus solely on a successful winter jacket sales week, ignoring the overall decline throu...

> Source (Doc id: 21816626-9b5a-4736-81c5-c958c6a51270): Uncover trends

Modern ecommerce data analytics platforms treat your data as an interconnected sy...

> Source (Doc id: d935fddd-d8e5-4103-b368-bbfc3fdd481a): data to focus solely on a successful winter jacket sales week, ignoring the overall decline throu...

> Source (Doc id: 8f102a43-c117-43e9-b770-60fdc14bce1e): Get Ecommerce Analytics for Beginners delivered right to your inbox.

Almost there: please enter ...


In [51]:
response.source_nodes[0].score, response.source_nodes[1].score

(0.87331933, 0.87177128)