In [None]:
# Install the LlamaIndex packages into the kernel's environment: the core
# library, the OpenAI LLM and embedding integrations, the ColBERT reranker
# postprocessor and the web readers.
# NOTE(review): versions are not pinned, so a later re-run may resolve to
# different releases than the ones shown in the captured output.
%pip install llama-index-core
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-postprocessor-colbert-rerank
%pip install llama-index-readers-web

Collecting llama-index-core
  Downloading llama_index_core-0.10.28-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llamaindex-py-client<0.2.0,>=0.1.16 (from llama-index-core)
  Downloading llamaindex_py_client-0.1.17-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m

In [None]:
import os

In [None]:
import re
import pandas as pd

def extract_reviews_to_df(file_path, encoding="utf-8"):
    """Parse a plain-text review dump into a pandas DataFrame.

    The file is expected to contain blocks of the form::

        Review title: <title>
        Rating: <number>
        What they liked about the product: <text>
        What they disliked about the product: <text>

    with blocks separated by a blank line.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file (default ``"utf-8"``).

    Returns:
        A DataFrame with the columns ``title``, ``rating`` (float), ``liked``
        and ``disliked``. The columns are present even when no review matched.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()

    # One review block. Two deliberate details:
    #  * the rating accepts "5" as well as "4.5"; with a strict `\d\.\d` an
    #    integer rating would not match and, because of DOTALL, that review
    #    would be swallowed into the title of a later one;
    #  * the block ends at a blank line OR at end of file (look-ahead), so the
    #    last review is kept even when the file has no trailing blank line.
    pattern = (
        r"Review title: (.*?)\n"
        r"Rating: (\d+(?:\.\d+)?)\n"
        r"What they liked about the product: (.*?)\n"
        r"What they disliked about the product: (.*?)(?=\n\n|\Z)"
    )

    # DOTALL lets the free-text fields span several lines.
    matches = re.findall(pattern, content, re.DOTALL)

    reviews = [
        {
            'title': title.strip(),
            'rating': float(rating),
            'liked': liked.strip(),
            'disliked': disliked.strip(),
        }
        for title, rating, liked, disliked in matches
    ]

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(reviews, columns=['title', 'rating', 'liked', 'disliked'])

# Example usage: parse the review dump that sits next to the notebook and keep
# the result in `reviews_df`, which the following cells build on.
file_path = './extracted_reviews.txt'
reviews_df = extract_reviews_to_df(file_path)
# Plain-text dump of the frame (pandas truncates long frames when printing).
print(reviews_df)


                                                 title  rating  \
0                G2 helps market and customer validity     4.5   
1    Recommended for gaining visibility in B2B Mark...     4.5   
2                                          Credibility     4.5   
3    Advocately have been instrumental to our revie...     5.0   
4                                         Game Changer     5.0   
..                                                 ...     ...   
757                             Actionable Intent Data     4.5   
758                 Best Analysis model to build trust     4.5   
759      G2 is an essential tool for Procurement teams     5.0   
760  This company gets a 10 our of 10 for customer ...     5.0   
761          We use g2 to drive traffic to our website     4.0   

                                                 liked  \
0    G2 has helped our customers publicly validate ...   
1    It showcases a wide range of vendors with unbi...   
2    Great way to build our brand

In [None]:
reviews_df.head()

Unnamed: 0,title,rating,liked,disliked
0,G2 helps market and customer validity,4.5,G2 has helped our customers publicly validate ...,Not a lot to dislike. Be great as G2 becomes ...
1,Recommended for gaining visibility in B2B Mark...,4.5,It showcases a wide range of vendors with unbi...,There aren't enough people open to reviewing p...
2,Credibility,4.5,"Great way to build our brand in a new space, b...","So far so good, we are just starting to use ad..."
3,Advocately have been instrumental to our revie...,5.0,The end to end service and process has been th...,There is little that I dislike. Understanding ...
4,Game Changer,5.0,Advocately facilitates a continuous flow of gr...,"This is not a true dislike, but the success of..."


In [None]:
reviews_df.tail()

Unnamed: 0,title,rating,liked,disliked
757,Actionable Intent Data,4.5,I like how easy it has been to implement G2 Ma...,The only thing I'd like to update is the creat...
758,Best Analysis model to build trust,4.5,The most helpful about G2 Marketing Solution i...,When you are positioned next to American giant...
759,G2 is an essential tool for Procurement teams,5.0,G2 Marketing Solutions is an essential tool fo...,Nothing comes to mind. I really appreciate th...
760,This company gets a 10 our of 10 for customer ...,5.0,"Everyone I speak to, thinking to Blake, Stepha...","I wouldn't mention this, except being asked to..."
761,We use g2 to drive traffic to our website,4.0,We could convert a few clients from g2 listing,"So many features are paid, and also the softwa..."


In [None]:
# Persist the parsed reviews. With pandas' default `index=True` the row index
# is written as an extra, unnamed first column of reviews.csv.
reviews_df.to_csv("reviews.csv")

In [None]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Build one LlamaIndex Document per review row. The document text joins the
# 'liked' and 'disliked' fields; title and rating are attached as metadata but
# listed in `excluded_llm_metadata_keys`.
llama_documents = [
    Document(
        text=f"Liked: {review['liked']}\nDisliked: {review['disliked']}",
        metadata={"title": review['title'], "rating": review['rating']},
        excluded_llm_metadata_keys=["title", "rating"],
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    for _, review in reviews_df.iterrows()
]


In [None]:
from llama_index.core import PromptTemplate

# Custom context prompt. It carries two placeholders:
#   {node_context} - the retrieved review chunks
#   {query_str}    - the user's question
# ResponseWithChatHistory formats this template with both `node_context` and
# `query_str`. Without a {node_context} placeholder, str.format() silently
# drops the retrieved chunks and the LLM answers without seeing any review.
MY_CUSTOM_PROMPT_TMPL = ("""
    You are an assistant for a company called G2 and you are given access to a lot of customer reviews.
    These reviews help both buyers and software vendors in decision-making. One interesting aspect of the review data that we want to solve is to list the exact feature sets the customers are looking for.
    A few examples include application performance, the overall user experience, missing functionality, bugs, etc.
    ---------------------\n
    {node_context}\n
    ---------------------\n
    Given this information, please answer the question: {query_str}\n"""
)

# Create a new instance of PromptTemplate with the custom template
my_custom_prompt = PromptTemplate(MY_CUSTOM_PROMPT_TMPL)

# Example usage of the custom prompt template with a stand-in context string
query_str = "Give me upto 10 possible feature sets that would benefit my company based on all the reviews."

# Format the prompt; both template variables have to be supplied
prompt = my_custom_prompt.format(
    node_context="<retrieved review chunks are inserted here>",
    query_str=query_str,
)

print(prompt)


    You are an assistant for a company called G2 and you are given access to a lot of customer reviews.
    These reviews help both buyers and software vendors in decision-making. One interesting aspect of the review data that we want to solve is to list the exact feature sets the customers are looking for.
    A few examples include application performance, the overall user experience, missing functionality, bugs, etc.
    ---------------------

    Given this information, please answer the question: Give me upto 10 possible feature sets that would benefit my company based on all the reviews.



In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding

# Build a vector store index over the review documents, embedding them with
# the OpenAI embedding model (default settings).
# NOTE(review): presumably needs OpenAI credentials in the environment - confirm.
index = VectorStoreIndex.from_documents(
    llama_documents,
    embed_model=OpenAIEmbedding(),
)

In [None]:
from llama_index.core.query_pipeline import (
    QueryPipeline,
    InputComponent,
    ArgPackComponent,
)
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.postprocessor.colbert_rerank import ColbertRerank

# First, we create an input component to capture the user query
input_component = InputComponent()

# Next, we use the LLM to rewrite a user query.
# The template takes two variables: {chat_history_str} (the conversation so
# far, as text) and {query_str} (the latest user message).
rewrite = (
    "Please write a query to a semantic search engine using the current conversation.\n"
    "\n"
    "\n"
    "{chat_history_str}"
    "\n"
    "\n"
    "Latest message: {query_str}\n"
    'Query:"""\n'
)
rewrite_template = PromptTemplate(rewrite)
# GPT-4 chat model; this same `llm` object is used for the query rewrite and
# is later handed to the response component.
llm = OpenAI(
    model="gpt-4",
    temperature=0.2,
)

# we will retrieve two times, so we need to pack the retrieved nodes into a single list
argpack_component = ArgPackComponent()

# using that, we will retrieve the 6 most similar nodes per query
retriever = index.as_retriever(similarity_top_k=6)

# then postprocess/rerank with Colbert, keeping the top 3 nodes
reranker = ColbertRerank(top_n=3)

In [None]:
# then lastly, we need to create a response using the nodes AND chat history
from typing import Any, Dict, List, Optional
from llama_index.core.bridge.pydantic import Field
from llama_index.core.llms import ChatMessage
from llama_index.core.query_pipeline import CustomQueryComponent
from llama_index.core.schema import NodeWithScore

# DEFAULT_CONTEXT_PROMPT = (
#     "Here is some context that may be relevant:\n"
#     "-----\n"
#     "{node_context}\n"
#     "-----\n"
#     "Please write a response to the following question, using the above context:\n"
#     "{query_str}\n"
# )


class ResponseWithChatHistory(CustomQueryComponent):
    """Query-pipeline component that answers a query with a chat LLM.

    Inputs: ``chat_history`` (list of ChatMessage), ``nodes`` (retrieved
    nodes) and ``query_str``. Output: ``response``, the value returned by the
    LLM's ``chat`` / ``achat`` call.
    """

    llm: OpenAI = Field(..., description="OpenAI LLM")
    system_prompt: Optional[str] = Field(
        default=None, description="System prompt to use for the LLM"
    )
    # Formatted with `node_context` and `query_str`. str.format() ignores
    # keyword arguments the template does not reference, so a template without
    # a {node_context} placeholder silently drops the retrieved context.
    context_prompt: str = Field(
        default=MY_CUSTOM_PROMPT_TMPL,
        description="Context prompt to use for the LLM",
    )

    def _validate_component_inputs(
        self, input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate component inputs during run_component."""
        # NOTE: this is OPTIONAL but we show you where to do validation as an example
        return input

    @property
    def _input_keys(self) -> set:
        """Input keys dict."""
        # NOTE: These are required inputs. If you have optional inputs please override
        # `optional_input_keys_dict`
        return {"chat_history", "nodes", "query_str"}

    @property
    def _output_keys(self) -> set:
        """Names of the values this component produces."""
        return {"response"}

    def _prepare_context(
        self,
        chat_history: List[ChatMessage],
        nodes: List[NodeWithScore],
        query_str: str,
    ) -> List[ChatMessage]:
        """Build the message list that is sent to the LLM.

        The result is: optional system message, then the given chat history,
        then one user message made from ``context_prompt`` filled with the
        node texts and the query. The ``chat_history`` argument is not
        modified; a new list is returned.
        """
        node_context = ""
        for idx, node in enumerate(nodes):
            node_text = node.get_content(metadata_mode="llm")
            node_context += f"Context Chunk {idx}:\n{node_text}\n\n"

        formatted_context = self.context_prompt.format(
            node_context=node_context, query_str=query_str
        )
        user_message = ChatMessage(role="user", content=formatted_context)

        # Assemble a fresh list instead of appending to the caller's list, so
        # the (long) context prompt does not leak into the caller's history.
        messages: List[ChatMessage] = []
        if self.system_prompt is not None:
            messages.append(
                ChatMessage(role="system", content=self.system_prompt)
            )
        messages.extend(chat_history)
        messages.append(user_message)

        return messages

    def _run_component(self, **kwargs) -> Dict[str, Any]:
        """Run the component."""
        prepared_context = self._prepare_context(
            kwargs["chat_history"], kwargs["nodes"], kwargs["query_str"]
        )

        # Use the LLM configured on this component, not a module-level global.
        response = self.llm.chat(prepared_context)

        return {"response": response}

    async def _arun_component(self, **kwargs: Any) -> Dict[str, Any]:
        """Run the component asynchronously."""
        # NOTE: Optional, but async LLM calls are easy to implement
        prepared_context = self._prepare_context(
            kwargs["chat_history"], kwargs["nodes"], kwargs["query_str"]
        )

        # Use the LLM configured on this component, not a module-level global.
        response = await self.llm.achat(prepared_context)

        return {"response": response}


# Instantiate the response component with the GPT-4 `llm` created earlier and
# a fixed system prompt; `context_prompt` keeps its class default.
response_component = ResponseWithChatHistory(
    llm=llm,
    system_prompt=(
        "You are a Q&A system. You will be provided with the previous chat history, "
        "as well as possibly relevant context, to assist in answering a user message."
    ),
)

In [None]:
# Register every module under the name used by the links below.
# Note that "rewrite_retriever" and "query_retriever" are the same `retriever`
# object registered under two names, so it can be run once per query variant.
pipeline = QueryPipeline(
    modules={
        "input": input_component,
        "rewrite_template": rewrite_template,
        "llm": llm,
        "rewrite_retriever": retriever,
        "query_retriever": retriever,
        "join": argpack_component,
        "reranker": reranker,
        "response_component": response_component,
    },
    verbose=False,
)

# run both retrievers -- once with the hallucinated query, once with the real query
# (input -> rewrite_template -> llm -> rewrite_retriever, and input -> query_retriever)
pipeline.add_link(
    "input", "rewrite_template", src_key="query_str", dest_key="query_str"
)
pipeline.add_link(
    "input",
    "rewrite_template",
    src_key="chat_history_str",
    dest_key="chat_history_str",
)
pipeline.add_link("rewrite_template", "llm")
pipeline.add_link("llm", "rewrite_retriever")
pipeline.add_link("input", "query_retriever", src_key="query_str")

# each input to the argpack component needs a dest key -- it can be anything
# then, the argpack component will pack all the inputs into a single list
pipeline.add_link("rewrite_retriever", "join", dest_key="rewrite_nodes")
pipeline.add_link("query_retriever", "join", dest_key="query_nodes")

# reranker needs the packed nodes and the query string
pipeline.add_link("join", "reranker", dest_key="nodes")
pipeline.add_link(
    "input", "reranker", src_key="query_str", dest_key="query_str"
)

# synthesizer needs the reranked nodes and query str
pipeline.add_link("reranker", "response_component", dest_key="nodes")
pipeline.add_link(
    "input", "response_component", src_key="query_str", dest_key="query_str"
)
# ...and the raw chat history (list of messages, not the string form)
pipeline.add_link(
    "input",
    "response_component",
    src_key="chat_history",
    dest_key="chat_history",
)

In [None]:
from llama_index.core.memory import ChatMemoryBuffer

# Chat memory for the pipeline, created with a token limit of 8000.
pipeline_memory = ChatMemoryBuffer.from_defaults(token_limit=8000)

In [None]:
# Messages to send through the pipeline, in order.
user_inputs = [
    # "Give me upto 20 possible feature sets that would benefit my company. some example feature sets for your reference are : Customer Satisfaction, Comparative Analysis, Pricing, Ease of Setup, Support Quality, Product Benefits"
    """Give me upto 20 possible feature sets that would benefit my company based on the meaning of the reviews in the vector dataset. The features should be based on the most common themes in the reviews.
Here are a few examples of the categories:
Customer Satisfaction
Comparative Analysis
Pricing
Ease of Setup
Support Quality

Evaluate and understand all reviews in the dataset and provide me with the most common themes in the reviews. Based on these themes, suggest up to 20 feature sets that would benefit my company.
I want to identify the categories for a review, and a category would be potentially what a user would search for to get the product based on the content of the review.
"""
]

for user_text in user_inputs:
    # Current conversation, both as message objects and as plain text.
    history = pipeline_memory.get()
    history_text = "\n".join(str(message) for message in history)

    # One full pipeline pass for this message.
    response = pipeline.run(
        query_str=user_text,
        chat_history=history,
        chat_history_str=history_text,
    )

    # Record the user turn, then the assistant turn, echoing each one.
    user_msg = ChatMessage(role="user", content=user_text)
    pipeline_memory.put(user_msg)
    print(str(user_msg))

    assistant_msg = response.message
    pipeline_memory.put(assistant_msg)
    print(str(assistant_msg))
    print()

user: Give me upto 20 possible feature sets that would benefit my company. some example feature sets for your reference are : Customer Satisfaction, Comparative Analysis, Pricing, Ease of Setup, Support Quality, Product Benefits
assistant: 1. Application Performance: The speed and responsiveness of your software can greatly impact user satisfaction. Ensuring your software performs well under various conditions is crucial.

2. User Experience: A simple, intuitive interface can significantly improve the user experience, making it easier for customers to navigate and use your software.

3. Missing Functionality: Your software should be able to perform all the tasks that your customers need. Any missing functionality can lead to customer dissatisfaction.

4. Bug-Free Software: Bugs and glitches can disrupt the user experience and cause frustration. Regular testing and updates can help keep your software bug-free.

5. Customer Satisfaction: This involves understanding and meeting customer e

In [None]:
# Candidate categories used to label each review (20 entries).
feature_sets = [
    'Application Performance',
    'User Experience',
    'Missing Functionality',
    'Bug Fixes',
    'Customer Satisfaction',
    'Comparative Analysis',
    'Pricing',
    'Ease of Setup',
    'Support Quality',
    'Product Benefits',
    'Security Features',
    'Customization Options',
    'Integration Capabilities',
    'Scalability',
    'Mobile Accessibility',
    'Multi-Language Support',
    'Data Analysis Tools',
    'Collaboration Features',
    'Training and Documentation',
    'Automation Capabilities',
]
feature_sets[0]

'Application Performance'

In [None]:
reviews_df.head()

Unnamed: 0,title,rating,liked,disliked
0,G2 helps market and customer validity,4.5,G2 has helped our customers publicly validate ...,Not a lot to dislike. Be great as G2 becomes ...
1,Recommended for gaining visibility in B2B Mark...,4.5,It showcases a wide range of vendors with unbi...,There aren't enough people open to reviewing p...
2,Credibility,4.5,"Great way to build our brand in a new space, b...","So far so good, we are just starting to use ad..."
3,Advocately have been instrumental to our revie...,5.0,The end to end service and process has been th...,There is little that I dislike. Understanding ...
4,Game Changer,5.0,Advocately facilitates a continuous flow of gr...,"This is not a true dislike, but the success of..."


In [None]:
# Downgrade to the legacy openai 0.28 SDK (the classification cell below calls
# `openai.ChatCompletion.create`).
# NOTE(review): pip reports that llama-index-core requires openai>=1.1.0, so
# after this install the LlamaIndex cells above presumably no longer work in
# the same environment - confirm before running the notebook top to bottom.
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.17.0
    Uninstalling openai-1.17.0:
      Successfully uninstalled openai-1.17.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-core 0.10.28 requires openai>=1.1.0, but you have openai 0.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.0


In [None]:
import openai
openai.__version__

'0.28.0'

In [None]:
# Remove the helper "text" column left behind by an earlier run of the
# classification cell. errors="ignore" keeps this cell runnable on a fresh
# kernel, where the column has not been created yet (a plain drop raises
# KeyError in that case).
reviews_df.drop(columns=["text"], errors="ignore", inplace=True)
reviews_df.head()

In [None]:
# First 50 reviews as a smaller working sample; show its last rows.
sample = reviews_df[:50]
sample.tail()

Unnamed: 0,title,rating,liked,disliked
45,"Incredibly simple, but powerful tool to acquir...",5.0,"Based on SatisMeter NPS, Advocately absolutely...",I have nothing to say here. We are wishing goo...
46,Great vendor!,4.0,G2 is a great sales and marketing resource for...,"At least for the program we are running, G2 re..."
47,Great platform!,5.0,I enjoy the ease of logging in and seeing how ...,I wish to gain more visibility or training on ...
48,Love Working with the G2 Crowd Team,5.0,I love being able to reach out to Walter with ...,Because G2 Crowd offers so many reports I wish...
49,"Some great behind the scenes tools, as G2 Crow...",3.5,Kara DeWalt has been a friendly and responsive...,We're trying out their sponsored content marke...


In [None]:
# Reset `reviews_df` to its parsed columns by removing the columns added by an
# earlier run of the classification cell. errors="ignore" keeps this cell
# runnable on a fresh kernel, where those columns do not exist yet (a plain
# drop raises KeyError in that case).
reviews_df.drop(
    columns=["predicted_categories", "text"], errors="ignore", inplace=True
)
reviews_df.head()

Unnamed: 0,title,rating,liked,disliked
0,G2 helps market and customer validity,4.5,G2 has helped our customers publicly validate ...,Not a lot to dislike. Be great as G2 becomes ...
1,Recommended for gaining visibility in B2B Mark...,4.5,It showcases a wide range of vendors with unbi...,There aren't enough people open to reviewing p...
2,Credibility,4.5,"Great way to build our brand in a new space, b...","So far so good, we are just starting to use ad..."
3,Advocately have been instrumental to our revie...,5.0,The end to end service and process has been th...,There is little that I dislike. Understanding ...
4,Game Changer,5.0,Advocately facilitates a continuous flow of gr...,"This is not a true dislike, but the success of..."


In [None]:
import re

import openai
import pandas as pd
from tqdm import tqdm

# OpenAI API credentials are not set here.
# NOTE(review): presumably picked up from the OPENAI_API_KEY environment
# variable - confirm.

# `feature_sets` (the list of candidate categories) is defined in an earlier cell.

# Work on a copy so the columns added below do not leak into `reviews_df`.
data = reviews_df.copy()

# One text per review: title, liked and disliked joined with spaces.
data["text"] = data["title"] + " " + data["liked"] + " " + data["disliked"]

prompt_template = """
The following text is a product review:
"{text}"

Based on the review, classify it into only one category or a maximum of two categories which it is strongly related to based on its semantic meaning, don't just assign a loosely related category to a review.:
{category_list}

The categories for this review are:
"""

# Pick the chat-completion entry point that matches the installed SDK:
# openai>=1 exposes a client class, openai<1 the module-level ChatCompletion.
if hasattr(openai, "OpenAI"):
    _create_chat_completion = openai.OpenAI().chat.completions.create
else:
    _create_chat_completion = openai.ChatCompletion.create


def _parse_categories(raw):
    """Split a model reply into clean category names.

    Accepts comma-separated replies as well as one-per-line replies, and
    strips list markers such as "- ", "* ", "1. " or "2) " plus a trailing
    period. Empty fragments are dropped.
    """
    categories = []
    for part in re.split(r"[,\n]", raw or ""):
        name = re.sub(r"^\s*(?:[-*\u2022]|\d+[.)])\s*", "", part).strip().rstrip(".")
        if name:
            categories.append(name)
    return categories


# The category list is the same for every review, so build it once.
category_list = "\n".join(f"- {category}" for category in feature_sets)

predicted_categories = []

# Iterate over the texts only: no loop variable named `index`, which would
# overwrite the VectorStoreIndex stored under that name earlier.
for text in tqdm(data["text"], total=len(data)):
    prompt = prompt_template.format(text=text, category_list=category_list)

    response = _create_chat_completion(
        model="gpt-4",
        messages=[
            # Kept consistent with the user prompt (one, or at most two, categories).
            {"role": "system", "content": "You are an AI assistant tasked with classifying product reviews into one or at most two categories."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=256,
        n=1,
        stop=None,
        # NOTE(review): 0.9 makes the labels vary between runs - confirm intended.
        temperature=0.9,
    )

    predicted_categories.append(
        _parse_categories(response.choices[0].message.content)
    )

data["predicted_categories"] = predicted_categories

data.to_csv("data_with_categories.csv", index=False)

100%|██████████| 762/762 [09:39<00:00,  1.32it/s]


In [None]:
data.tail()

Unnamed: 0,title,rating,liked,disliked,text,predicted_categories
757,Actionable Intent Data,4.5,I like how easy it has been to implement G2 Ma...,The only thing I'd like to update is the creat...,Actionable Intent Data I like how easy it has ...,[- User Experience\n- Integration Capabilities]
758,Best Analysis model to build trust,4.5,The most helpful about G2 Marketing Solution i...,When you are positioned next to American giant...,Best Analysis model to build trust The most he...,"[User Experience, Comparative Analysis]"
759,G2 is an essential tool for Procurement teams,5.0,G2 Marketing Solutions is an essential tool fo...,Nothing comes to mind. I really appreciate th...,G2 is an essential tool for Procurement teams ...,"[Comparative Analysis, Product Benefits]"
760,This company gets a 10 our of 10 for customer ...,5.0,"Everyone I speak to, thinking to Blake, Stepha...","I wouldn't mention this, except being asked to...",This company gets a 10 our of 10 for customer ...,[- Customer Satisfaction\n- Support Quality]
761,We use g2 to drive traffic to our website,4.0,We could convert a few clients from g2 listing,"So many features are paid, and also the softwa...",We use g2 to drive traffic to our website We c...,[1. Comparative Analysis\n2. Pricing]


In [None]:
from IPython.display import Markdown
Markdown(data['text'][47])

Great platform! I enjoy the ease of logging in and seeing how the pages I own are performing and doing. Navigating is straightforward. I wish to gain more visibility or training on how to utilize the features/tools G2 has put in place.

In [None]:
data["predicted_categories"][47]

['User Experience', 'Training and Documentation']

In [None]:
print(data["text"][761])
print(data["predicted_categories"][761])

We use g2 to drive traffic to our website We could convert a few clients from g2 listing So many features are paid, and also the software which is paying more will be listed higher in the list regardless of the reviews count.
['1. Comparative Analysis\n2. Pricing']
