In [1]:
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 w

In [40]:
# Housekeeping
!rm tripadvisor.db

# Getting the dataset

In [3]:
from datasets import load_dataset

ds = load_dataset("argilla/tripadvisor-hotel-reviews")

Downloading readme:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20491 [00:00<?, ? examples/s]

In [4]:
sample_data = ds["train"]["text"][5]

In [5]:
sample_data

"best hotel stayed picked hotel casablanca based recommendations tripadvisor not disappointed, location good staff extremely friendly helpful rooms reasonable size clean equipped peaceful spite bustling location breakfast delicious continental better normally come expect term, best feeling tranquility busy day town return oasis calm enjoy delicious hazelnut coffee buns head room freshen coming later cheese prosecco sound piano music background.ok n't exactly cheap, new york, stayed expensive luxury hotels felt mightily ripped, not time- definitely casablanca,  "

# Building Prompt

In [6]:
import requests

API_URL = "http://nexusraven.nexusflow.ai"

headers = {
    "Content-Type": "application/json"
}

def query(payload):
    response = requests.post(API_URL, json=payload, headers=headers)
    return response.json()

def query_raven(prompt):
    payload = {
        "inputs": prompt,
        "parameters": {"do_sample": True, "temperature": 0.001, "max_new_tokens": 400, "stop": ["<bot_end>", "Thought:"], "return_full_text": False}
    }
    return query(payload)[0]["generated_text"].replace("Call:", "").replace("Thought:", "").strip()

# Defining what's important

In [7]:
from typing import List
from dataclasses import dataclass

In [8]:
schema_id = {"hotel_name", "location", "likes", "dislikes", "sentiment"}

In [9]:
dataclass_schema_representation = '''
@dataclass
class Record:
    hotel_name: str # Name of the hotel if provided, else ''
    location: str # Location of the hotel if provided, else ''
    likes: str # What the customers liked about the hotel if provided, else ''
    dislikes: str # What the customers disliked about the hotel if provided, else ''
    sentiment: str # Overall customer sentiment, either 'frustrated' or 'happy'
'''

In [10]:
exec(dataclass_schema_representation)

# Building the database

In [27]:
def initialize_db():
    import sqlite3

    # Connect to SQLite db
    conn = sqlite3.connect("tripadvisor.db")
    c = conn.cursor()

    # Table name
    table_name = "reviews"

    # Schema
    columns = '''
    id INTEGER PRIMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
    '''

    # Check if the table already exists
    c.execute(f"SELECT * FROM sqlite_master WHERE type='table' AND name='{table_name}'")
    if c.fetchone() is None:
        # Create table
        c.execute(f"CREATE TABLE {table_name} ({columns})")

    # Commit the transaction and close the connection
    conn.commit()
    conn.close()

In [41]:
initialize_db()

# Create a tool to add data in the database

In [42]:
from dataclasses import dataclass, fields

def update_db(results: List[Record]):
    """
    Registers the information
    """
    import sqlite3
    from sqlite3 import ProgrammingError

    # Connect to SQLite db
    conn = sqlite3.connect("tripadvisor.db")
    c = conn.cursor()

    # Table name
    table_name = "reviews"

    # Insert records
    column_names = "hotel_name, location, likes, dislikes, sentiment"
    placeholders = ", ".join(["?"] * 5)

    sql = f"INSERT INTO {table_name} ({column_names}) VALUES ({placeholders})"

    for record in results:
        try:
            values = tuple(getattr(record, field.name) for field in fields(record))
            print(values)
            c.execute(sql, values)
        except ProgrammingError:
            print("Error with record.")
            pass

    # Commit the transaction and close the connection
    conn.commit()
    conn.close()

In [43]:
sample_record = Record(hotel_name="hotel casablanca", location="new york", likes="staff extremely friendly helpful rooms reasonable size clean", dislikes="", sentiment="happy")

In [44]:
update_db([sample_record])

('hotel casablanca', 'new york', 'staff extremely friendly helpful rooms reasonable size clean', '', 'happy')


# Create a tool to extract information from the database

In [45]:
import sqlite3

def execute_sql(sql: str):
    """
    Runs the SQL code for the given schema. Make sure to properly leverage the schema to answer the user's question in the best way possible.
    """
    table_name = "reviews"

    # Connect to SQLite db
    conn = sqlite3.connect("tripadvisor.db")
    c = conn.cursor()

    # Execute the SQL query
    results = c.execute(sql).fetchall()

    # Commit the transaction and close the connection
    conn.commit()
    conn.close()

    return results

In [46]:
sql = '''SELECT * FROM reviews WHERE sentiment = "happy"'''

print("Executing SQL: ", sql)

execute_sql(sql)

Executing SQL:  SELECT * FROM reviews WHERE sentiment = "happy"


[(1,
  'hotel casablanca',
  'new york',
  'staff extremely friendly helpful rooms reasonable size clean',
  '',
  'happy')]

# Building the pipeline

In [47]:
data = ds["train"]["text"][3]
data

'great deal waikiki trip hawaii outrigger luana great hotel/condo, booked city view kitchenette, kitchen amenities ask, nice pool cabanas sun gets, barbeque area feel like getting fresh fish market highly recommended, staff friendly accommodating allowing access garage shower room 8 hours checked, 18/day parking little steep want explore life waikiki great hotel beginning main strip, tour desk not open probably went season desk touch tour desks outriggers waikiki happy make arrangements, beach minute walk away pay upwards 100/night right beach,  '

In [48]:
import inspect

prompt = "\n" + data

signature = inspect.signature(update_db)
signature = str(signature).replace("__main__.Record", "Record")
docstring = update_db.__doc__

raven_prompt = f'''{dataclass_schema_representation}\nFunction:\n{update_db.__name__}{signature}:\n   """{docstring}"""\n\n\nUser Query:{prompt}<human_end>'''
print(raven_prompt)


@dataclass
class Record:
    hotel_name: str # Name of the hotel if provided, else ''
    location: str # Location of the hotel if provided, else ''
    likes: str # What the customers liked about the hotel if provided, else ''
    dislikes: str # What the customers disliked about the hotel if provided, else ''
    sentiment: str # Overall customer sentiment, either 'frustrated' or 'happy'

Function:
update_db(results: List[Record]):
   """
    Registers the information
    """


User Query:
great deal waikiki trip hawaii outrigger luana great hotel/condo, booked city view kitchenette, kitchen amenities ask, nice pool cabanas sun gets, barbeque area feel like getting fresh fish market highly recommended, staff friendly accommodating allowing access garage shower room 8 hours checked, 18/day parking little steep want explore life waikiki great hotel beginning main strip, tour desk not open probably went season desk touch tour desks outriggers waikiki happy make arrangements, beach minu

In [49]:
raven_call = query_raven(raven_prompt)
print(raven_call)

update_db(results=[])


In [50]:
exec(raven_call)

In [51]:
prompt = "how many reviews are there where customers are happy"

signature = inspect.signature(execute_sql)

docstring = execute_sql.__doc__

sql_schema_representation = \
"""
CREATE TABLE reviews (
    id INTEGER PROMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
)
"""

raven_prompt = f'''{sql_schema_representation}\nFunction:\n{execute_sql.__name__}{signature}:\n   """{docstring}"""\n\n\nUser Query:{prompt}<human_end>'''

print(raven_prompt)


CREATE TABLE reviews (
    id INTEGER PROMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
)

Function:
execute_sql(sql: str):
   """
    Runs the SQL code for the given schema. Make sure to properly leverage the schema to answer the user's question in the best way possible.
    """


User Query:how many reviews are there where customers are happy<human_end>


In [52]:
raven_call = query_raven(raven_prompt)
print(raven_call)

execute_sql(sql='SELECT COUNT(*) FROM reviews WHERE sentiment = "happy"')


In [53]:
eval(raven_call)

[(1,)]

# Let's re-initialize and run the workflow

In [54]:
!rm tripadvisor.db
initialize_db()

In [59]:
from tqdm import tqdm

# Taking data points
data = ds["train"]["text"][10:20]

for i in tqdm(range(0, 10)):

    # Ask raven to extract information we want out of the review
    raven_prompt = "\n" + data[i]
    signature = inspect.signature(update_db)
    docstring = update_db.__doc__
    raven_prompt = f'''{dataclass_schema_representation}\nFunction:\n{update_db.__name__}{signature}:\n   """{docstring}"""\n\n\nUser Query:{raven_prompt}<human_end>'''

    raven_call = query_raven(raven_prompt)
    exec(raven_call)

 10%|█         | 1/10 [00:01<00:10,  1.12s/it]

('excellent', 'beginning august', 'good value', 'expensive bar', 'happy')


 20%|██        | 2/10 [00:01<00:07,  1.03it/s]

('', '', '', '', '')


 30%|███       | 3/10 [00:02<00:06,  1.08it/s]

('', '', '', '', '')


 70%|███████   | 7/10 [00:04<00:01,  1.57it/s]

('', '', '', '', '')


 80%|████████  | 8/10 [00:06<00:01,  1.13it/s]

('Florence', 'Rome', 'modern room, comfortable bathroom, good breakfast', 'late check-in, overcrowded lobby', 'happy')


 90%|█████████ | 9/10 [00:07<00:00,  1.01it/s]

('cambridge suites', 'sept 29-oct 3', 'wonderful', 'none', 'happy')


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]


In [60]:
signature = inspect.signature(execute_sql)
docstring = execute_sql.__doc__
sql_schema_representation = \
"""
CREATE TABLE reviews (
    id INTEGER PROMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
)
"""

raven_prompt = f'''{sql_schema_representation}\nFunction:\n{execute_sql.__name__}{signature}:\n   """{docstring}"""\n\n\nUser Query:{prompt}<human_end>'''
raven_prompt = raven_prompt + "User Query: How many are happy customers?<human_end>"
print(raven_prompt)

raven_call = query_raven(raven_prompt)
print(raven_call)

eval(raven_call)


CREATE TABLE reviews (
    id INTEGER PROMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
)

Function:
execute_sql(sql: str):
   """
    Runs the SQL code for the given schema. Make sure to properly leverage the schema to answer the user's question in the best way possible.
    """


User Query:how many reviews are there where customers are happy<human_end>User Query: How many are happy customers?<human_end>
execute_sql(sql='SELECT COUNT(*) FROM reviews WHERE sentiment = "happy"')


[(6,)]

In [61]:
raven_prompt = f'''{sql_schema_representation}\nFunction:\n{execute_sql.__name__}{signature}:\n   """{docstring}"""\n\n\nUser Query:{prompt}<human_end>'''
raven_prompt = raven_prompt + "User Query: Give me a list of hotel names where customers are happy?<human_end>"
print(raven_prompt)

raven_call = query_raven(raven_prompt)
print(raven_call)

eval(raven_call)


CREATE TABLE reviews (
    id INTEGER PROMARY KEY,
    hotel_name TEXT,
    location TEXT,
    likes TEXT,
    dislikes TEXT,
    sentiment TEXT
)

Function:
execute_sql(sql: str):
   """
    Runs the SQL code for the given schema. Make sure to properly leverage the schema to answer the user's question in the best way possible.
    """


User Query:how many reviews are there where customers are happy<human_end>User Query: Give me a list of hotel names where customers are happy?<human_end>
execute_sql(sql='SELECT hotel_name, sentiment FROM reviews WHERE sentiment = "happy"')


[('excellent', 'happy'),
 ('Florence', 'happy'),
 ('cambridge suites', 'happy'),
 ('excellent', 'happy'),
 ('Florence', 'happy'),
 ('cambridge suites', 'happy')]