In [9]:
# Reset the environment
%reset -f

# Import system libraries
import sys
import os

# Go one level up and then navigate to the correct directory
project_root = os.path.abspath('../chinook-database-master')  # Going one level up and into the root
src_dir = os.path.join(project_root, 'src/modules')  # Path to the src/modules folder

# Change directory to the project root if necessary
if os.getcwd() != project_root:
    os.chdir(project_root)

# Ensure 'src/modules' is added to the system path
sys.path.append(src_dir)

# Import Pipeline after ensuring correct path
from pipeline import Pipeline  # Since 'pipeline.py' is inside 'modules'

# Initialize OpenAI API
import openai
openai.api_key = os.getenv('OPENAI_API_KEY')  # Use environment variable for API key

# Set the path to your SQLite database and the PDF documentation
db_file = os.path.join(project_root, 'ChinookDatabase/DataSources/Chinook_Sqlite.sqlite')
pdf_path = os.path.join(project_root, 'docs/Chinook_Database_Enriched_Documentation.pdf')
user_query = "List who have purchased tracks from more than 7 different genres."

# Initialize the pipeline
pipeline = Pipeline(db_file, user_query, pdf_path)

# Run the pipeline
analysis = pipeline.run()


ModuleNotFoundError: No module named 'src'

In [4]:
# Run the full pipeline
user_query = " List all employees who have sold tracks to customers in the same city."

pipeline = Pipeline(db_file, user_query, pdf_path)

# Run the pipeline
analysis = pipeline.run()

"""

SELECT Employee.FirstName, Employee.LastName, Customer.FirstName AS CustomerFirstName, Customer.LastName AS CustomerLastName, Customer.City
FROM Employee
JOIN Customer ON Employee.City = Customer.City
JOIN Invoice ON Customer.CustomerId = Invoice.CustomerId
JOIN InvoiceLine ON Invoice.InvoiceId = InvoiceLine.InvoiceId
WHERE Employee.EmployeeId = InvoiceLine.InvoiceId;



"""

Retrieved relevant documents based on similarity threshold of 0.75:
Extracting schema and foreign key relationships...
Extracting keywords using RoBERTa...
Extracted Keywords:
0      customers in
1     all employees
2         same city
3         tracks to
4                to
5         customers
6          the same
7              same
8      to customers
9               all
10             List
11             city
12             have
13         who have
14         List all
15        have sold
16    employees who
17      sold tracks
18           in the
19              who
20           tracks
21               in
22             sold
23        employees
24              the
dtype: object
Mapping keywords to schema...
Relevant Tables: {'Customer', 'Track', 'Employee'}
Added necessary table: Invoice

Top 10 Similarity Scores between Keywords and Columns:
          Keyword      Column  Similarity
6            city        City    1.000000
7            city        City    1.000000
2       same cit

Unnamed: 0,EmployeeId,LastName,FirstName,Title,ReportsTo,BirthDate,HireDate,Address,City,State,Country,PostalCode,Phone,Fax,Email


Analyzing result with LLM...

Analysis:
The results table is empty, so there are no employees who have sold tracks to customers in the same city.


'\n\nSELECT Employee.FirstName, Employee.LastName, Customer.FirstName AS CustomerFirstName, Customer.LastName AS CustomerLastName, Customer.City\nFROM Employee\nJOIN Customer ON Employee.City = Customer.City\nJOIN Invoice ON Customer.CustomerId = Invoice.CustomerId\nJOIN InvoiceLine ON Invoice.InvoiceId = InvoiceLine.InvoiceId\nWHERE Employee.EmployeeId = InvoiceLine.InvoiceId;\n\n\n\n'

In [5]:
# Define the new user query
user_query = "Show the top 5 most purchased tracks along with the artist name."

# Initialize the pipeline
pipeline = Pipeline(db_file, user_query, pdf_path)

# Run the pipeline
analysis = pipeline.run()

# Print the analysis
print(analysis)


"""
    
SELECT Track.Name AS TrackName, Artist.Name AS ArtistName, COUNT(InvoiceLine.TrackId) AS PurchaseCount
FROM Track
JOIN Album ON Track.AlbumId = Album.AlbumId
JOIN Artist ON Album.ArtistId = Artist.ArtistId
JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId
GROUP BY Track.TrackId, Artist.Name
ORDER BY PurchaseCount DESC
LIMIT 5;

    
"""

Retrieved relevant documents based on similarity threshold of 0.75:
Extracting schema and foreign key relationships...
Extracting keywords using RoBERTa...
Extracted Keywords:
0           the artist
1             Show the
2          artist name
3                 with
4                  top
5                 Show
6       most purchased
7     purchased tracks
8         tracks along
9             with the
10          along with
11             the top
12                most
13                name
14              tracks
15               along
16              artist
17                 the
18           purchased
dtype: object
Mapping keywords to schema...
Relevant Tables: {'Artist', 'Track'}
Added necessary table: Customer
Added necessary table: Invoice

Top 10 Similarity Scores between Keywords and Columns:
       Keyword     Column  Similarity
4         name       Name    1.000000
5         name       Name    1.000000
0   the artist   ArtistId    0.761226
8       artist   ArtistId    0.7580

Unnamed: 0,Artist Name,Track Name,Number of Purchases
0,Emerson String Quartet,"String Quartet No. 12 in C Minor, D. 703 ""Quar...",2
1,"Equale Brass Ensemble, John Eliot Gardiner & M...","Music for the Funeral of Queen Mary: VI. ""Thou...",2
2,"Academy of St. Martin in the Fields, Sir Nevil...","Suite No. 3 in D, BWV 1068: III. Gavotte I & II",2
3,Amy Winehouse,Rehab,2
4,Michael Tilson Thomas & San Francisco Symphony,"Symphonie Fantastique, Op. 14: V. Songe d'une ...",2


Analyzing result with LLM...

Analysis:
The top 5 most purchased tracks along with the artist name are:

1. "String Quartet No. 12 in C Minor, D. 703 "Quartettsatz"" by Emerson String Quartet
2. "Music for the Funeral of Queen Mary: VI. "Thou Knowest, Lord, the Secrets of Our Hearts"" by Equale Brass Ensemble, John Eliot Gardiner & Monteverdi Orchestra
3. "Suite No. 3 in D, BWV 1068: III. Gavotte I & II" by Academy of St. Martin in the Fields, Sir Neville Marriner
4. "Rehab" by Amy Winehouse
5. "Symphonie Fantastique
The top 5 most purchased tracks along with the artist name are:

1. "String Quartet No. 12 in C Minor, D. 703 "Quartettsatz"" by Emerson String Quartet
2. "Music for the Funeral of Queen Mary: VI. "Thou Knowest, Lord, the Secrets of Our Hearts"" by Equale Brass Ensemble, John Eliot Gardiner & Monteverdi Orchestra
3. "Suite No. 3 in D, BWV 1068: III. Gavotte I & II" by Academy of St. Martin in the Fields, Sir Neville Marriner
4. "Rehab" by Amy Winehouse
5. "Symphonie Fantas

'\n    \nSELECT Track.Name AS TrackName, Artist.Name AS ArtistName, COUNT(InvoiceLine.TrackId) AS PurchaseCount\nFROM Track\nJOIN Album ON Track.AlbumId = Album.AlbumId\nJOIN Artist ON Album.ArtistId = Artist.ArtistId\nJOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId\nGROUP BY Track.TrackId, Artist.Name\nORDER BY PurchaseCount DESC\nLIMIT 5;\n\n    \n'