In [11]:
#........Importing Required Libraries........#
from io import BytesIO
import json
import pandas as pd
import fitz
import streamlit as st
from streamlit_chat import message
from typing_extensions import TypedDict
import openai
from openai import OpenAI
import numpy as np
from IPython.display import Image, display

from langgraph.graph import END, START, StateGraph
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles

In [12]:
# Using Openai paid key
openai.api_key = "YOUR_API_KEY" 
gpt_model = 'MODEL_NAME'  
embedding_model = "EMBEDDING_MOEL"
client = OpenAI(api_key = "YOUR_API_KEY")

In [13]:
input_prompt = """Identify the specifications from the provided document context and extract the values for the identified specifications from the context. 
                      Provide the extracted specifications from the provided context in a structured JSON format. Each specification should have an value entry. 
                      The values should also include the associated units if available. If a specification is missing, include 'N/A' for that entry.
                      The first two entires of the specification should be 'company' and 'product/Model Number'. Extract as many specifications as possible from the provided contexts.
                      If the fetched values has any special characters like backslash etc., convert the values such that it will not create any issues while parsing the JSON. 
                      If the extracted value for any specification is in nested format, convert them into a list of key value pair and stick to the required format of JSON. Don't output in nested json format. Don't use same keys again and again, but instead merge similar key information into a list of strings.
                      Format your response as: 
                      {
                        "company": ["Value from context1"],
                        "product/Model Number": ["Value from context1"],
                        "Specification 3": ["Value from context1"], 
                        "Specification 4": ["Value from context1"], 
                        ... 
                      }
                      
                      """ 


In [14]:
# Function to extract text from PDF file
def extract_text_from_pdf(file) -> str:
    pdf_reader = fitz.open(file)
    full_text = ""
    page_texts = []
    # for page_num in range(pdf_reader.getNumPages()):
    for page_num in range(pdf_reader.page_count):
        page = pdf_reader[page_num]
        text = page.get_text("text") 
        blocks = page.get_text("blocks")

        processed_blocks = []
        for b in blocks:
            block_text = b[4].strip()
            if block_text:
                if ":" in block_text:
                    processed_blocks.append(block_text)
                else:
                    processed_blocks.append(block_text.replace("\n", " "))
        processed_text = "\n".join(processed_blocks)
        full_text += processed_text + "\n\n"
        page_texts.append(processed_text)
        # st.session_state.pdf_text = page_texts
    return page_texts


#......Function to extract specs from the input datasheet text......#
def extract_specs(pdf1_text: str, input_prompt: str) -> str:
    text1 = pdf1_text
    # text2 = pdf2_text
    # .....Generating the responses using the two input contexts......#
    response = client.chat.completions.create(
        model=gpt_model, 
        messages = [ {"role": "assistant", "content": input_prompt
                      },
                      {"role": "system", "content": f"context1: {text1}"},
                    #   {"role": "user", "content": query_text}
                      ], #prompt=prompt_template,
        max_tokens=1200,
        temperature = 0.1,
    )
    specifications = response.choices[0].message.content
    return specifications

#.....Function to rewrite the input prompt......#
def rewrite_query(input_prompt: str) -> str:
    response = client.chat.completions.create(
        model=gpt_model, 
        messages = [ {"role": "assistant", "content": """You are a question re-writer that converts an input question to a better version that is optimized to fetch accurate answer from the input document. 
                      Look at the input and try to reason about the underlying semantic intent / meaning. Provide the response in a string format enclosed in thrible quotes.
                      
                      input question: {input_prompt}
                      """
                      },
                    #   {"role": "system", "content": f"context1: {text1}"},
                    #   {"role": "user", "content": query_text}
                      ], #prompt=prompt_template,
        max_tokens=600,
        temperature = 0.1,
    )
    rewrited_prompt = response.choices[0].message.content
    return rewrited_prompt

#..........Fetching additional information that seems to be missing from datasheet..........#
def fetch_additional_info(specifications_list: list) -> str:
    
    #.....Defining the specifications to be extracted from the context.....#
    company_name = specifications_list[0]
    product_model = specifications_list[1]
    missing_specs = specifications_list[2:]

    response = client.chat.completions.create(
        model=gpt_model, 
        messages = [ {"role": "assistant", "content": """You are an expert in identified specifications when provided with Company Name and Product Model Number. Given a list of specifications to be extracted, you are required to use the given company name and model number and fetch the values for all the provided specifications. Inputs are:
                      Company Name: {company_name}
                      Model Number: {product_model}
                      and the list of specifications to be extracted: {missing_specs}
                      
                      For all the specifications to be fetched provide the identified response in a structured JSON format. Each specification should have an value entry. 
                      The values should also include the associated units if available. If a specification is missing, include 'N/A' for that entry. For all the non-missing specs include the text '(from internet)' along with the values.
                      If the fetched values has any special characters, convert the values such that it will not create any issues while parsing the JSON.

                      Format your response as: 
                      {
                        "Specification 1": ["Value (from internet)"], ..
                        "Specification 2": ["Value (from internet)"],
                        ... 
                      }
                      """
                      },
                      # {"role": "system", "content": f"context1: {text1}"},
                      # {"role": "user", "content": query_text}
                      ], #prompt=prompt_template,
        max_tokens=1200,
        temperature = 0.1,
    )
    specifications = response.choices[0].message.content
    return specifications


In [15]:
#..........Functions to handle different user selections..........#
#.....Function to handle situation when the user selects correct.....#
def handle_correct_action(state):
    return {"filtered_specs": state["specifications"], "additional_info": None}
    # return state

#.....Function to handle situation when the user selects incomplete.....#
def handle_incomplete_action(specs):
    specifications = json.loads(specs)
    keywords_list = ["Display", "Battery", "Processor", "Memory", "Storage", "Operating System", "Price", "Weight", "Connectivity", "Guarantee", "Graphics", "Battery Life", "Material", "Refresh Rate"]
    # relevance_score = 0
    missed_keywords = ["company", "Model Number"]  # Initialize missed keywords with company and product/model number
    # threshold = 12  # Define a threshold for relevance
    
    # Get the list of keys from the extracted specifications
    spec_keys = [key for key, value in specifications.items()]
    
    # Find the missed keywords
    for keyword in keywords_list:
        if keyword not in spec_keys:
            missed_keywords.append(keyword) 

    # Find keys with value 'N/A'
    keys_with_na = [key for key, value in specifications.items() if value == ["N/A"]]
    # filtered_specs = {key: value for key, value in specifications.items() if value != ["N/A"]}

    all_missed_keys = missed_keywords + keys_with_na
    additional_info = fetch_additional_info(all_missed_keys)

    return additional_info
    # return state


#.....Function to handle situation when the user selects incorrect.....#
def handle_incorrect_action(pdf_text, input_prompt):
    
    # rewrited_prompt = rewrite_query(input_prompt)
    # specifications = extract_specs(specs, rewrited_prompt)
    specifications = extract_specs(pdf_text, input_prompt)
    # print("Specs before loading")
    # print(specifications)

    try:
        # Convert JSON string to a Python dictionary
        if isinstance(specifications, str):  # Ensure it's a string before parsing
            spec_type = type(specifications)
            specifications = json.loads(specifications)
    except:
        print("error in loading spec as json")

    # print("Specs after loading")
    # print(specifications)

    # specifications = json.loads(specifications)
    keywords_list = ["Display", "Battery", "Processor", "Memory", "Storage", "Operating System", "Price", "Weight", "Connectivity", "Guarantee", "Graphics", "Battery Life", "Material", "Refresh Rate"]
    # relevance_score = 0
    missed_keywords = ["company", "Model Number"]  # Initialize missed keywords with company and product/model number
    
    # Get the list of keys from the extracted specifications
    spec_keys = [key for key, value in specifications.items()]
    
    # Find the missed keywords
    for keyword in keywords_list:
        if keyword not in spec_keys:
            missed_keywords.append(keyword) 

    # Find keys with value 'N/A'
    keys_with_na = [key for key, value in specifications.items() if value == ["N/A"]]
    filtered_specs = {key: value for key, value in specifications.items() if value != ["N/A"]}

    all_missed_keys = missed_keywords + keys_with_na
    additional_info = fetch_additional_info(all_missed_keys)
    print("type of additional_info: ", type(additional_info))

    return filtered_specs, additional_info
    # return state


In [16]:
uploaded_file_1 = r"C:\Users\INBHV3\OneDrive - ABB\Bhanu Work Files\Implementations\Doc Comparison\doc-comparison-main\inputs\Acer-Nitro5-specs.pdf"
# uploaded_file_2 = r"C:\Users\INBHV3\OneDrive - ABB\Bhanu Work Files\Implementations\Doc Comparison\doc-comparison-main\inputs\alienware-17-specs.pdf"

class State(TypedDict):
    uploaded_file_1: bytes
    pdf1_text: str
    specifications: str

#..........Create the graph..........#
graph_builder = StateGraph(State)

#..........Define the nodes..........#
graph_builder.add_node("extract_text_doc1", lambda state: {"pdf1_text": extract_text_from_pdf(state["uploaded_file_1"])})
# graph_builder.add_node("extract_text_doc2", lambda state: {"pdf2_text": extract_text_from_pdf(state["uploaded_file_2"])})
graph_builder.add_node("extract_specs", lambda state: {"specifications": extract_specs(state["pdf1_text"], input_prompt)})

# Define the edges that connects the nodes
graph_builder.add_edge(START, "extract_text_doc1")
graph_builder.add_edge("extract_text_doc1","extract_specs")
graph_builder.add_edge("extract_specs", END)

# Execute the graph
graph = graph_builder.compile()

# Assuming pdf1_file and pdf2_file are the uploaded PDF files in bytes
input_state = {
    "uploaded_file_1": uploaded_file_1,
}

# Execute the graph
result = graph.invoke(input_state)
# Access the extracted specifications
specifications = result["specifications"]
pdf_text = result["pdf1_text"]

In [None]:
# filtered_specs, additional_info = handle_incorrect_action(pdf_text, input_prompt)
# print(filtered_specs)
# print(additional_info)
# print(spec_type)

In [37]:
# print(specifications)
# data3 = json.loads(additional_info)
# df3 = pd.DataFrame(data3)

In [17]:
import ast

if specifications:
    # print("Specifications extracted from the provided datasheet")
    # print(specifications)

    print("\n print type of pdf_text : ", type(pdf_text))

    #.....START OF FUNCTION.....#
    def feedback_loop(button_pressed, specifications, pdf_text):
        
        # Define edges for button actions
        if button_pressed == "Correct":
            filtered_specs = specifications
            additional_info = None
        elif button_pressed == "Incorrect":
            # pdf_text = extract_text_from_pdf(uploaded_file_1)
            filtered_specs, additional_info = handle_incorrect_action(pdf_text, input_prompt)
            print("Query rewrited and done a fresh search. Fetched additional information from the internet")
        elif button_pressed == "Incomplete":
            filtered_specs = specifications
            additional_info = handle_incomplete_action(filtered_specs)
            print("Additional information fetched from the internet")
        # else:
        #     st.write("You haven't selected a button")
            # graph_builder.add_edge("display_output", END)
        # Close the edges based on consditional selections of the above three edges
        
        # Parse the JSON response
        if filtered_specs:
            try:
                data2 = json.loads(additional_info)

                df1 = pd.DataFrame(filtered_specs)
                df2 = pd.DataFrame(data2)

                df1 = df1.transpose().reset_index()
                df2 = df2.transpose().reset_index()

                # Concatenate the DataFrames
                combined_df = pd.concat([df1, df2], ignore_index=True)

                combined_df.columns = ['Specification', 'Extracted Information']
                print("JSON data loaded successfully.")

                # Convert DataFrame to Excel
                output = BytesIO()
                with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                    combined_df.to_excel(writer, index=False, sheet_name='comparison_results')
                    writer.close()
                excel_data = output.getvalue()
                
                
                print("Specifications not available in datasheet by fetched by the model")
                print(df2)

                
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
        else:
            print("spec_list is empty or None.")
    #.....END OF FUNCTION......#

    # feedback_loop("Correct", specifications, pdf_text)
    feedback_loop("Incorrect", specifications, pdf_text)
    # feedback_loop("Incomplete", specifications, pdf_text)



 print type of pdf_text :  <class 'list'>
type of additional_info:  <class 'str'>
Query rewrited and done a fresh search. Fetched additional information from the internet
JSON data loaded successfully.
Specifications not available in datasheet by fetched by the model
               index                                    0
0         Resolution   3840 x 2160 pixels (from internet)
1       Refresh Rate               120 Hz (from internet)
2        HDR Support                  Yes (from internet)
3        Screen Size            55 inches (from internet)
4       Audio Output                  20W (from internet)
5         HDMI Ports                    4 (from internet)
6          USB Ports                    3 (from internet)
7          Bluetooth                  Yes (from internet)
8              Wi-Fi                  Yes (from internet)
9      Ethernet Port                  Yes (from internet)
10            Weight              18.5 kg (from internet)
11        Dimensions  1230 x 780 x 

  warn("Calling close() on already closed file.")


In [7]:
# data = pd.DataFrame(filtered_specs)