# A simple web scraper for extracting a table
This notebook assumes that we already know a bit about the table.


In [33]:
import os
from dotenv import dotenv_values, load_dotenv, find_dotenv
# Print the current working directory (the relative ".env" path below is resolved against it).
print(os.getcwd())

# Locate a .env file and load it; `dd` keeps whatever load_dotenv returns.
_dotenv_path = find_dotenv()
dd = load_dotenv(_dotenv_path)

# Also read the ".env" file into `config` via dotenv_values.
config = dotenv_values(".env")


/home/lscm/Project/SimpleCode/simple-llm-based-scrapper


## web scraper

In [31]:
import requests

from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of the web page that contains the table related to the HS code
url = "https://www.transcustoms.com/China_HS_Search.asp?word=271019"

# Send an HTTP GET request to the website. An explicit timeout is passed because
# requests does not time out by default, so an unresponsive server would otherwise
# block this cell indefinitely.
response = requests.get(url, timeout=30)

# Fail fast on 4xx/5xx responses so that an error page is never parsed and
# forwarded to the LLM as if it were the table page.
response.raise_for_status()

# Parse the HTML code using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')


## LLM Part

In [34]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
# Change the environment variables below to match your own settings
################################################################

# Maps each AzureChatOpenAI keyword argument to the environment variable that supplies it.
_AZURE_ENV_VARS = {
    "deployment_name": "AZURE_OPENAI_MODEL_TURBO",
    "openai_api_version": "AZURE_OPENAI_API_VERSION_TURBO",
    "openai_api_key": "AZURE_OPENAI_API_KEY_TURBO",
    "azure_endpoint": "AZURE_OPENAI_ENDPOINT_TURBO",
}

# Wrapping os.getenv() in an f-string would turn an unset variable into the literal
# string "None" and silently pass it on as a key/endpoint. Check up front instead and
# report every missing variable at once.
_missing_env_vars = [var for var in _AZURE_ENV_VARS.values() if not os.getenv(var)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Chat model used for the extraction; temperature=0 is kept from the original setup.
llm_model = AzureChatOpenAI(
    temperature=0,
    verbose=True,
    **{kwarg: os.environ[var] for kwarg, var in _AZURE_ENV_VARS.items()},
)


In [35]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

# Prompt sent to the LLM. `{text}` is the only placeholder; the braces of the JSON
# example are doubled ({{ }}) so the template engine emits them literally.
# The example must itself be valid JSON in shape: the key is written as "result":
# (the closing quote was previously missing, which showed the model a malformed target).
promptTmpl = ChatPromptTemplate.from_template(
    """
                                Please extract the table related to HS编码 and HS系统商品规范中英文名称 from the following text:
                                {text}
                                
                                Return a JSON object with the following format:
                                
                                ```json
                                {{"result":[{{"HS编码":<hscode>, "HS系统商品规范中英文名称":<product>}}, {{"HS编码":<hscode>, "HS系统商品规范中英文名称":<product>}}]}}
                                ```
                                """)

def exractTable(text):
    """Ask the LLM to extract the HS-code table from ``text``.

    The prompt is sent to the model exactly once and the reply is then parsed as
    JSON locally. On success the first value of the parsed JSON object is returned
    (the prompt asks for a single ``"result"`` key). On any parsing problem the raw
    reply is returned instead, so the caller can try to salvage it.

    Args:
        text: Text of the page; substituted for ``{text}`` in ``promptTmpl``.

    Returns:
        A 3-tuple ``(result, prompt_text, OK)``:
          * ``result``: first value of the parsed JSON object when ``OK`` is True,
            otherwise the raw string reply of the model.
          * ``prompt_text``: the prompt produced by ``promptTmpl.format(text=text)``.
          * ``OK``: True only when the reply parsed into an object with at least one value.

    Raises:
        Whatever the LLM call itself raises (errors from the model invocation are
        not swallowed).
    """
    query = {"text": text}

    # One request only: re-invoking the model after a parse failure would double the
    # cost and could return a different reply than the one that failed to parse.
    raw_reply = (promptTmpl | llm_model | StrOutputParser()).invoke(query)

    try:
        parsed = JsonOutputParser().parse(raw_reply)
        # Take the first value regardless of the key name the model chose.
        result = list(parsed.values())[0]
        OK = True
    except Exception:
        # Covers invalid JSON as well as a non-object or empty payload.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print("============== Invalid JSON =================> stroutputparser used")
        result = raw_reply
        OK = False

    prompt_text = promptTmpl.format(text=text)
    return result, prompt_text, OK

## Run `exractTable`

In [36]:
# Pass the text of the parsed page to the extractor and unpack its
# (result, prompt_text, OK) return value.
page_text = soup.text
result, prompt_text, OK = exractTable(page_text)

In [47]:
from json import JSONDecoder

def extract_json_objects(text, decoder=JSONDecoder()):
    """Find JSON objects in text, and yield the decoded JSON data.

    Scans ``text`` left to right for ``{`` characters and tries to decode a JSON
    object at each one. After a successful decode the scan resumes right after the
    decoded object, so objects nested inside it are not yielded separately. A ``{``
    that does not start valid JSON is skipped.

    Does not attempt to look for JSON arrays, text, or other JSON types outside
    of a parent JSON object.

    Args:
        text: String that may contain JSON objects surrounded by other text.
        decoder: ``json.JSONDecoder`` used for decoding.

    Yields:
        Each decoded top-level JSON object, in order of appearance.
    """
    pos = 0
    while True:
        start = text.find('{', pos)
        if start == -1:
            return
        try:
            # Decode in place from `start` instead of slicing `text[start:]`, which
            # would copy the remainder of the string for every candidate brace.
            # `end` is an absolute index into `text`.
            obj, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not valid JSON at this brace; try the next one.
            pos = start + 1
            continue
        yield obj
        pos = end

# data = """ {\n  "result": [\n    {\n      "HS编码": "2710199910",\n      "HS系统商品规范中英文名称": "White oil (a colorless and transparent oil-like liquid composed of liquid hydrocarbon mixture, obtained from crude oil fractionation) the commercial component is 100% white mineral oil, and the viscosity of the product is 65 at 40 ℃"\n    },\n    {\n      "HS编码": "2710199200",\n      "HS系统商品规范中英文名称": "Lubricating grease, other those containing biodiesel and other than waste oils"\n    },\n    {\n      "HS编码": "2710192200",\n      "HS系统商品规范中英文名称": "Fuel oils No.5- No.7, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710192990",\n      "HS系统商品规范中英文名称": "Other diesel oils and other fuel oils, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710191100",\n      "HS系统商品规范中英文名称": "Aviation kerosene, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710199400",\n      "HS系统商品规范中英文名称": "Liquid paraffin and heavy liquid paraffin, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710191910",\n      "HS系统商品规范中英文名称": "Normal paraffin(C9-C13), other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710192910",\n      "HS系统商品规范中英文名称": "Paraffin oils(which less than 20% by volume distils at below 350℃, which more than 80% by volume distils at below 550℃), other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710191920",\n      "HS系统商品规范中英文名称": "Isoalkane solvent, excluding biodiesel with initial boiling point of 225 ℃, flash point of 92 ℃, density of 0.79g/cm3"\n    },\n    {\n      "HS编码": "2710191990",\n      "HS系统商品规范中英文名称": "Other kerosene distillages oils and preparations, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710191200",\n      "HS系统商品规范中英文名称": "Lamp-kerosene, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710199390",\n      "HS系统商品规范中英文名称": "Basic oils for lubricating oils, other than those containing 
# biodiesel"\n    },\n    {\n      "HS编码": "2710192300",\n      "HS系统商品规范中英文名称": "Diesel oil for Vehicle,excluding biodiesel"\n    },\n    {\n      "HS编码": "2710199100",\n      "HS系统商品规范中英文名称": "Lubricating oils, other than those containing biodiesel and other than waste oils"\n    },\n    {\n      "HS编码": "2710199310",\n      "HS系统商品规范中英文名称": "Basic oils for lubricating oils, other than those containing biodiesel"\n    },\n    {\n      "HS编码": "2710199990",\n      "HS系统商品规范中英文名称": "Other heavy oil; other heavy oil products, excluding biodiesel, including products with oil content ≥ 70% by weight"\n    }\n  ]\n}\n"""
# json_s = extract_json_objects(data)
# for item in json_s:
#     print(item)

In [50]:
import pprint


if OK:
    # The reply parsed as JSON: `result` already holds the extracted value.
    pprint.pprint(result)
else:
    # Here `result` is the raw text reply, so search it for embedded JSON objects.
    # (The previous code referenced an undefined name `data`.)
    # extract_json_objects is a generator; it is materialised into a list because
    # generators support neither len() nor indexing.
    json_s = list(extract_json_objects(result))
    if len(json_s) > 0:
        pprint.pprint(json_s[0])
    else:
        print("Something is wrong, this is the raw text: ", result)

[{'HS系统商品规范中英文名称': 'White oil (a colorless and transparent oil-like liquid '
                   'composed of liquid hydrocarbon mixture, obtained from '
                   'crude oil fractionation) the commercial component is 100% '
                   'white mineral oil, and the viscosity of the product is 65 '
                   'at 40 ℃',
  'HS编码': '2710199910'},
 {'HS系统商品规范中英文名称': 'Lubricating grease, other those containing biodiesel and '
                   'other than waste oils',
  'HS编码': '2710199200'},
 {'HS系统商品规范中英文名称': 'Fuel oils No.5- No.7, other than those containing '
                   'biodiesel',
  'HS编码': '2710192200'},
 {'HS系统商品规范中英文名称': 'Other diesel oils and other fuel oils, other than those '
                   'containing biodiesel',
  'HS编码': '2710192990'},
 {'HS系统商品规范中英文名称': 'Aviation kerosene, other than those containing biodiesel',
  'HS编码': '2710191100'},
 {'HS系统商品规范中英文名称': 'Liquid paraffin and heavy liquid paraffin, other than '
                   'those con