# Import Libraries

In [3]:
from openai import OpenAI
from dotenv import load_dotenv
import nest_asyncio
import os

import base64
import json
from time import perf_counter
from typing import  Union

import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

import html_text
import nest_asyncio
from multidict import CIMultiDict
from openai import OpenAI
from openai.types.chat.chat_completion import ChatCompletion
from pydantic import BaseModel
from w3lib.encoding import html_to_unicode, resolve_encoding
from zyte_api import ZyteAPI
pd.set_option('display.max_columns', None)

# Load env file

In [4]:
# Pull the variables defined in the local .env file into the process environment
load_dotenv()

# API credentials looked up from the environment (None when a variable is unset)
zyte_api_key = os.environ.get('ZYTE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

# One client per remote service, each built with its own key
client_zyte = ZyteAPI(api_key=zyte_api_key)
client_openai = OpenAI(api_key=openai_api_key)

# nest_asyncio patches asyncio to allow nested event loops
# (presumably needed because the notebook already runs a loop -- confirm)
nest_asyncio.apply()

# Chat model and sampling temperature shared by the extraction helpers below
model, temperature = "gpt-3.5-turbo", 0

# Test API Key

In [5]:
# Smoke-test the Zyte key: request one search page (raw body, headers and a
# product extraction taken from that body) and show the status code returned.
client_zyte.get(
    dict(
        url="https://www.rumah123.com/jual/cari/?q=rumah%20jabodetabek",
        httpResponseBody=True,
        httpResponseHeaders=True,
        product=True,
        productOptions=dict(extractFrom="httpResponseBody"),
    )
)["statusCode"]

200

In [6]:
# Smoke-test the OpenAI key: send a one-word greeting and show the reply text.
client_openai.chat.completions.create(
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="Hello"),
    ],
    model="gpt-3.5-turbo",
    temperature=0,
).choices[0].message.content

'Hello! How can I assist you today?'

# Create a function

In [7]:
def _auto_detect_encoding(body: bytes) -> Union[str, None]:
    for encoding in ["utf8", "cp1252"]:
        try:
            body.decode(encoding)
        except UnicodeError:
            continue
        return resolve_encoding(encoding)


def _bytes_to_html(body: bytes, headers: list[dict]) -> str:
    """Decode a raw response body into text.

    `headers` is a list of dicts carrying "name" and "value" keys. The
    Content-Type header (looked up case-insensitively) is handed to
    `html_to_unicode` together with the body; `_auto_detect_encoding` is the
    detection callback and "utf8" the default encoding.
    """
    header_pairs = [(header["name"], header["value"]) for header in headers]
    content_type = CIMultiDict(header_pairs).get("Content-Type")
    # html_to_unicode returns an (encoding, text) pair; only the text is needed.
    _encoding, markup = html_to_unicode(
        content_type,
        body,
        auto_detect_fun=_auto_detect_encoding,
        default_encoding="utf8",
    )
    return markup


def _get_html(web_page: dict):
    try:
        return web_page["html"]
    except KeyError:
        body = base64.b64decode(web_page["httpResponseBody"])
        headers = web_page["httpResponseHeaders"]
        return _bytes_to_html(body, headers)


def get_html_with_zapi(url: str, browser=False) -> Union[str, None]:
    """Fetch `url` through the Zyte client and return its HTML.

    With `browser` falsy (the default) the raw HTTP body and headers are
    requested and decoded via `_get_html`. With `browser` truthy the
    "browserHtml" field of the response is returned instead.
    """
    if not browser:
        http_query = {
            "url": url,
            "httpResponseBody": True,
            "httpResponseHeaders": True,
        }
        return _get_html(client_zyte.get(http_query))

    rendered_page = client_zyte.get({"url": url, "browserHtml": True})
    return rendered_page['browserHtml']

In [8]:
# Utilities to extract using OpenAI API
def extract_gpt_unstructured(text: str, data_to_extract: str) -> ChatCompletion:
    """Ask the chat model to pull free-form data out of `text`.

    `data_to_extract` is a plain-language description of the wanted data; it
    is embedded in the prompt together with `text`. The completion object
    returned by the OpenAI client is handed back unchanged.
    """
    # Template lines sit at column 0 so no indentation leaks into the prompt.
    prompt = f"""
Extract data from the following text or web page:

[TEXT START]

{text}

[TEXT END]

This is the data you must extract: {data_to_extract}

Note: Some requested data might not be available. Specify it explicitly in that case.
""".strip()

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    return client_openai.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
    )


def extract_gpt_structured_request_in_prompt(text: str, schema: dict) -> ChatCompletion:
    """Ask the chat model for a JSON extraction of `text` described by `schema`.

    `schema` is serialised to JSON and pasted into the prompt. Nothing here
    validates the reply, so the returned completion may contain text that is
    not parsable JSON or does not follow the schema.
    """
    schema_json = json.dumps(schema, indent=4, ensure_ascii=False)
    # Template lines sit at column 0 so no indentation leaks into the prompt.
    prompt = f"""
Extract data from the following text or web page:

[TEXT START]

{text}

[TEXT END]

Be sure to output only a json with the extraction.
The json with extracted data must be compliant with this json schema:

{schema_json}

If there's any value you cannot find, set it as null in the extraction json.
""".strip()

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    return client_openai.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
    )


def extract_gpt_structured(text: str, schema: BaseModel) -> ChatCompletion:
    """Ask the chat model for an extraction of `text` shaped by `schema`.

    `schema` is forwarded as `response_format` to the client's
    `beta.chat.completions.parse` call, and the resulting completion is
    returned unchanged.

    NOTE(review): `response_format` presumably expects the pydantic model
    class rather than an instance -- confirm against callers.
    NOTE(review): structured output is only available on some models; confirm
    the module-level `model` supports it.
    """
    # Template lines sit at column 0 so no indentation leaks into the prompt.
    prompt = f"""
Extract data from the following text or web page, according to the given schema.

Here's the text from which you have to extract the data:

[TEXT START]

{text}

[TEXT END]

If there's any value you cannot find, set as null in the extraction json.
""".strip()

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    return client_openai.beta.chat.completions.parse(
        model=model,
        messages=conversation,
        temperature=temperature,
        response_format=schema,
    )

# Scraping Link

In [None]:
# Collected listing records: one dict (title + relative URL) per property card
property_links = []

# The request headers are the same for every page, so build them once
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Walk the paginated search results (pages 1 to 70; adjust the range as needed)
for current_page in range(1, 71):
    page_url = f"https://www.rumah123.com/jual/cari/?q=rumah+jabodetabek&page={current_page}"

    # Download the page. The timeout stops a stalled connection from hanging the
    # crawl, and a network error only skips this page instead of aborting the
    # whole run and losing the links gathered so far.
    try:
        response = requests.get(page_url, headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f"Failed to fetch page {current_page}. Error: {error}")
    else:
        if response.status_code == 200:
            # Parse the HTML with BeautifulSoup
            page_soup = BeautifulSoup(response.text, 'html.parser')

            # Visit every property card on the page
            for property_item in page_soup.find_all('div', {'class': 'card-featured__middle-section'}):
                # The title and link both live on the <a> tag that has a 'title' attribute
                link_tag = property_item.find('a', {'title': True})
                if not link_tag:
                    # A card without a link carries no usable data; skipping it
                    # avoids writing an all-empty row to the CSV.
                    continue

                property_links.append(
                    {
                        'property_title': link_tag.get('title'),
                        'property_url': link_tag.get('href'),
                    }
                )

            print(f"Page {current_page} processed successfully.")
        else:
            print(f"Failed to fetch page {current_page}. Status code: {response.status_code}")

    # Pause briefly before requesting the next page
    sleep(1)

# Build the dataframe; naming the columns keeps the CSV header present even
# when no link was collected.
properties_df = pd.DataFrame(property_links, columns=['property_title', 'property_url'])

# Save to CSV.
# NOTE(review): a later cell reads 'data/link_properties.csv' -- confirm the
# file is moved into 'data/' before that cell runs.
properties_df.to_csv('link_properties.csv', index=False)
print("Data has been saved to link_properties.csv")

# Optimize Scraping Data

In [12]:
# Load the scraped listing links and preview the first five relative URLs
link123 = pd.read_csv('data/link_properties.csv')
link123['property_url'].head(5)

0    /properti/jakarta-selatan/hos18392523/
1      /properti/jakarta-utara/hos16808954/
2              /properti/depok/hos18861745/
3              /properti/depok/hos18570241/
4      /properti/jakarta-utara/hos16543371/
Name: property_url, dtype: object

In [None]:
# Fields requested from the model, keyed by output column name. Defined once
# outside the loop because it is identical for every listing.
schema = {
    "title": {"type": "string", "description": "The title of the house"},
    "description": {"type": "string", "description": "The description of the house"},
    "price": {"type": "number", "description": "The price of the house"},
    "address": {"type": "string", "description": "The address of the house"},
    "city": {"type": "string", "description": "The city of the house"},
    "land_size_m2": {"type": "number", "description": "The landsize (LT) without m2 of the house, if there is NaN fill 0"},
    "building_size_m2": {"type": "number", "description": "The buildingsize (LB) without m2 of the house, if there is NaN fill 0"},
    "bedroom": {"type": "number", "description": "The number of bedroom in the house, if there is NaN fill 0"},
    "bathroom": {"type": "number", "description": "The number of bathroom in the house, if there is NaN fill 0"},
    "garage": {"type": "number", "description": "The number of garage in the house, only the number and string that means number, if there is NaN fill 0"},
    "carport": {"type": "number", "description": "The number of carport in the house if there is NaN fill 0"},
    "property_type": {"type": "string", "description": "The type of the property, only if property_type = house"},
    "certificate": {"type": "string", "description": "The certificate of the house, if there is Null fill Not Specified"},
    # Typed as a number, so the description must not ask for a text placeholder.
    "voltage_watt": {"type": "number", "description": "The electrical power of the house in watt, as a number without the unit"},
    "maid_bedroom": {"type": "number", "description": "The number of maid bedroom in the house, if there is NaN fill 0"},
    "maid_bathroom": {"type": "number", "description": "The number of maid bathroom in the house, if there is NaN fill 0"},
    "kitchen": {"type": "number", "description": "The number of kitchen in the house, if there is NaN fill 0"},
    "dining_room": {"type": "number", "description": "The number of dining room in the house, if there is NaN fill 0"},
    "living_room": {"type": "number", "description": "The number of living room in the house, if there is NaN fill 0"},
    "furniture": {"type": "string", "description": "The furnishing status of the house", "enum": ["Semi Furnished", "Furnished", "Unfurnished"]},
    "building_material": {"type": "string", "description": "The building material of the house"},
    "floor_material": {"type": "string", "description": "The floor material of the house"},
    "floor_level": {"type": "number", "description": "The number of floor level in the house, if there is NaN fill 0"},
    "house_facing": {"type": "string", "description": "The direction the house faces", "enum": ["North", "South", "East", "West", "Southeast", "Southwest", "Northeast", "Northwest"]},
    "concept_and_style": {"type": "string", "description": "The concept and style of the house"},
    "view": {"type": "string", "description": "The view from the house"},
    "internet_access": {"type": "string", "description": "Whether the house has internet access"},
    "road_width": {"type": "string", "description": "The road width in front of the house"},
    "year_built": {"type": "number", "description": "The year the house was built"},
    "year_renovated": {"type": "number", "description": "The year the house was last renovated"},
    "water_source": {"type": "string", "description": "The water source for the house"},
    "corner_property": {"type": "boolean", "description": "Whether the house is a corner property (hook)"},
    "property_condition": {"type": "string", "description": "The condition of the property"},
    "ad_type": {"type": "string", "description": "The type of advertisement for the property"},
    "ad_id": {"type": "string", "description": "The ID of the advertisement"}
}

# A checkpoint is written after every N-th listing so a crash loses little work
CHECKPOINT_EVERY = 2
CHECKPOINT_PATH = "temp_data.json"


def _save_checkpoint(raw_replies):
    """Write the raw model replies collected so far to the checkpoint file."""
    with open(CHECKPOINT_PATH, "w", encoding="utf-8") as f:
        json.dump(raw_replies, f)


def _parse_json_reply(raw_reply):
    """Parse the JSON object embedded in a model reply.

    Only the span from the first '{' to the last '}' is parsed, which drops a
    surrounding Markdown code fence (or other chatter) without touching the
    field values themselves.

    Raises:
        json.JSONDecodeError: when the reply contains no parsable JSON object.
    """
    start = raw_reply.find("{")
    end = raw_reply.rfind("}")
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON object found", raw_reply, 0)
    # strict=False tolerates raw newlines inside string values.
    return json.loads(raw_reply[start:end + 1], strict=False)


# Raw (unparsed) model replies, one per successfully processed listing
list_container = []

# Absolute URLs of the listings to scrape (first five links only)
property_urls = [
    f"https://www.rumah123.com{path}" for path in link123['property_url'][0: 5]
]

# Process each listing: fetch, flatten to text, then ask the model to extract
for idx, url in enumerate(property_urls):
    try:
        print(f"Processing index-{idx} with link {url}")

        # Fetch the HTML through the Zyte API (plain HTTP, no browser rendering)
        html = get_html_with_zapi(url, browser=False)

        # Reduce the HTML to plain text
        text = html_text.extract_text(html, guess_layout=True)

        # Request the structured extraction described by the schema
        completion = extract_gpt_structured_request_in_prompt(text=text, schema=schema)

        # Keep the reply text, trimmed of surrounding whitespace
        completion_text = completion.choices[0].message.content.strip()
        list_container.append(completion_text)

        # Periodic checkpoint
        if idx % CHECKPOINT_EVERY == 0:
            _save_checkpoint(list_container)
            print(f"Progress saved at index-{idx}.")

    except Exception as e:
        # Per-listing boundary: report the failure and carry on with the next URL
        print(f"Error processing index-{idx}: {e}")

    # Delay between listings to avoid being blocked by the server
    sleep(1)

# Final checkpoint, written even when the last listing failed
_save_checkpoint(list_container)
print("Final progress saved.")

# Parse every raw reply into a dict, skipping the ones that are not valid JSON
valid_json = []
for json_str in list_container:
    try:
        valid_json.append(_parse_json_reply(json_str))
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {json_str}")

# Convert the parsed records into a DataFrame
df = pd.DataFrame(valid_json)

# Save the final DataFrame to a CSV file
df.to_csv('data_harga_rumah123.csv', index=False)
print("Data has been saved to data_harga_rumah123.csv.")