# GPT Use Case 1: Web-scraping and content summarization

Import libraries and declare API keys

In [None]:
import openai
import os
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

GPT_MODEL = "gpt-3.5-turbo-0613"  # select model based on application

# Read the key from the OPENAI_API_KEY environment variable when it is set, so
# a real secret never has to be written into the notebook. The literal below
# is only a placeholder fallback and must not be replaced with a real key.
openai.api_key = API_key = os.environ.get('OPENAI_API_KEY', 'sk-...')

Create functions for web-scraping and GPT execution

In [None]:
def web_scraping(url: str, timeout: float = 30.0) -> list[str]:
    """Download a web page and return the text of every string node in it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before aborting the request.

    Returns:
        One entry per string node that BeautifulSoup finds in the parsed
        HTML (``find_all(string=True)``), without any filtering.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.RequestException: For other request failures, including a
            timeout.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail early instead of parsing (and later summarizing) an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect every string that exists within the page.
    list_contents = [contents.get_text() for contents in soup.find_all(string=True)]
    return list_contents

In [None]:
def run_prompt(prompt: str) -> str:
    """Send ``prompt`` to the chat model as one user message and return the reply text.

    Args:
        prompt: Text placed in the single ``user`` message of the request.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    user_message = {'role': 'user', 'content': prompt}
    completion = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=[user_message],
        temperature=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [None]:
def create_summarize_template(strings: "str | list[str]") -> str:
    """Build the summarization prompt around the scraped web content.

    Args:
        strings: The web content. A single string is embedded as-is. Any other
            iterable (such as the list returned by ``web_scraping``) has each
            item stripped, empty items dropped, and the rest joined with
            newlines, so the prompt does not carry a Python list repr.

    Returns:
        The prompt text: fixed instructions, the web content, and the list of
        JSON keys the reply should contain.
    """
    if not isinstance(strings, str):
        fragments = (str(item).strip() for item in strings)
        strings = "\n".join(fragment for fragment in fragments if fragment)

    summarize_template = (
        """You have no previous memories.\n"""
        """You are professional in summarizing contents with rational logic.\n"""
        """Your task is to summarize provided web-content into a well-structured list of information.\n"""
        f"""web-content: {strings}\n"""
        """Please return the response in JSON schema with the following keys:\n"""
        """'title', 'description', 'price', 'mileage', 'brand', 'model', 'year', 'color', 'fuel_type', 'transmission', 'location', 'contact'\n"""
    )
    return summarize_template

Scrape the web pages and run GPT

In [None]:
# Create a list of web urls
list_url = [
    'https://rod.kaidee.com/product-368200692',
    'https://rod.kaidee.com/product-368040520',
    'https://rod.kaidee.com/product-367246566',
]

# Declare path for result export
export_path = './1_GPT_web_scraping_and_summarize_report'
csv_path = os.path.join(export_path, 'web_content_summary.csv')

# to_csv does not create missing directories, so make sure the folder exists
os.makedirs(export_path, exist_ok=True)

# Remove any existing csv file, because each url below appends to it
if os.path.exists(csv_path):
    os.remove(csv_path)

# Extract contents from each url and run GPT
for url in list_url:
    contents = web_scraping(url)
    summarize_template = create_summarize_template(contents)
    result = run_prompt(summarize_template)

    # Format response into DataFrame: one (key, value) row per JSON entry
    try:
        summarized_content = json.loads(result)
    except json.JSONDecodeError as error:
        # One malformed reply should not discard the remaining urls
        print(f'Skipping {url}: GPT response is not valid JSON ({error})')
        continue
    summarized_df = pd.DataFrame.from_dict(summarized_content.items())

    empty_row = pd.DataFrame(None, index=[0])  # blank separator row placed before each url
    original_url = pd.DataFrame({'url': url}, index=[0])  # url the response belongs to

    temp_summarized_df = pd.concat([original_url, summarized_df]).reset_index(drop=True)
    final_summarized_df = pd.concat([empty_row, temp_summarized_df]).reset_index(drop=True)

    # Export to csv file (append mode, so every url lands in the same report)
    final_summarized_df.to_csv(csv_path, mode='a', encoding='utf-8-sig', index=False, header=False)

# Generate web-scraping code using an LLM

In [None]:
# Adjacent string literals are concatenated with nothing in between, so each
# instruction ends with an explicit newline to keep the sentences separate.
code_generating_prompt = (
    """Write python code for web-scraping to find all strings using BeautifulSoup4 library\n"""
    """Use urllib to access website with headers\n"""
    """Declare url to scrap as 'https://rod.kaidee.com/product-368200692'"""
)

# Send the prompt as a single user message
response = openai.ChatCompletion.create(
    model=GPT_MODEL,
    messages=[
        {'role': 'user', "content": code_generating_prompt}
    ],
    temperature=0
)

# Print the text of the first returned choice, i.e. the generated code
generated_command = response['choices'][0]['message']['content']
print(generated_command)