In [60]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import requests
import os
import json
from main import *

In [3]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# API credentials; each one is None when its variable is not set.
serper_api_key = os.environ.get("SERPER_API_KEY")
scraperapi_api_key = os.environ.get("SCRAPERAPI_API_KEY")

---

# Find content on a topic

In [4]:
# Function for searching websites.
def perform_serper_search(
    query,
    api_key,
    geo="us",
    lang="en",
    top_results=5,
    timeout=30,
):
    """Run a Google search through the Serper API (https://serper.dev/).

    Args:
        query: Search phrase, sent as the ``q`` field of the payload.
        api_key: Serper API key, sent in the ``X-API-KEY`` header.
        geo: Value for the ``gl`` field of the payload.
        lang: Value for the ``hl`` field of the payload.
        top_results: Value for the ``num`` field of the payload.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Fail early with a readable message: an unset environment variable would
    # otherwise only surface as an HTTP error from the API.
    if not api_key:
        raise ValueError("A Serper API key is required (is SERPER_API_KEY set?).")

    endpoint = "https://google.serper.dev/search"
    headers = {"X-API-KEY": api_key, "Content-Type": "application/json"}
    payload = {
        "q": query,
        "gl": geo,
        "hl": lang,
        "num": top_results,
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
    # Surface HTTP errors here instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

In [12]:
# Search phrase for this run.
query = "Pizza service in Mongolia"

# Run the web search; `response` holds the decoded JSON reply.
response = perform_serper_search(query=query, api_key=serper_api_key)

In [6]:
# Split the search response into its top-level sections
# (a name is None when the response has no such section).
search_params, results, locations, related_queries = (
    response.get(section)
    for section in ("searchParameters", "organic", "places", "relatedSearches")
)

In [None]:
# Display the "organic" section of the search response.
results

---

# Extract content from the sources

In [11]:
# Function for getting data from website.
def scrape_url(url, api_key, timeout=70):
    """Fetch a page through ScraperAPI and return its text content.

    Args:
        url: Address of the page to scrape.
        api_key: ScraperAPI key.
        timeout: Seconds to wait for ScraperAPI before giving up.

    Returns:
        The text produced by BeautifulSoup's ``get_text()`` when ScraperAPI
        answers with status 200; otherwise ``None`` (a failure message is
        printed).
    """
    # Hand the query to requests as `params` so the target URL is
    # percent-encoded. Interpolating it into the string lets any "&" or "?"
    # inside `url` be read as extra ScraperAPI parameters.
    params = {"api_key": api_key, "url": url, "render": "true"}

    # HTTPS keeps the API key out of clear text; the timeout prevents an
    # indefinite hang on a stalled request.
    response = requests.get("https://api.scraperapi.com/", params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text()

In [19]:
# Collect the link of every search result into a list.
urls = [result["link"] for result in results]

In [43]:
# Page to scrape: a single hard-coded Reddit thread.
url = "https://www.reddit.com/r/PizzaCrimes/comments/weczmj/two_pizzas_from_this_delivery_place_in_mongolia/"

# Download the page text (None when the request fails).
content = scrape_url(url=url, api_key=scraperapi_api_key)

In [None]:
# scrape_url returns None when the request fails; stop here with a clear
# message instead of an AttributeError on None.splitlines().
if content is None:
    raise ValueError("No content was scraped; check the scrape_url call above.")

# Keep only lines that contain visible characters. Testing `line.strip()`
# rather than `line` also drops whitespace-only lines, which a plain
# truthiness test keeps.
content_raw_cleaned = "\n".join(
    line for line in content.splitlines() if line.strip()
)
print(content_raw_cleaned)

In [57]:
# Text that precedes the first occurrence of "ago"
# (the whole text when "ago" does not occur).
section_above_comments = content_raw_cleaned.partition("ago")[0]
# Text that follows the last occurrence of "More posts you may like"
# (the whole text when that marker does not occur).
section_below_comments = content_raw_cleaned.rpartition("More posts you may like")[2]

In [58]:
# Remove the leading section first, then the trailing one; what remains is
# treated as the comment section.
comment_section = content_raw_cleaned.replace(section_above_comments, "")
comment_section = comment_section.replace(section_below_comments, "")

In [61]:
# Show the text that is left after removing the sections above and below the comments.
print(comment_section)

ago 
MattFromChina
      ADMIN
    
      MOD
    
       Two pizzas from this delivery place in Mongolia.
    
    Identity theft
  
 
 
 
ForeverEntert
•
Promoted
THE HOUSE OF THE DEAD 2: Remake is coming to Nintendo Switch, PC, Xbox, and PlayStation in Spring 2025!
View More 
      store.steampowered.com
    
        Collapse video player
      
Add a comment
                Sort by:
              
Best
Open comment sort options
Best
Top
New
Controversial
Old
Q&A
      QualityVote
    
•
 3y ago 
    Hi! This is our community moderation bot.
  
    If this post is a pizza crime , UPVOTE this comment!!
  
    If this post is innocent, DOWNVOTE This comment!
  
    If this post breaks the rules, DOWNVOTE this comment and REPORT the post!
  
 Reply 
          reply
        
      Chainsaws_n_meth
    
•
 3y ago 
  
 Reply 
          reply
        
[deleted]
•
 3y ago 
    This is a strange "pizza," but I think I will withhold judgement. It looks pretty good, atleast the presentation is

In [77]:
# Single user message asking the model to return the comments as JSON
# under the key 'comments'.
message = [
    {
        "role": "user",
        "content": f"This is html section with comments of reddit. Analyze text and provide list of only the string comments as JSON wrapped in the key 'comments': {comment_section}",
    }
]

# call_gpt comes from the project's `main` module; its reply is parsed with
# json.loads later in the notebook.
structured_comments = call_gpt(message, use_json=True)
print(structured_comments)

{
  "comments": [
    "Two pizzas from this delivery place in Mongolia.",
    "This is a strange 'pizza,' but I think I will withhold judgement. It looks pretty good, atleast the presentation is nice. This is something I would have to try myself to truly know if it's worthy of atleast an honory pizza title.",
    "The flavor combo in the first one is intriguing. I’m not sold on the presentation/dish but I would try something else with that combination of ingredients. Pomegranates+walnuts (satsivi is walnut-based)+cheese+tomato+pepper, sounds kinda pimento-ish. it’s hitting a lot of different, complementary taste buds.",
    "Walnut allergics PAY ATTENTION 🚨",
    "Perhaps the first one resembles Mongolian food? I have no idea…",
    "Not exactly Mongolian style. Was thinking it’s maybe something from the Caucasus region or Georgia.",
    "Those middle layers of…dough?…look entirely raw.",
    "Pomegranate wtf",
    "I’m horrified in a pleased way to see what other cultures develop base

In [78]:
# Decode the JSON string returned by the model, then take the value stored
# under "comments" (None when the key is absent).
structured_payload = json.loads(structured_comments)
comments = structured_payload.get("comments")
comments

['Two pizzas from this delivery place in Mongolia.',
 "This is a strange 'pizza,' but I think I will withhold judgement. It looks pretty good, atleast the presentation is nice. This is something I would have to try myself to truly know if it's worthy of atleast an honory pizza title.",
 'The flavor combo in the first one is intriguing. I’m not sold on the presentation/dish but I would try something else with that combination of ingredients. Pomegranates+walnuts (satsivi is walnut-based)+cheese+tomato+pepper, sounds kinda pimento-ish. it’s hitting a lot of different, complementary taste buds.',
 'Walnut allergics PAY ATTENTION 🚨',
 'Perhaps the first one resembles Mongolian food? I have no idea…',
 'Not exactly Mongolian style. Was thinking it’s maybe something from the Caucasus region or Georgia.',
 'Those middle layers of…dough?…look entirely raw.',
 'Pomegranate wtf',
 'I’m horrified in a pleased way to see what other cultures develop based on the pizza concept. Truly it is the food 

---

# Feed content into paper

In [2]:
# ...