In [1]:
# CHECK IF YOU ARE STILL BLOCKED BY GOOGLE 12PM APR 3 (1-24 HOURS IF TEMPORARY, SEVERAL DAYS TO WEEKS IF AGGRESSIVE, RARE PERMANENT BLOCK)
# SEE IF YOU CAN USE THE SAME API KEY LATER ON, IF NOT GENERATE A NEW ONE WITH SCRAPINGDOG (200 API KEYS AVAILABLE IN THE NEXT MONTH)
# 1 API KEY CAN SCRAPE ~4300 RESULTS BEFORE GETTING BLOCKED

import os
import random
import time

import requests

# Configuration
# The ScrapingDog key is a credential, so it is read from the environment
# rather than stored in the notebook. Set SCRAPINGDOG_API_KEY before running;
# when it is unset the key is an empty string and the API call will be rejected.
API_KEY = os.environ.get("SCRAPINGDOG_API_KEY", "")
BASE_URL = "https://api.scrapingdog.com/google"
TEST_QUERY = "weather today"  # Generic test query

def check_google_block():
    """Check if Google is blocking the API key with a minimal test request.

    Sends one single-result search for ``TEST_QUERY`` to ``BASE_URL`` using the
    module-level ``API_KEY`` and prints a verdict for each outcome: a usable
    result, an empty result set, HTTP 403, HTTP 429, any other status, a
    response body that cannot be parsed, or a transport-level failure.

    Returns:
        None. The outcome is reported with ``print`` only.
    """
    params = {
        "api_key": API_KEY,
        "query": TEST_QUERY,
        "results": 1,  # Only 1 result to reduce footprint
        "country": "us",
        "advance_search": "false"
    }

    # Add slight random delay
    time.sleep(random.uniform(1, 3))

    # Only the HTTP call itself is reported as a connection failure, so payload
    # problems further down are not mislabelled as network issues.
    try:
        response = requests.get(BASE_URL, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"🚨 Connection failed: {str(e)}")
        print("This could indicate a block or network issue")
        return

    if response.status_code == 200:
        try:
            data = response.json()
            organic = data.get('organic_results')
            if organic:
                first = organic[0]
                title, link = first['title'], first['link']
                print("✅ Success! You're NOT blocked. Sample result:")
                print(f"Title: {title}")
                print(f"Link: {link}")
            else:
                print("⚠️ API working but no results returned (possible soft block)")
        except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
            # Invalid JSON raises ValueError; the other types cover a payload
            # whose shape is not the expected dict -> list -> dict.
            print(f"⚠️ HTTP 200 but the response could not be parsed: {e!r} | Response: {response.text[:200]}")
    elif response.status_code == 403:
        print("❌ BLOCKED: Google detected scraping (HTTP 403)")
    elif response.status_code == 429:
        print("⏳ RATE LIMITED: Too many requests (HTTP 429)")
    else:
        print(f"⚠️ Unexpected status: {response.status_code} | Response: {response.text[:200]}")

# Entry point for this cell: announce the check, then run it once.
if __name__ == "__main__":
    print("Running Google block check...")
    check_google_block()

Running Google block check...
❌ BLOCKED: Google detected scraping (HTTP 403)


In [6]:
# LOAD

import requests
import pandas as pd
import csv
import os
from datetime import datetime
from bs4 import BeautifulSoup
import time
import random

In [None]:
# INTERNATIONAL GENERAL (DONE - FILTER IN POST)

# Configuration
# The ScrapingDog key is a credential, so it is read from the environment
# rather than stored in the notebook. Set SCRAPINGDOG_API_KEY before running;
# when it is unset the key is an empty string and the API call will be rejected.
API_KEY = os.environ.get("SCRAPINGDOG_API_KEY", "")
BASE_URL = "https://api.scrapingdog.com/google"
OUTPUT_FILE = os.path.expanduser("~/Desktop/trial1.csv")
# Search phrases; each one is queried page by page in fetch_and_save_results().
QUERIES = [
    "climate database",
    "socioeconomic statistics",
    "climate vulnerability assessment",
    "climate risk model",
    "climate fund",
    "climate adaptation framework",
    "climate policy",
    "climate technology",
    "climate innovation research",
    "climate resilience building",
    "climate adaptation and mitigation strategies",
    "climate strategy monitoring",
    "climate strategy evaluation",
    "climate resilience program",
    "climate adaptation plan",
]

def is_duplicate(result, existing_urls, existing_titles):
    """Check if result is a duplicate based on URL and title.

    Args:
        result: Mapping for one search hit; its 'link' and 'title' entries are
            read. Missing or ``None`` values are treated as empty strings.
        existing_urls: Collection of already-seen URLs, lower-cased and with
            the query string and fragment removed.
        existing_titles: Collection of already-seen titles, lower-cased and
            stripped.

    Returns:
        True if the normalized URL or the normalized title has been seen
        before. An empty URL or empty title never counts as a match, so
        results that lack one of the fields are not collapsed into each other.
    """
    url = (result.get('link') or '').lower().strip()
    title = (result.get('title') or '').lower().strip()

    # Drop the query string and fragment so tracking parameters do not defeat the match
    clean_url = url.split('?')[0].split('#')[0]

    url_seen = bool(clean_url) and clean_url in existing_urls
    title_seen = bool(title) and title in existing_titles
    return url_seen or title_seen

def fetch_and_save_results():
    """Fetch Google results, remove duplicates, and save to CSV.

    For every phrase in ``QUERIES`` this requests pages 0-9 from ``BASE_URL``
    with the module-level ``API_KEY``, keeps the organic results that
    ``is_duplicate`` does not reject, and finally writes all kept rows to
    ``OUTPUT_FILE`` with the columns query, rank, title, link, snippet, page.

    A failure on one page (HTTP error, invalid JSON, unexpected payload) is
    printed and the loop moves on to the next page.

    Returns:
        None. Progress is printed and the rows are written to ``OUTPUT_FILE``;
        nothing is written when no result was collected.
    """
    all_results = []
    seen_urls = set()
    seen_titles = set()

    for query in QUERIES:
        print(f"\nSearching for: '{query}'")

        for page in range(10):  # Pages 0-9
            params = {
                "api_key": API_KEY,
                "query": query,
                "results": 100,
                "country": "us",
                "page": page,
                "advance_search": "false"
            }

            try:
                response = requests.get(BASE_URL, params=params, timeout=15)
                response.raise_for_status()
                data = response.json()
                # `or []` also covers a payload where the key is present but null
                organic_results = data.get('organic_results') or []

                new_results = 0
                for result in organic_results:
                    if is_duplicate(result, seen_urls, seen_titles):
                        continue

                    link = (result.get('link') or '').strip()
                    title = (result.get('title') or '').strip()
                    snippet = result.get('snippet') or ''

                    # The lower-cased, parameter-free form is used only as the
                    # dedupe key; the saved link keeps its original case
                    # because URL paths can be case-sensitive.
                    clean_url = link.lower().split('?')[0].split('#')[0]

                    # Never record an empty key, otherwise every later result
                    # lacking that field would look like a duplicate.
                    if clean_url:
                        seen_urls.add(clean_url)
                    if title:
                        seen_titles.add(title.lower())

                    all_results.append({
                        'query': query,
                        'rank': result.get('rank'),
                        'title': title,
                        'link': link,
                        'snippet': snippet.replace('\n', ' ').strip(),
                        'page': page + 1
                    })
                    new_results += 1

                print(f"Page {page + 1}: {len(organic_results)} results | {new_results} new")

            except Exception as e:
                # Best effort: report the failed page and carry on with the next one
                print(f"Error on page {page + 1}: {str(e)}")

    # Save to CSV
    if all_results:
        fieldnames = ['query', 'rank', 'title', 'link', 'snippet', 'page']

        with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)

        print(f"\nSuccess! Saved {len(all_results)} unique results to {OUTPUT_FILE}")
    else:
        print("No results to save")

# Entry point for this cell: report how many queries will run, then fetch and save them.
if __name__ == "__main__":
    print(f"Starting Google searches for {len(QUERIES)} queries")
    fetch_and_save_results()

In [12]:
# COUNTRY-SPECIFIC (RE-TRY LATER - ADDED TIME DELAY AND HEADERS, CONSIDER ROTATING PROXIES (VPN?) AND HEADERS)

import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd

def extract_google_results(html_content):
    """Extract titles, links, descriptions, and website titles from Google search results.

    Args:
        html_content: HTML text of a search results page.

    Returns:
        A list of dicts with the keys 'title', 'link', 'description' and
        'website_title'. Only items that have both a title and a link are
        included; 'description' and 'website_title' are None when the
        corresponding element is absent.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []

    # Each result is looked up by its container class. NOTE(review): these
    # class names come from Google's markup and presumably change over time;
    # an empty return value may just mean the selectors are stale.
    for item in soup.find_all('div', class_='tF2Cxc'):
        try:
            title_tag = item.find('h3', class_='LC20lb')
            anchor = item.find('a')
            description_tag = item.find('div', class_='VwiC3b')  # Google's snippet
            cite_tag = item.find('cite')  # Google's URL display

            title = title_tag.get_text() if title_tag is not None else None
            # .get() yields None for an anchor without href, so such an item is
            # skipped by the check below instead of raising KeyError.
            link = anchor.get('href') if anchor is not None else None
            description = description_tag.get_text() if description_tag is not None else None
            website_title = cite_tag.get_text() if cite_tag is not None else None

            if title and link:  # Only add if we have basic info
                results.append({
                    'title': title.strip(),
                    'link': link,
                    'description': description.strip() if description else None,
                    'website_title': website_title.strip() if website_title else None
                })
        except Exception as e:
            # Best effort: one malformed item must not discard the rest of the page
            print(f"Error processing result: {e}")

    return results

def scrape_google_search(query, country_code="ph", pages=2, timeout=15):
    """Scrape Google search results for a query with country-specific results.

    Args:
        query: Search phrase sent as the ``q`` parameter.
        country_code: Value for the ``gl`` parameter (e.g., "us", "uk", "ca").
        pages: Number of result pages to request; page ``n`` is requested with
            ``start = n * 10``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the loop indefinitely.

    Returns:
        A list of the dicts produced by ``extract_google_results`` for every
        page that succeeded, each extended with 'query' and 'country'
        (upper-cased code). A page that fails is reported with ``print`` and
        skipped.
    """
    base_url = "https://www.google.com/search"
    all_results = []

    # Pool of User-Agent headers; built once, one is picked at random per request.
    # TODO: consider rotating proxies as well (pass `proxies=` to requests.get).
    headers_list = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"}
    ]

    for page in range(pages):
        try:
            params = {
                "q": query,
                "gl": country_code,  # Country code (e.g., "us", "uk", "ca")
                "start": page * 10  # Google pagination uses 'start' (10 results per page)
            }
            headers = random.choice(headers_list)

            print(f"Scraping page {page + 1} for query: '{query}' (Country: {country_code.upper()})...")
            response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()

            page_results = extract_google_results(response.text)
            for result in page_results:
                result['query'] = query  # Add the search query to each result
                result['country'] = country_code.upper()  # Add country code
            all_results.extend(page_results)

        except Exception as e:
            # Best effort: report the failed page and continue with the next one
            print(f"Error scraping page {page + 1}: {e}")

        # Random delay to avoid rate limiting; applied after failures too, so
        # an error response is not followed by an immediate new request.
        time.sleep(random.uniform(2, 5))

    return all_results

if __name__ == "__main__":
    # List of queries to search
    queries = [
        "administrative service",
        "natural disaster",
        "disaster relief",
        "aid delivery",
        "government protection",
        "capacity building program",
        "resilience training",
        "local adaptation efforts",
        "municipal policy",
        "weather warning",
        "rural improvement",
        "regional law",
        "provincial office",
        "official statistics",
        "climate report"
    ]

    # Country code (e.g., "us", "uk", "in", "au")
    country = "ph"

    # Scrape 2 pages per query
    all_results = []
    for query in queries:
        search_results = scrape_google_search(query, country_code=country, pages=2)
        all_results.extend(search_results)

    # Convert to DataFrame and save as CSV. An empty run is reported instead of
    # saved, so it cannot overwrite a previously collected file with nothing.
    df = pd.DataFrame(all_results)
    print(f"\nTotal results collected: {len(df)}")
    if df.empty:
        print("No results to save; 'philippines.csv' was not written")
    else:
        df.to_csv("philippines.csv", index=False)
        print("Saved to 'philippines.csv'")

Scraping page 1 for query: 'administrative service' (Country: PH)...
Scraping page 2 for query: 'administrative service' (Country: PH)...
Scraping page 1 for query: 'natural disaster' (Country: PH)...
Scraping page 2 for query: 'natural disaster' (Country: PH)...
Scraping page 1 for query: 'disaster relief' (Country: PH)...
Scraping page 2 for query: 'disaster relief' (Country: PH)...
Scraping page 1 for query: 'aid delivery' (Country: PH)...
Scraping page 2 for query: 'aid delivery' (Country: PH)...
Scraping page 1 for query: 'government protection' (Country: PH)...
Scraping page 2 for query: 'government protection' (Country: PH)...
Scraping page 1 for query: 'capacity building program' (Country: PH)...
Scraping page 2 for query: 'capacity building program' (Country: PH)...
Scraping page 1 for query: 'resilience training' (Country: PH)...
Scraping page 2 for query: 'resilience training' (Country: PH)...
Scraping page 1 for query: 'local adaptation efforts' (Country: PH)...
Scraping pa