In [63]:
from dotenv import load_dotenv
from googleapiclient.discovery import build
import os
import json
from collections import Counter

# Load variables from the local ".env" file into the process environment.
# Presumably this supplies GOOGLE_API_KEY and GOOGLE_CSE_ID, which are read
# from os.environ further down — TODO confirm against the actual .env file.
load_dotenv(".env")

# Make sure the "output" directory exists; exist_ok=True makes this a no-op
# when the directory is already present.
os.makedirs("output", exist_ok=True)

In [8]:
# Load the search queries. The file is decoded explicitly as UTF-8 so that
# non-ASCII query text is read the same way on every platform instead of
# depending on the locale's default encoding.
with open("queries/search_queries.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

# Flatten the {category: [query, ...]} mapping into one
# {"query": ..., "category": ...} record per query, keeping file order.
queries_list = [
    {"query": q, "category": cat}
    for cat, cat_queries in queries.items()
    for q in cat_queries
]

In [5]:
# Build a client for version "v1" of the "customsearch" API. The developer key
# is taken from the GOOGLE_API_KEY environment variable; indexing os.environ
# raises KeyError if the variable is not set.
service = build("customsearch", "v1", developerKey=os.environ["GOOGLE_API_KEY"])
# Resource object whose list(...) method is used below to issue each search.
google_search = service.cse()

In [7]:
def format_item(item: dict) -> dict:
    """Flatten one search-result item in place and return it.

    The ``kind``, ``htmlFormattedUrl`` and ``pagemap`` keys are removed, and the
    ``metatags`` list found inside ``pagemap`` is stored under a top-level
    ``metatags`` key (an empty list when the result carries no metatags).

    The function tolerates items that lack any of those keys, so it can be
    called again on an already formatted item (e.g. when a notebook cell is
    re-run) without raising or losing the previously extracted metatags.

    Args:
        item: Result dictionary to normalise. It is modified in place.

    Returns:
        The same ``item`` object, after modification.
    """
    # pop(..., None) instead of pop(...) so a missing key is not a KeyError.
    for key in ("kind", "htmlFormattedUrl"):
        item.pop(key, None)

    # Always drop "pagemap", even when it is present but empty.
    pagemap = item.pop("pagemap", None) or {}
    if "metatags" in pagemap:
        item["metatags"] = pagemap["metatags"]
    else:
        # No metatags in this pagemap: keep whatever an earlier call stored,
        # otherwise fall back to an empty list.
        item.setdefault("metatags", [])
    return item

In [10]:
# Run every query through the search API and collect one record per result.
url_items = []

for i, query_d in enumerate(queries_list):
    query, category = query_d["query"], query_d["category"]
    print(f"Searching for {query}, {i}/{len(queries_list)}")

    # Ask for 10 results per query with an English interface language.
    # dateRestrict="w2": per the Custom Search API reference this limits
    # results to the past two weeks.
    request = google_search.list(
        q=query,
        cx=os.environ["GOOGLE_CSE_ID"],
        num=10,
        hl="en",
        # cr="countryRU|countryUS",
        # lr="lang_ru|lang_en",
        # gl="ru",
        dateRestrict="w2",
    )
    res = request.execute()
    items = res.get("items", [])

    if len(items) > 0:
        # Normalise each result and remember which query/category produced it.
        url_items.extend(
            {
                "url": formatted["link"],
                "query": query,
                "category": category,
                "item": formatted,
            }
            for formatted in map(format_item, items)
        )
    else:
        print(f"No results for {query}")

Searching for Latest products in the steel industry, 0/69
Searching for New initiatives by leading steel companies, 1/69
Searching for Competitor analysis steel industry, 2/69
Searching for Innovations in steel production, 3/69
Searching for New steel products market trends, 4/69
Searching for Innovative data storage solutions for the steel industry, 5/69
Searching for Data management trends in metallurgy, 6/69
Searching for Best practices for data storage in manufacturing, 7/69
Searching for Cloud solutions for steel manufacturing, 8/69
Searching for Big data in steel industry, 9/69
Searching for AI applications in steel industry, 10/69
Searching for Artificial intelligence in metallurgy, 11/69
Searching for Top startups in steel and metallurgy, 12/69
Searching for Innovative AI solutions for manufacturing, 13/69
Searching for Machine learning in steel production, 14/69
Searching for New regulations in the steel industry, 15/69
Searching for Legislative changes in metallurgy, 16/69
Se

In [13]:
# Save URLs
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# explicitly as UTF-8; relying on the platform default encoding could raise
# UnicodeEncodeError or produce a file that is not valid UTF-8 JSON.
with open("output/urls.json", "w", encoding="utf-8") as f:
    json.dump(url_items, f, indent=2, ensure_ascii=False)

In [17]:
# Gather every distinct, non-empty "og:type" value found in the results' metatags.
og_types = set()
for entry in url_items:
    og_types.update(
        tags["og:type"]
        for tags in entry["item"]["metatags"]
        if tags.get("og:type")
    )

In [18]:
# Show the distinct og:type values that were collected. The recorded output
# mixes casings (e.g. 'Article' and 'article').
og_types

{'Article',
 'Journal',
 'Journal Issue',
 'WebPage',
 'Website',
 'activity',
 'article',
 'articles',
 'company',
 'event_ticketing:event',
 'king:photo',
 'product',
 'profile',
 'video.other',
 'website'}

## Filter and Count Duplicates

In [58]:
def filter_urls(urls):
    """Return the entries whose metatags declare an article-like ``og:type``.

    An entry is kept when at least one dictionary in
    ``entry["item"]["metatags"]`` has a non-empty ``og:type`` that starts with
    ``"article"``, compared case-insensitively. Input order is preserved and
    each kept entry appears once.
    """

    def _is_article(entry):
        # any() stops at the first matching metatag block.
        return any(
            (tags.get("og:type") or "").lower().startswith("article")
            for tags in entry["item"]["metatags"]
        )

    return [entry for entry in urls if _is_article(entry)]


def deduplicate_urls(url_items):
    """Collapse entries that share a ``url`` and record how often each occurred.

    The first entry seen for a URL is kept (as a shallow copy) and gains a
    ``count`` key holding the number of occurrences of that URL. Output order
    follows first appearance.

    The input dictionaries are not modified. An entry that already carries a
    ``count`` (for example the output of an earlier call) contributes that
    value instead of 1, so applying the function to its own output leaves the
    counts unchanged rather than resetting them to 1.

    Args:
        url_items: Iterable of dictionaries, each with a ``url`` key.

    Returns:
        A list with one dictionary per distinct URL, each including ``count``.
    """
    merged = {}
    for item in url_items:
        url = item["url"]
        weight = item.get("count", 1)
        if url in merged:
            merged[url]["count"] += weight
        else:
            # Shallow copy so the caller's dictionary is left untouched;
            # nested values are still shared with the input.
            merged[url] = {**item, "count": weight}
    return list(merged.values())

In [24]:
# Keep only article-like results, then collapse duplicate URLs.
# Note: this rebinds `url_items`, so re-running this cell operates on the
# already filtered and de-duplicated list rather than on the raw results.
url_items = filter_urls(url_items)
url_items = deduplicate_urls(url_items)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# explicitly as UTF-8 rather than with the platform's default encoding.
with open("output/urls_filtered.json", "w", encoding="utf-8") as f:
    json.dump(url_items, f, indent=2, ensure_ascii=False)

In [62]:
# Number of entries currently held in url_items (the filtered, de-duplicated
# list when the notebook is run top to bottom).
len(url_items)

206