## This is to crawl all the separate links


In [9]:
import time
import csv
from datetime import datetime

import requests 
import bs4 as bs 
import urllib.request
import sqlite3
import re


In [166]:
# Create the reviews table. CREATE TABLE IF NOT EXISTS makes this cell safe
# to re-run against an existing database file.
conn = sqlite3.connect("scraped_data.db")
try:
    cursor = conn.cursor()

    cursor.execute("""
CREATE TABLE IF NOT EXISTS reviews (
    ID TEXT PRIMARY KEY,
    URL TEXT,
    Scraped_at TEXT,
    Author TEXT,         
    Posted TEXT,
    Modified TEXT,
    Title TEXT,
    Subtitle TEXT,
    Score INTEGER,
    Verdict TEXT,
    Review_Text TEXT,
    Long_Text TEXT, 
    Quotes TEXT           
)
""")

    # Make the schema change durable before the handle is released.
    conn.commit()
finally:
    # Close the connection even when the statement above raises, so the
    # database file is not left open by a failed cell.
    conn.close()

In [2]:
def save_data(data):
    """Insert one scraped review into the ``reviews`` table of scraped_data.db.

    Args:
        data: Mapping that supplies a value for every named placeholder of
            the INSERT statement: URL, ID, Scraped_at, Author, Posted,
            Modified, Title, Subtitle, Score, Verdict, Review_Text,
            Long_Text and Quotes.

    Raises:
        sqlite3.Error: If the insert fails (for example when the row
            violates a constraint of the table). Nothing is committed in
            that case.
    """
    conn = sqlite3.connect("scraped_data.db")
    try:
        cursor = conn.cursor()

        cursor.execute("""
        INSERT INTO reviews (URL, ID, Scraped_at, Author, Posted, Modified, Title, Subtitle, Score, Verdict, Review_Text, Long_Text, Quotes)
        VALUES (:URL, :ID, :Scraped_at, :Author, :Posted, :Modified, :Title, :Subtitle, :Score, :Verdict, :Review_Text, :Long_Text, :Quotes)
        """, data)

        conn.commit()
    finally:
        # This function is called once per scraped page; without the close
        # every call would leave another open handle on the database file.
        conn.close()

In [7]:

def get_data(row, soup):
    """Collect the review fields of one parsed article page.

    Every field is extracted on a best-effort basis: a field that cannot be
    found becomes ``None`` and "<field> not found" is recorded. If anything
    was recorded, one line (link, id, comma-joined messages) is appended to
    ``scrape_errors.csv``.

    Args:
        row: Mapping with the keys ``'link'`` and ``'id'`` for the page.
        soup: Parsed page offering ``find`` and ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        dict with the keys URL, ID, Scraped_at, Author, Review_Text, Posted,
        Modified, Long_Text, Score, Title, Subtitle, Verdict and Quotes.
    """
    errors = []

    def safe_extract(find_func, keyword, *args):
        """Return the stripped text of the element found by ``find_func(*args)``, or None."""
        try:
            element = find_func(*args)
            if not element:
                raise ValueError("Element not found")
            return element.text.strip()
        except Exception:
            errors.append(f"{keyword} not found")
            return None

    def extract_multiple(find_func, keyword, tag, attrs=None, strip_chars=""):
        """Join the stripped texts of all matching elements with ';', or return None."""
        try:
            elements = find_func(tag, attrs) if attrs else find_func(tag)
            if elements:
                return ";".join(e.text.strip().rstrip(strip_chars) for e in elements)

            if keyword == 'Long_Text':  # problem with old ign sites
                article_section = soup.find("section", class_="article-page")
                if article_section is not None:
                    # Extract only direct text (ignoring nested elements)
                    direct_text = " ".join(
                        article_section.find_all(string=True, recursive=False)
                    ).strip()
                    if direct_text:
                        return direct_text

            raise ValueError("Elements not found")
        except Exception:
            errors.append(f"{keyword} not found")
            return None

    def safe_extract_meta(soup, property_name, keyword):
        """Return the stripped ``content`` of the <meta property=...> tag, or None."""
        try:
            tag = soup.find("meta", {"property": property_name})
            if tag and "content" in tag.attrs:
                return tag["content"].strip()
            raise ValueError(f"{property_name} not found")
        except Exception:
            errors.append(f"{keyword} not found")
            return None

    def extract_verdict():
        """Return the verdict text without its leading 'Verdict' label, or None."""
        # Look the element up once; the text is reused for both the presence
        # check and the label removal.
        verdict = safe_extract(soup.find, "Verdict", "div", {"data-cy": "verdict"})
        if not verdict:
            return None
        return re.sub(r'^Verdict[\s]*', '', verdict)

    # The entries are evaluated top to bottom, which is also the order in
    # which "not found" messages end up in the error log.
    data_dict = {
        "URL": row['link'],
        "ID": row['id'],
        "Scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Author": safe_extract(soup.find, "Author", "a", {"data-cy": "article-author-details"}),
        "Review_Text": safe_extract(soup.find, "Review_Text", "div", {"data-cy": "article-subtitle"}),
        "Posted": safe_extract_meta(soup, "article:published_time", "Posted"),
        "Modified": safe_extract_meta(soup, "article:modified_time", "Modified"),
        "Long_Text": extract_multiple(soup.find_all, "Long_Text", "p", {"class": "paragraph jsx-2269604527", "data-cy": "paragraph"}),
        "Score": safe_extract(soup.find, "Score", "figure", {"class": "review-score"}),
        "Title": safe_extract(soup.find, "Title", "h1"),
        "Subtitle": safe_extract(soup.find, "Subtitle", "h2"),
        "Verdict": extract_verdict(),
        "Quotes": extract_multiple(soup.find_all, "Quotes", "div", {'data-cy': 'quoteBox'}, strip_chars='.“')
    }

    if errors:
        with open("scrape_errors.csv", "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([row['link'], row['id'], ", ".join(errors)])

    return data_dict

In [14]:
# Read the CSV of review links and scrape every row.

# Give up on a request after this many seconds instead of waiting forever.
REQUEST_TIMEOUT_SECONDS = 30
# A pause of PAUSE_SECONDS is inserted on every PAUSE_EVERY_N_ROWS-th row.
# 0 means "no throttling"; raise it to be gentler on the server.
PAUSE_EVERY_N_ROWS = 8
PAUSE_SECONDS = 0

with open('./reviews.csv', 'r', newline='') as csvfile, requests.Session() as session:
    reader = csv.DictReader(csvfile)

    # Optionally, set headers to make your requests look more like a real browser
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    session.headers.update(headers)

    for i, row in enumerate(reader):
        print("scraped link:" + row['link'])
        try:
            response = session.get(row['link'], timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print('request failed, skipping ' + row['link'] + ': ' + str(exc))
            continue

        source = response.text
        soup = bs.BeautifulSoup(source,'html.parser')

        data = get_data(row, soup)
        try:
            save_data(data)
        except sqlite3.IntegrityError as exc:
            # Raised when the row violates a table constraint, e.g. the
            # review was already stored by an earlier (partial) run.
            print('not saved, skipping ' + row['link'] + ': ' + str(exc))
            continue

        if i % PAUSE_EVERY_N_ROWS == 0:
            time.sleep(PAUSE_SECONDS)
        print('finished' + str(i + 1))

scraped link:https://www.ign.com/articles/mass-effect-3-legendary-edition-review
finished1
scraped link:https://www.ign.com/articles/shin-megami-tensei-v-review
finished2
scraped link:https://www.ign.com/articles/riders-republic-review
finished3
scraped link:https://www.ign.com/articles/unpacking-review
finished4
scraped link:https://www.ign.com/articles/mario-party-superstars-review
finished5
scraped link:https://www.ign.com/articles/marvels-guardians-of-the-galaxy-game-review
finished6
scraped link:https://www.ign.com/articles/age-of-empires-4-review
finished7
scraped link:https://www.ign.com/articles/the-riftbreaker-review
finished8
scraped link:https://www.ign.com/articles/the-dark-pictures-anthology-house-of-ashes-review
finished9
scraped link:https://www.ign.com/articles/back-4-blood-review
finished10
scraped link:https://www.ign.com/articles/far-cry-6-review
finished11
scraped link:https://www.ign.com/articles/pathfinder-wrath-of-the-righteous-review
finished12
scraped link:http

In [15]:
# Spot-check the database: fetch every stored review by one author, release
# the connection, then list what came back.
conn = sqlite3.connect("scraped_data.db")
cursor = conn.cursor()
rows = cursor.execute("SELECT * FROM reviews Where Author='Matt Thrower'").fetchall()
conn.close()

# All rows are already in memory, so printing needs no open connection.
print("Database contents:")
for row in rows:
    print(row)

Database contents:
('aad9a4c3-8a9b-4789-b239-8d6872da6b7e', 'https://www.ign.com/articles/arcs-board-game-review', '2025-03-30 15:20:27', 'Matt Thrower', '2024-11-13T15:10:01.708Z', '2024-11-13T15:20:56.240Z', 'Arcs Board Game Review', 'A deviously deep game of space conquest.', 10, 'Arcs is not the first game to try and balance challenging strategic elements with the classic fun of negotiation and dice-rolling, but it’s a very tough task to get it right. Those tendencies are in opposition: if you can win through luck or charm, it can feel like the planning aspect gets devalued. This, however, is the closest a game has yet come to hitting it right on the nose. It’s an approachable game, but its tactical aspects are novel, labyrinthine and beautifully circular, a maze which, once mastered, allows you to build a platform on which dice and diplomacy can give you a slight edge. It’s an awesome thing to behold, carving a story arc of its own right through the annals of board game design.', 

In [None]:
# Test cell: fetch one article and print the text that sits directly inside
# <section class="article-page">, ignoring text of nested elements.
with urllib.request.urlopen('https://www.ign.com/articles/overwatch-review-2020-update', timeout=30) as source:
    # Parse while the response is still open; the block closes it afterwards.
    soup = bs.BeautifulSoup(source,'html.parser')

article_section = soup.find("section", class_="article-page")

if article_section is None:
    # Without this guard a page lacking the section fails with an opaque
    # AttributeError on None.
    direct_text = ""
    print("No <section class=\"article-page\"> found on this page")
else:
    # Extract only direct text (ignoring nested elements)
    direct_text = " ".join(article_section.find_all(string=True, recursive=False)).strip()

    print(direct_text)



After four years and one IGN Game of the Year Award, Overwatch has evolved into a multiplayer shooter that remains at the top of the class. It’s a dizzying amalgam of unique character design, stunningly realised style, and compellingly dynamic action. Minutes turn into hours as you’re caught up in round after magically exciting round, surrounded by gorgeously crafted maps packed with detail and charm. Overwatch, simply put, is the most fun I’ve ever had playing a video game. 
 Overwatch’s gameplay has remained almost entirely unchanged since launch and centres around taking and controlling points on the map or escorting payloads from one end of them to the other, all at the expense of the enemy team’s health bars. It’s a simple setup and not an altogether original one, but it’s the nuance found in how you go about winning each match that makes Overwatch so brilliant. Each team of six can be stitched together from the current pool of 32 heroes. Not only does each play differently and br

In [174]:
# Open the database one more time, commit, and release the handle again.
# No statement is executed on this connection before the commit.
db_path = 'scraped_data.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

conn.commit()
conn.close()