# Part 1: Download Wikipedia Pages

In [1]:
import requests, json, re, networkx as nx, os
from bs4 import BeautifulSoup

# Wikipedia API endpoint
url = "https://en.wikipedia.org/w/api.php"

# Define parameters
params = {
    "action": "query",
    "titles": "List_of_mainstream_rock_performers",
    "prop": "revisions",
    "rvprop": "content",
    "format": "json",
    "rvslots": "main",
}

# Set custom header (optional but polite)
headers = {"User-Agent": "MyWikipediaClient/1.0 (example@example.com)"}

# GET request
response = requests.get(url, headers=headers, params=params)

# Query URL with params
query = response.url
print("Query URL:", query)

# Check status code
status = response.status_code
# print("Status code:", status)
print("Query:", response)

# Page data
content = response.content
# print(content)

# Convert response to JSON
data = response.json()

# Pretty-print JSON
# print("JSON:")
# print(json.dumps(data, indent=2))

# Scrape web page with BeautifulSoup
# soup = BeautifulSoup(response.content, "html.parser")

# Display scraped data
# print(soup.prettify())
# print(list(soup.children))

# Page data
# pages = content["query"]["pages"]
# page = next(iter(pages.values()))
# content = page["revisions"][0]["*"]

# Wikitext
page = next(iter(data["query"]["pages"].values()))
# print(page)
wikitext = page["revisions"][0]["slots"]["main"]["*"]
# print(wikitext)

# Regex to extract artist names from wiki links
artists = re.findall(r"\[\[([^\]|#]+)", wikitext)

# Remove duplicates and sort
artists = sorted(set(artists))


# Number of matches and first 20
print("Number of matches:", len(artists))
print(artists[:20])

# Save to file
with open("rock_artists.txt", "w", encoding="utf-8") as f:
    f.write(wikitext)

Query URL: https://en.wikipedia.org/w/api.php?action=query&titles=List_of_mainstream_rock_performers&prop=revisions&rvprop=content&format=json&rvslots=main
Query: <Response [200]>
Number of matches: 492
['10 Years (band)', '10cc', '3 Doors Down', '311 (band)', '38 Special (band)', 'A Perfect Circle', 'ABBA', 'AC/DC', 'AFI (band)', 'Accept (band)', 'Adam Ant', 'Aerosmith', 'Air Supply', 'Alanis Morissette', 'Alice Cooper', 'Alice Cooper (band)', 'Alice in Chains', 'AllMusic', 'Alter Bridge', 'Ambrosia (band)']


In [2]:
print(f"Found {len(artists)} artists")

# Create output directory
os.makedirs("rock_artists", exist_ok=True)

# Excludes non-artist wikilinks
exclude_prefixes = (
    "Category:", "Template:", "File:", "Wikipedia:",
    "Help:", "Portal:", "Talk:", "Book:", "Draft:"
)

manual_excludes = [
    "AllMusic", "Rolling Stone", "rock music", "Heavy Metal", 
    "Punk Rock", "Alternative Rock", "Classic Rock", "singles",
    "album charts", "streaming", "downloads", "airplay"
]
artist = [
    link for link in artists
    if link not in manual_excludes and not link.startswith(exclude_prefixes)
]

# Loop through artists and download their wikitext
for i, artist in enumerate(artists):
    # Replace spaces with underscores for API titles
    artist_title = artist.replace(" ", "_")

    params = {
        "action": "query",
        "titles": artist_title,
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "rvslots": "main",
    }

    try:
        res = requests.get(url, headers=headers, params=params)
        res.raise_for_status()
        artist_data = res.json()

        # Extract wikitext
        page = next(iter(artist_data["query"]["pages"].values()))
        if "revisions" not in page:
            print(f"Skipping {artist} (no revisions found)")
            continue

        text = page["revisions"][0]["slots"]["main"]["*"]

        # Save to file
        safe_name = re.sub(r'[\\/:"*?<>|]+', "_", artist_title)
        file_path = os.path.join("rock_artists", f"{safe_name}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Saved {artist} ({i+1}/{len(artists)})")

    except Exception as e:
        print(f"Error processing {artist}: {e}")
        continue

print("Done")

Found 492 artists
Saved 10 Years (band) (1/492)
Saved 10cc (2/492)
Saved 3 Doors Down (3/492)
Saved 311 (band) (4/492)
Saved 38 Special (band) (5/492)
Saved A Perfect Circle (6/492)
Saved ABBA (7/492)
Saved AC/DC (8/492)
Saved AFI (band) (9/492)
Saved Accept (band) (10/492)
Saved Adam Ant (11/492)
Saved Aerosmith (12/492)
Saved Air Supply (13/492)
Saved Alanis Morissette (14/492)
Saved Alice Cooper (15/492)
Saved Alice Cooper (band) (16/492)
Saved Alice in Chains (17/492)
Saved AllMusic (18/492)
Saved Alter Bridge (19/492)
Saved Ambrosia (band) (20/492)
Saved America (band) (21/492)
Saved Anthrax (American band) (22/492)
Saved April Wine (23/492)
Saved Arcade Fire (24/492)
Saved Arctic Monkeys (25/492)
Saved Asia (band) (26/492)
Saved Audioslave (27/492)
Saved Avenged Sevenfold (28/492)
Saved Avril Lavigne (29/492)
Saved Awolnation (30/492)
Saved Bachman–Turner Overdrive (31/492)
Saved Bad Company (32/492)
Saved Badfinger (33/492)
Saved Barenaked Ladies (34/492)
Saved Bay City Rollers 

In [4]:
# Step 1: Load the saved wikitext file (from the API step earlier)
with open("rock_artists.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Step 2: Regex pattern for wiki-links
# Captures both [[Page]] and [[Page|Display text]], but only keeps "Page"
pattern = r"\[\[([^\]|#]+)(?:\|[^\]]+)?\]\]"

# Step 3: Extract raw links
raw_links = re.findall(pattern, text)

# Step 4: Clean: remove duplicates, strip whitespace
artist_links = sorted(set([link.strip() for link in raw_links]))

# Step 5: Remove unwanted links (categories, templates, etc.)
exclude_prefixes = (
    "Category:", "Template:", "File:", "Wikipedia:",
    "Help:", "Portal:", "Talk:", "Book:", "Draft:"
)

manual_excludes = [
    "AllMusic", "Rolling Stone", "rock music", "Heavy Metal", 
    "Punk Rock", "Alternative Rock", "Classic Rock", "singles",
    "album charts", "streaming", "downloads", "airplay"
]
artist_links = [
    link for link in artist_links
    if link not in manual_excludes and not link.startswith(exclude_prefixes)
]

print(f"Number of raw wiki-links found: {len(raw_links)}")
print(f"Number of unique cleaned artist links: {len(artist_links)}")
print("First 50 links before filtering:", artist_links[:30])

Number of raw wiki-links found: 492
Number of unique cleaned artist links: 488
First 50 links before filtering: ['10 Years (band)', '10cc', '3 Doors Down', '311 (band)', '38 Special (band)', 'A Perfect Circle', 'ABBA', 'AC/DC', 'AFI (band)', 'Accept (band)', 'Adam Ant', 'Aerosmith', 'Air Supply', 'Alanis Morissette', 'Alice Cooper', 'Alice Cooper (band)', 'Alice in Chains', 'Alter Bridge', 'Ambrosia (band)', 'America (band)', 'Anthrax (American band)', 'April Wine', 'Arcade Fire', 'Arctic Monkeys', 'Asia (band)', 'Audioslave', 'Avenged Sevenfold', 'Avril Lavigne', 'Awolnation', 'Bachman–Turner Overdrive']


In [None]:
# url = "https://raw.githubusercontent.com/adamajane/social-graphs-and-interactions-02805-assignments/refs/heads/main/assignment_1/Files/artists_graph_with_stats.gexf"

# # Fetch file from GitHub
# response = requests.get(url)
# response.raise_for_status()

# # Load into NetworkX
# G = nx.read_gexf(BytesIO(response.content))
# print(f"Loaded remote graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

Loaded remote graph: 484 nodes, 7328 edges
