# Wikipedia Page Views Pipeline
 Name and ID: Balint Decsi, 2506626

## Choose Username

In [1]:
USERNAME = "balintd-de1"

## Setup and Imports

In [2]:
import datetime
import json

import boto3
import requests

## Extract: Retrieve Data from Wikipedia API

**API Documentation:** https://doc.wikimedia.org/generated-data-platform/aqs/analytics-api/reference/edits.html

In [3]:
# Try different dates to see how the data changes
DATE_PARAM = "2025-11-27"

date = datetime.datetime.strptime(DATE_PARAM, "%Y-%m-%d")

# Construct the API URL
url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/{date.strftime('%Y/%m/%d')}"
print(f"Requesting REST API URL: {url}")

# Make the API request
wiki_server_response = requests.get(url, headers={"User-Agent": "curl/7.68.0"})
wiki_response_status = wiki_server_response.status_code
wiki_response_body = wiki_server_response.text

print(f"Wikipedia REST API Response body: {wiki_response_body[:500]}...")
print(f"Wikipedia REST API Response Code: {wiki_response_status}")

# Validate response
if wiki_response_status != 200:
    raise Exception(f"Received non-OK status code from Wiki Server: {wiki_response_status}")
print(f"Successfully retrieved Wikipedia data, content-length: {len(wiki_response_body)}")

Requesting REST API URL: https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia.org/all-access/2025/11/27
Wikipedia REST API Response body: {"items":[{"project":"en.wikipedia","access":"all-access","year":"2025","month":"11","day":"27","articles":[{"article":"Main_Page","views":5988101,"rank":1},{"article":"Special:Search","views":797249,"rank":2},{"article":"Stranger_Things_season_5","views":655701,"rank":3},{"article":"Stranger_Things","views":353714,"rank":4},{"article":"Jack_White","views":349384,"rank":5},{"article":"Google_Chrome","views":306639,"rank":6},{"article":"Wikipedia:Featured_pictures","views":248728,"rank":7},{"arti...
Wikipedia REST API Response Code: 200
Successfully retrieved Wikipedia data, content-length: 55549


## Transform: Process Raw Data into JSON Lines

Convert the raw API response into a structured JSON Lines format suitable for analytics. Each line is a valid JSON object representing one page's edit statistics.

In [11]:
wiki_response_parsed = wiki_server_response.json()
top_views = wiki_response_parsed["items"][0]["articles"]
top_views

[{'article': 'Main_Page', 'views': 5988101, 'rank': 1},
 {'article': 'Special:Search', 'views': 797249, 'rank': 2},
 {'article': 'Stranger_Things_season_5', 'views': 655701, 'rank': 3},
 {'article': 'Stranger_Things', 'views': 353714, 'rank': 4},
 {'article': 'Jack_White', 'views': 349384, 'rank': 5},
 {'article': 'Google_Chrome', 'views': 306639, 'rank': 6},
 {'article': 'Wikipedia:Featured_pictures', 'views': 248728, 'rank': 7},
 {'article': 'List_of_Stranger_Things_episodes', 'views': 203875, 'rank': 8},
 {'article': 'Thanksgiving', 'views': 191411, 'rank': 9},
 {'article': 'Post_Malone', 'views': 166407, 'rank': 10},
 {'article': 'Millie_Bobby_Brown', 'views': 163190, 'rank': 11},
 {'article': 'Miss_International_2025', 'views': 140289, 'rank': 12},
 {'article': 'Bruce_Lee', 'views': 138876, 'rank': 13},
 {'article': '2025_Tai_Po_apartment_fire', 'views': 138738, 'rank': 14},
 {'article': 'Zootopia_2', 'views': 136397, 'rank': 15},
 {'article': 'Deaths_in_2025', 'views': 135939, 'r

In [None]:
# Parse the API response and extract top edits
wiki_response_parsed = wiki_server_response.json()
top_views = wiki_response_parsed["items"][0]["articles"]

# Transform to JSON Lines format
current_time = datetime.datetime.now(datetime.timezone.utc)
json_lines = ""
for page in top_views[:5]:
    record = {
        "title": page["article"],
        "views": page["views"],
        "rank": page["rank"],
        "date": date.strftime("%Y-%m-%d"),
        "retrieved_at": current_time.replace(tzinfo=None).isoformat(),
    }
    json_lines += json.dumps(record) + "\n"

print(f"Transformed {len(top_views)} records to JSON Lines")
print(f"First few lines:\n{json_lines[:500]}...")

---
## Lab 1: Create an S3 Bucket

**Task:** Create an S3 bucket for the Wikipedia data pipeline.

**Requirements:**
- Bucket name: `<username>-wikidata` (use your USERNAME from above)
- Create the bucket if it doesn't exist

**Documentation:** [create_bucket](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/create_bucket.html)

In [None]:
S3_WIKI_BUCKET = USERNAME + "-wikidata"
s3 = boto3.client("s3")

# List existing buckets
# Note: If this raises NoCredentialsError, you need to configure AWS credentials
response = s3.list_buckets()
bucket_names = [bucket["Name"] for bucket in response["Buckets"]]
if S3_WIKI_BUCKET not in bucket_names:
    # LAB 1: Create the bucket if it doesn't exist
    # YOUR SOLUTION COMES HERE =========================
    s3.create_bucket(
        Bucket=S3_WIKI_BUCKET,
        CreateBucketConfiguration={"LocationConstraint": boto3.session.Session().region_name},
    )
    # ==================================================
    print(f"Created new bucket: {S3_WIKI_BUCKET}")
else:
    print(f"Using existing bucket: {S3_WIKI_BUCKET}")


In [None]:
# Test Lab 1
assert USERNAME != "<username>", "Please set your USERNAME at the top of the notebook"
assert S3_WIKI_BUCKET.endswith("-wikidata"), "Bucket name must end with '-wikidata'"

try:
    s3.head_bucket(Bucket=S3_WIKI_BUCKET)
    print(f"Bucket {S3_WIKI_BUCKET} exists!")
except Exception as e:
    print(f"Bucket {S3_WIKI_BUCKET} not found: {e}")
    raise

---
## Lab 2: Upload JSON Lines to S3

**Task:** Upload the `json_lines` data directly to S3 (no local file!).

**Requirements:**
- Use `s3.put_object()` to upload the data directly
- Place the file under `raw-edits/` prefix in S3
- File name: `raw-edits-YYYY-MM-DD.json` (use the date from `DATE_PARAM`)

**Example S3 path:** `s3://johndoe-wikidata/raw-edits/raw-edits-2025-11-25.json`

**Documentation:** [put_object](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html)

In [None]:
# LAB 2: Upload json_lines directly to S3
# YOUR SOLUTION COMES HERE =========================
s3_key = f"raw-edits/raw-edits-{date.strftime('%Y-%m-%d')}.json"
s3.put_object(
    Bucket=S3_WIKI_BUCKET,
    Key=s3_key,
    Body=json_lines,
)
# ==================================================

In [None]:
# Test Lab 2
expected_key = f"raw-edits/raw-edits-{date.strftime('%Y-%m-%d')}.json"
try:
    s3.head_object(Bucket=S3_WIKI_BUCKET, Key=expected_key)
    print(f"File uploaded successfully to s3://{S3_WIKI_BUCKET}/{expected_key}")
except Exception as e:
    print(f"File not found at s3://{S3_WIKI_BUCKET}/{expected_key}")
    raise