In [19]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder.appName("wikimedia").getOrCreate()

In [7]:
!pwd

/home/jovyan/work


In [9]:
df = spark.read.format("json").option("path","/home/jovyan/work/stream2025-05-03_07_46_20Z_bdc832").load()

# https://www.mediawiki.org/wiki/API:RecentChanges


| Field       | Type      | Description                                                                                  |
| ----------- | --------- | -------------------------------------------------------------------------------------------- |
| `type`      | `string`  | Type of change. Values include: `edit`, `new`, `log`, `categorize`, `external` (rare).       |
| `ns`        | `int`     | Namespace number. For example: `0` = main/article, `1` = talk, `2` = user, etc.              |
| `title`     | `string`  | Title of the page that was changed.                                                          |
| `pageid`    | `int`     | ID of the page.                                                                              |
| `revid`     | `int`     | Revision ID (new version).                                                                   |
| `old_revid` | `int`     | Revision ID before the change.                                                               |
| `rcid`      | `int`     | Recent change ID (unique to this change event).                                              |
| `user`      | `string`  | Username or IP address of the editor.                                                        |
| `anon`      | `boolean` | Indicates if the user is anonymous (IP edit).                                                |
| `bot`       | `boolean` | Indicates if the change was made by a bot.                                                   |
| `minor`     | `boolean` | Indicates a "minor edit" (user marked it as such).                                           |
| `patrolled` | `boolean` | Indicates if the edit was patrolled (for wikis that use this feature).                       |
| `comment`   | `string`  | Edit summary/comment entered by the editor.                                                  |
| `timestamp` | `string`  | ISO 8601 timestamp of the change.                                                            |
| `logtype`   | `string`  | For log entries: type of log (e.g., `block`, `delete`, `move`). Only appears for `type=log`. |
| `logaction` | `string`  | Specific action within the log type.                                                         |


In [10]:
df.printSchema()

root
 |-- $schema: string (nullable = true)
 |-- bot: boolean (nullable = true)
 |-- comment: string (nullable = true)
 |-- id: long (nullable = true)
 |-- length: struct (nullable = true)
 |    |-- new: long (nullable = true)
 |    |-- old: long (nullable = true)
 |-- log_action: string (nullable = true)
 |-- log_action_comment: string (nullable = true)
 |-- log_id: long (nullable = true)
 |-- log_params: string (nullable = true)
 |-- log_type: string (nullable = true)
 |-- meta: struct (nullable = true)
 |    |-- domain: string (nullable = true)
 |    |-- dt: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- offset: long (nullable = true)
 |    |-- partition: long (nullable = true)
 |    |-- request_id: string (nullable = true)
 |    |-- stream: string (nullable = true)
 |    |-- topic: string (nullable = true)
 |    |-- uri: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- namespace: long (nullable = true)
 |-- notify_url: string (nullable 

Edit Activity Monitoring

    Volume of edits over time (e.g., per hour, per day).

    Compare activity across projects (e.g., English vs. French Wikipedia).

    Detect high activity spikes, which might indicate newsworthy events or vandalism.

In [34]:
df = df.withColumn("datetime" ,f.from_unixtime(f.col('timestamp')))
df = df.withColumn("minute", f.date_format(f.col('datetime'), "mm"))

In [40]:
df_edit_per_minute = df.groupBy(["minute"]).agg(f.count(f.col("id")).alias("edit_per_minute"))

In [41]:
df_edit_per_minute.show()

+------+---------------+
|minute|edit_per_minute|
+------+---------------+
|    47|             75|
|    46|             65|
+------+---------------+



In [27]:
df.select('datetime').distinct().show(5)

+-------------------+
|           datetime|
+-------------------+
|2025-05-03 07:46:57|
|2025-05-03 07:47:48|
|2025-05-03 07:47:05|
|2025-05-03 07:47:41|
|2025-05-03 07:47:17|
+-------------------+
only showing top 5 rows



In [20]:
q.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = e4c4711f-1d93-4ea8-a569-03e6c50b315c, runId = ac489457-0787-434e-911e-b5484bc7fbdb] terminated with exception: wss://stream.binance.com