## **Dependencies**

In [31]:
from pyspark.sql import Row, SparkSession, types as T # pyspark==4.0.0
from pyspark.sql.datasource import DataSource, DataSourceReader # pyarrow required, used: pyarrow==19.0.1
import requests # used: requests==2.32.4
# optional, for jupyter notebook: ipykernel==6.26.0

## **SparkSession**

In [32]:
# Local Spark session (master "local[3]") used by every cell below;
# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("pyspark_custom_datasource")
    .getOrCreate()
)

## **ChuckNorrisDataSource definition**

In [33]:
class ChuckNorrisDataSource(DataSource):
    """
    Custom PySpark data source that serves random facts (jokes) from the
    Chuck Norris API.

    Short name: `chucknorris`

    Columns: `id string, fact string, category string` (all declared non-nullable)

    Options:
        count: int
            How many facts to return. Defaults to 1.

    Examples
    --------
    Register the data source with the active session:

    >>> from your_module import ChuckNorrisDataSource
    >>> spark.dataSource.register(ChuckNorrisDataSource)

    Read a chosen number of facts:

    >>> spark.read.format("chucknorris").option("count", 5).load().show()
    +--------------------+--------------------+-----------+
    |                  id|                fact|   category|
    +--------------------+--------------------+-----------+
    |ykC28btrRCm4Vqev... |Chuck Norris can... |     animal|
    |                 ...|                 ...|        ...|
    +--------------------+--------------------+-----------+
    """

    @classmethod
    def name(cls):
        # Short name passed to spark.read.format(...)
        return "chucknorris"

    def schema(self):
        # Every column is a required (non-nullable) string.
        column_names = ("id", "fact", "category")
        return T.StructType([
            T.StructField(name=column, dataType=T.StringType(), nullable=False)
            for column in column_names
        ])

    def reader(self, schema):
        # The schema argument is not used here; the reader is built from the
        # user-supplied options only.
        reader_options = self.options
        return ChuckNorrisReader(reader_options)


class ChuckNorrisReader(DataSourceReader):
    """
    Reader that fetches random facts from the Chuck Norris API, issuing one
    HTTP request per fact.

    Options:
        count: int
            Number of facts to fetch. Default is 1. Zero or a negative value
            produces no rows.
        timeout: float
            Seconds to wait on each HTTP request before giving up. Default is 10.
    """

    API_URL = "https://api.chucknorris.io/jokes/random"
    DEFAULT_TIMEOUT_SECONDS = 10.0

    def __init__(self, options):
        # Option values may arrive as strings, so convert them explicitly.
        self.count = int(options.get("count", 1))  # Default: 1 fact
        self.timeout = float(options.get("timeout", self.DEFAULT_TIMEOUT_SECONDS))

    def read(self, partition):
        """
        Yield one ``Row(id, fact, category)`` per fetched fact.

        ``category`` is the first entry of the response's ``categories`` list,
        or ``"uncategorized"`` when that list is missing or empty.

        Raises
        ------
        requests.HTTPError
            If the API answers with an error status.
        requests.Timeout
            If a request does not complete within ``self.timeout`` seconds.
        """
        # A single Session reuses the underlying connection across requests
        # instead of opening a new one for every fact.
        with requests.Session() as session:
            for _ in range(self.count):
                # Without an explicit timeout, requests waits indefinitely and a
                # stalled connection would hang the Spark task.
                response = session.get(self.API_URL, timeout=self.timeout)
                response.raise_for_status()
                fact_data = response.json()
                categories = fact_data.get("categories") or []
                yield Row(
                    id=fact_data.get("id"),
                    fact=fact_data.get("value"),
                    category=categories[0] if categories else "uncategorized",
                )

# Register the class with the session so it can be used via
# spark.read.format("chucknorris"). Re-running this cell registers it again,
# which replaces the previous registration and logs a DataSourceManager WARN.
spark.dataSource.register(ChuckNorrisDataSource)

25/07/06 23:57:30 WARN DataSourceManager: The data source chucknorris replaced a previously registered data source.


## **Reading ChuckNorrisDataSource**

In [34]:
# Read five random facts and print them without truncating long values
(
    spark.read
    .format("chucknorris")
    .option("count", 5)
    .load()
    .show(truncate=False)
)


[Stage 5:>                                                          (0 + 1) / 1]

+----------------------+-------------------------------------------------------------------------------------------------------+-------------+
|id                    |joke                                                                                                   |category     |
+----------------------+-------------------------------------------------------------------------------------------------------+-------------+
|mhmp-zhqszwuimbxfbkx8q|Chuck Norris's programs can pass the Turing Test by staring at the interrogator.                       |dev          |
|xsycfh8_siooxxibfyb8nq|One time, at band camp, Chuck Norris ate a percussionist.                                              |uncategorized|
|CktaV6IqRTCdj6ReXjKxjw|Lightning never strikes the same place twice, so that it is less likely to be caught by Chuck Norris.  |uncategorized|
|ji2JjxdfQQ6e1YcgRaJ94Q|Chuck Norris put Lysol out of business, he kills odor too                                              |uncategorized|

                                                                                

In [35]:
# No "count" option is set, so the reader falls back to its default of one fact
(
    spark.read
    .format("chucknorris")
    .load()
    .show(truncate=False)
)

[Stage 6:>                                                          (0 + 1) / 1]

+----------------------+----------------------------------------------+-------------+
|id                    |joke                                          |category     |
+----------------------+----------------------------------------------+-------------+
|iG2l1Co7SGOi58W_ha7Olg|Chuck Norris can "shit fire and save matches"!|uncategorized|
+----------------------+----------------------------------------------+-------------+



                                                                                