Doc:
- [spark-packages.org](https://spark-packages.org/package/mongodb/mongo-spark)

In [1]:
from pyspark.sql.session import SparkSession

In [2]:
user = "root"
password = "secret"
host = "db.mongo.app.com"
database = "starwars"
collection = "people"

uri = f"mongodb://{user}:{password}@{host}"

In [5]:
spark = SparkSession.builder \
    .master("local") \
    .appName("SparkMongo") \
    .config("spark.mongodb.input.uri", uri) \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

In [11]:
uri

'mongodb://root:secret@db.mongo.app.com/starwars.people'

In [17]:
from pymongo import MongoClient

In [18]:
client = MongoClient(uri)

In [19]:
client

MongoClient(host=['db.mongo.app.com:27017'], document_class=dict, tz_aware=False, connect=True)

In [20]:
client.list_database_names()

['admin', 'config', 'local', 'people-bson', 'starwars', 'test']

In [122]:
pipeline = [{
    "$project": {
        "_id": 0,
        "name": '$homeworld.name',
        "rotation_period": '$homeworld.rotation_period',
        "orbital_period": '$homeworld.orbital_period',
        "diameter": '$homeworld.diameter',
        "climate": '$homeworld.climate'
        }
    }
]

In [123]:
people = client.starwars.get_collection("people")

In [124]:
cursor = people.aggregate(pipeline)

In [125]:
cursor.next()

{'name': 'Tatooine',
 'rotation_period': 23,
 'orbital_period': 304,
 'diameter': 10465,
 'climate': 'arid'}

In [126]:
people.find_one()

{'_id': ObjectId('5d31e79f5decab6c5ac11358'),
 'name': 'Luke Skywalker',
 'height': 172,
 'mass': 77,
 'hair_color': 'blond',
 'skin_color': 'fair',
 'eye_color': 'blue',
 'birth_year': '19BBY',
 'gender': 'male',
 'homeworld': {'name': 'Tatooine',
  'rotation_period': 23,
  'orbital_period': 304,
  'diameter': 10465,
  'climate': 'arid',
  'gravity': '1 standard',
  'terrain': 'desert',
  'surface_water': 1,
  'population': 200000,
  'residents': ['https://swapi.co/api/people/1/',
   'https://swapi.co/api/people/2/',
   'https://swapi.co/api/people/4/',
   'https://swapi.co/api/people/6/',
   'https://swapi.co/api/people/7/',
   'https://swapi.co/api/people/8/',
   'https://swapi.co/api/people/9/',
   'https://swapi.co/api/people/11/',
   'https://swapi.co/api/people/43/',
   'https://swapi.co/api/people/62/'],
  'films': ['https://swapi.co/api/films/5/',
   'https://swapi.co/api/films/4/',
   'https://swapi.co/api/films/6/',
   'https://swapi.co/api/films/3/',
   'https://swapi.co/ap

In [127]:
df = spark.read.format("mongo").\
    option("spark.mongodb.input.database", database).\
    option("spark.mongodb.input.collection", collection).\
    option("pipeline", pipeline).\
    load()

In [128]:
df.show()

+---------+--------+--------+--------------+---------------+
|  climate|diameter|    name|orbital_period|rotation_period|
+---------+--------+--------+--------------+---------------+
|     arid|   10465|Tatooine|           304|             23|
|     arid|   10465|Tatooine|           304|             23|
|temperate|   12120|   Naboo|           312|             26|
|     arid|   10465|Tatooine|           304|             23|
|temperate|   12500|Alderaan|           364|             24|
|     arid|   10465|Tatooine|           304|             23|
|     arid|   10465|Tatooine|           304|             23|
|     arid|   10465|Tatooine|           304|             23|
|     arid|   10465|Tatooine|           304|             23|
|temperate|       0| Stewjon|       unknown|        unknown|
+---------+--------+--------+--------------+---------------+



In [132]:
df.distinct().show()

+---------+--------+--------+--------------+---------------+
|  climate|diameter|    name|orbital_period|rotation_period|
+---------+--------+--------+--------------+---------------+
|     arid|   10465|Tatooine|           304|             23|
|temperate|   12500|Alderaan|           364|             24|
|temperate|   12120|   Naboo|           312|             26|
|temperate|       0| Stewjon|       unknown|        unknown|
+---------+--------+--------+--------------+---------------+

