# **Setup**

In [1]:
from pymongo import MongoClient
import pandas as pd

# **Connect To MongoDB Atlas Cluster**

In [2]:
# MongoDB_Cluster_Username = "username"
# MongoDB_Cluster_Password = "password"

MongoDB_Cluster_Username = input("Enter MongoDB Username: ")
MongoDB_Cluster_Password = input("Enter MongoDB Password: ")

MongoDB_Cluster_Name = 'mongodb-cluster00.kyxdfwz.mongodb.net/'

client = MongoClient(f"mongodb+srv://{MongoDB_Cluster_Username}:{MongoDB_Cluster_Password}@{MongoDB_Cluster_Name}")

## **Get List Of Databases**

In [3]:
# List Available Databases
client.list_database_names()

['sample_airbnb',
 'sample_analytics',
 'sample_geospatial',
 'sample_guides',
 'sample_mflix',
 'sample_restaurants',
 'sample_supplies',
 'sample_training',
 'sample_weatherdata',
 'admin',
 'local']

## **Get List Of Collections**

In [4]:
# Access the database collection directly
db = client['sample_mflix']

# Get List Of Collections
db.list_collection_names()

['theaters', 'sessions', 'embedded_movies', 'movies', 'users', 'comments']

## **Collection Count & Size**

In [5]:
for collection in db.list_collection_names():
    collection_count = db.movies.count_documents({})
    collection_size = db.command('collstats',collection)['size']
    collection_storagesize = db.command('collstats',collection)['storageSize']
    print(collection,'-',collection_count,collection_size,collection_storagesize)

theaters - 21349 349831 159744
sessions - 21349 540 20480
embedded_movies - 21349 75388777 72273920
movies - 21349 34119032 19779584
users - 21349 29568 36864
comments - 21349 11686292 6373376


# **Data Selection**

## **Select Random Document**

In [6]:
client.sample_mflix['movies'].find_one()

{'_id': ObjectId('573a1390f29313caabcd516c'),
 'plot': 'Original advertising for the film describes it as a drama of primitive life on the shores of the North Pacific...',
 'genres': ['Drama', 'History'],
 'runtime': 65,
 'cast': ['Stanley Hunt',
  'Sarah Constance Smith Hunt',
  'Mrs. George Walkus',
  "Paddy 'Malid"],
 'num_mflix_comments': 1,
 'poster': 'https://m.media-amazon.com/images/M/MV5BMjE3MjAyNzM5NV5BMl5BanBnXkFtZTgwMjA5OTg5NjE@._V1_SY1000_SX677_AL_.jpg',
 'title': 'In the Land of the Head Hunters',
 'lastupdated': '2015-09-16 12:11:37.770000000',
 'languages': ['English'],
 'released': datetime.datetime(1914, 12, 7, 0, 0),
 'directors': ['Edward S. Curtis'],
 'writers': ['Edward S. Curtis (story)'],
 'awards': {'wins': 1, 'nominations': 0, 'text': '1 win.'},
 'year': 1914,
 'imdb': {'rating': 5.8, 'votes': 223, 'id': 4150},
 'countries': ['USA'],
 'type': 'movie',
 'tomatoes': {'viewer': {'rating': 2.7, 'numReviews': 64, 'meter': 18},
  'dvd': datetime.datetime(2000, 8, 15

## **Get Distinct Values For A Field**

In [7]:
genres_list = db['movies'].distinct('genres')

print('No. Of Genres: ', len(genres_list))
print(genres_list)

No. Of Genres:  25
['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']


In [28]:
# Distinct Values In Countries Field
db['movies'].distinct('countries')

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Bahamas',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Czechoslovakia',
 "Cète d'Ivoire",
 'Denmark',
 'Dominican Republic',
 'East Germany',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Faroe Islands',
 'Federal Republic of Yugoslavia',
 'Finland',
 'France',
 'Gabon',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Greenland',
 'Guatemala',
 'Haiti',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Isle Of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kosovo',
 'Kuwait',
 'Kyrgyzstan',

In [27]:
# Distinct Values In Type Field
db['movies'].distinct('type')

['movie', 'series']

In [42]:
# Distinct Values In Ratings Field
db['movies'].distinct('rated')

['AO',
 'APPROVED',
 'Approved',
 'G',
 'GP',
 'M',
 'Not Rated',
 'OPEN',
 'PASSED',
 'PG',
 'PG-13',
 'R',
 'TV-14',
 'TV-G',
 'TV-MA',
 'TV-PG',
 'TV-Y7']

## **Single Criteria Filtering**

In [22]:
# Select movies released in 2015
for movie in db['movies'].find({'year': 2015}):
    print(movie)

{'_id': ObjectId('573a13adf29313caabd2b765'), 'plot': "A new theme park is built on the original site of Jurassic Park. Everything is going well until the park's newest attraction--a genetically modified giant stealth killing machine--escapes containment and goes on a killing spree.", 'genres': ['Action', 'Adventure', 'Sci-Fi'], 'runtime': 124, 'metacritic': 59, 'rated': 'PG-13', 'cast': ['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan Khan', "Vincent D'Onofrio"], 'num_mflix_comments': 0, 'poster': 'https://m.media-amazon.com/images/M/MV5BNzQ3OTY4NjAtNzM5OS00N2ZhLWJlOWUtYzYwZjNmOWRiMzcyXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_SY1000_SX677_AL_.jpg', 'title': 'Jurassic World', 'fullplot': '22 years after the original Jurassic Park failed, the new park (also known as Jurassic World) is open for business. After years of studying genetics the scientists on the park genetically engineer a new breed of dinosaur. When everything goes horribly wrong, will our heroes make it off the island?', 'languages':

## **Multi-Criteria Filtering**

In [33]:
# Select Series Released In 2015
for series in db['movies'].find({"year": 2015, "type": "series"}):
    print(series)

{'_id': ObjectId('573a13dff29313caabdb76d3'), 'plot': 'The citizens of the small British town of Pagford fight for the spot on the parish council after Barry Fairbrother dies.', 'genres': ['Crime', 'Drama', 'Mystery'], 'runtime': 180, 'cast': ['Silas Carson', 'Joe Hurst', 'Michael Gambon', 'Rory Kinnear'], 'poster': 'https://m.media-amazon.com/images/M/MV5BMTkyNDgzOTcwNV5BMl5BanBnXkFtZTgwMTk4MDI1NTE@._V1_SY1000_SX677_AL_.jpg', 'title': 'The Casual Vacancy', 'fullplot': 'When Parish Councilman Barry Fairbrother unexpectedly dies in his early forties, the town of Pagford is left in shock. An English idyll, with a cobbled market square and an ancient abbey, Pagford is not what it first seems. What lies behind the pretty facade is a town at war - rich at war with poor, teenagers at war with their parents, wives at war with their husbands, teachers at war with their pupils. And the empty seat left by Barry on the parish council soon becomes the catalyst for the biggest battle the town has y

In [36]:
# Select Movies Released In 2015
for movie in db['movies'].find({"year": 2015, "type": "movie"}):
    print(movie)

{'_id': ObjectId('573a13adf29313caabd2b765'), 'plot': "A new theme park is built on the original site of Jurassic Park. Everything is going well until the park's newest attraction--a genetically modified giant stealth killing machine--escapes containment and goes on a killing spree.", 'genres': ['Action', 'Adventure', 'Sci-Fi'], 'runtime': 124, 'metacritic': 59, 'rated': 'PG-13', 'cast': ['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan Khan', "Vincent D'Onofrio"], 'num_mflix_comments': 0, 'poster': 'https://m.media-amazon.com/images/M/MV5BNzQ3OTY4NjAtNzM5OS00N2ZhLWJlOWUtYzYwZjNmOWRiMzcyXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_SY1000_SX677_AL_.jpg', 'title': 'Jurassic World', 'fullplot': '22 years after the original Jurassic Park failed, the new park (also known as Jurassic World) is open for business. After years of studying genetics the scientists on the park genetically engineer a new breed of dinosaur. When everything goes horribly wrong, will our heroes make it off the island?', 'languages':

# **Data Export**

## **To Pandas Dataframe**

In [66]:
# Query Movies Released Between 2000 & 2010 
# Export The Output To Pandas Dataframe

query = {'year':{"$gte":2000,'$lte':2010},'type':'series','countries':'UK'}
projection = {'_id':0,'title':1,'year':1,'rated':1,'cast':2,'countries':1}

df = pd.DataFrame(list(db['movies'].find(query,projection)))
print(df.shape)
print(df.columns)

df.head()

(42, 5)
Index(['cast', 'title', 'rated', 'year', 'countries'], dtype='object')


Unnamed: 0,cast,title,rated,year,countries
0,"[Scott Grimes, Matthew Leitch, Damian Lewis, R...",Band of Brothers,TV-MA,2001,"[UK, USA]"
1,[Keith David],Jazz,,2001,"[UK, USA]"
2,"[David Thewlis, Katie Carr, Jim Carter, Alice ...",Dinotopia,,2002,"[UK, Germany, USA]"
3,"[Kenneth Branagh, Stockard Channing]",Walking with Prehistoric Beasts,,2001,[UK]
4,"[Pierce Brosnan, David Attenborough]",The Blue Planet,,2001,[UK]


# **Data Aggregation**

## **Movie Count by Genre**

In [8]:
# Number of movies in each genre
client.sample_mflix['movies'].find({},{"genres":1}).distinct("genres")

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western']

In [9]:
#  Count of movies in the database by distinct genres
# db.movies.aggregate([
#     {$unwind: "$genres"},
#     {$group: {_id: "$genres", count: { $sum: 1 }}},
#     {$sort: {count: -1}}
#     ])

SyntaxError: invalid syntax (3791244610.py, line 3)