# MongoDB Part 2

In [6]:
from pymongo import MongoClient

In [7]:
# Connect to the mongod instance running on localhost, port 27017.
client = MongoClient('localhost', 27017)
# Other ways to address the same server:
# client = MongoClient('localhost:27017')
# client = MongoClient('mongodb://localhost:27017/')

In [8]:
# Get a handle on the 'phc7065' database.
# MongoDB creates new databases implicitly upon their first use (lazy
# creation), so nothing has to exist on the server yet.
mydb = client['phc7065']
# Attribute-style access is an alternative spelling:
# mydb = client.phc7065

In [9]:
# Each MongoDB document is written as a plain Python dictionary.
# '_id' is set explicitly, 'courses' holds an array, and 'college' holds the
# name of a college as a plain string.
student1 = {
    "_id": 1,
    "name": "John",
    "hometown": "Gainesville, Florida",
    "age": 20,
    "courses": ["a", "b", "c"],
    "department": "a",
    "college": "a",
}

student2 = {
    "_id": 2,
    "name": "Mike",
    "hometown": "Gainesville, Georgia",
    "age": 23,
    "courses": ["b", "a", "c"],
    "department": "b",
    "college": "b",
}

student3 = {
    "_id": 3,
    "name": "Tom",
    "hometown": "Miami, Florida",
    "age": 19,
    "courses": ["c", "b", "a"],
    "department": "b",
    "college": "b",
}

student4 = {
    "_id": 4,
    "name": "Joe",
    "hometown": "Jacksonville, Florida",
    "age": 19,
    "courses": ["d", "e"],
    "department": "a",
    "college": "c",
}

In [10]:
# Empty the collection first so the cell can be re-run: the documents carry
# fixed '_id' values.
students_coll = mydb.Students2
students_coll.drop()
# Insert all four student documents in one call; the result object is the
# cell's displayed value.
students_coll.insert_many([student1, student2, student3, student4])

<pymongo.results.InsertManyResult at 0x7fe9781c7050>

In [11]:
# Show every stored document. A parenthesised single-argument print gives the
# same output on Python 2 and also runs on Python 3.
for student in mydb.Students2.find():
    print(student)

{u'name': u'John', u'hometown': u'Gainesville, Florida', u'age': 20, u'courses': [u'a', u'b', u'c'], u'college': u'a', u'department': u'a', u'_id': 1}
{u'name': u'Mike', u'hometown': u'Gainesville, Georgia', u'age': 23, u'courses': [u'b', u'a', u'c'], u'college': u'b', u'department': u'b', u'_id': 2}
{u'name': u'Tom', u'hometown': u'Miami, Florida', u'age': 19, u'courses': [u'c', u'b', u'a'], u'college': u'b', u'department': u'b', u'_id': 3}
{u'name': u'Joe', u'hometown': u'Jacksonville, Florida', u'age': 19, u'courses': [u'd', u'e'], u'college': u'c', u'department': u'a', u'_id': 4}


## Counting and Distinct

In [12]:
# Count the documents that contain an 'age' field.
# NOTE(review): Collection.count is deprecated in newer PyMongo releases in
# favour of count_documents; confirm against the installed driver version.
has_age = {'age': {'$exists': True}}
mydb.Students2.count(has_age)

4

In [13]:
# Count the documents that contain an 'age' field.
# A filter has to be a query document (dict). The bare string 'age' used here
# before is not one, so the 4 it returned did not come from an 'age' filter;
# presumably every document was counted. TODO confirm on the server in use.
mydb.Students2.count({'age': {'$exists': True}})

4

In [14]:
# Number of different 'age' values stored in the collection.
unique_ages = mydb.Students2.distinct('age')
len(unique_ages)

3

## Aggregation

In [15]:
# Average age per department. The '$' prefix makes '$department' and '$age'
# references to document fields.
res = mydb.Students2.aggregate([
    {'$group': {'_id': '$department', 'meanAge': {'$avg': '$age'}}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'_id': u'b', u'meanAge': 21.0}
{u'_id': u'a', u'meanAge': 19.5}


In [16]:
# Counter-example: the '$' prefixes are left out on purpose.
# 'department' and 'age' are then plain string constants rather than field
# references, so all documents land in a single group whose _id is the
# string 'department', and $avg has nothing numeric to average (meanAge is
# None).
res = mydb.Students2.aggregate([
    {'$group': {'_id': 'department', 'meanAge': {'$avg': 'age'}}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'_id': u'department', u'meanAge': None}


In [17]:
# Keep only students whose 'courses' array contains 'a' (written with $in),
# then average their ages per department.
res = mydb.Students2.aggregate([
    {'$match': {'courses': {'$in': ['a']}}},
    {'$group': {'_id': '$department', 'meanAge': {'$avg': '$age'}}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'_id': u'b', u'meanAge': 21.0}
{u'_id': u'a', u'meanAge': 20.0}


In [18]:
# Same filter in its short form: matching an array field against a scalar
# selects documents whose array contains that value, so the result equals
# the $in version.
res = mydb.Students2.aggregate([
    {'$match': {'courses': 'a'}},
    {'$group': {'_id': '$department', 'meanAge': {'$avg': '$age'}}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'_id': u'b', u'meanAge': 21.0}
{u'_id': u'a', u'meanAge': 20.0}


In [19]:
# Multi-attribute aggregation: group on the (department, college) pair by
# using a sub-document as the group _id, then order the groups by ascending
# mean age.
res = mydb.Students2.aggregate([
    {'$group': {'_id': {'department': '$department', 'college': '$college'},
                'meanAge': {'$avg': '$age'}}},
    {'$sort': {'meanAge': 1}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'_id': {u'department': u'a', u'college': u'c'}, u'meanAge': 19.0}
{u'_id': {u'department': u'a', u'college': u'a'}, u'meanAge': 20.0}
{u'_id': {u'department': u'b', u'college': u'b'}, u'meanAge': 21.0}


In [20]:
# Aggregation with text search.
from pymongo import TEXT

# $text queries need a text index; build one on 'hometown'.
mydb.Students2.create_index([('hometown', TEXT)])

# Pipeline: match documents whose hometown contains any of the search terms,
# order them by text-match score, and keep only hometown and name (_id is
# included by default).
res = mydb.Students2.aggregate([
    {'$match': {'$text': {'$search': 'Miami Florida'}}},
    {'$sort': {'score': {'$meta': 'textScore'}}},
    {'$project': {'hometown': 1, 'name': 1}}
])

# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)


{u'hometown': u'Miami, Florida', u'_id': 3, u'name': u'Tom'}
{u'hometown': u'Gainesville, Florida', u'_id': 1, u'name': u'John'}
{u'hometown': u'Jacksonville, Florida', u'_id': 4, u'name': u'Joe'}


## Join

In [21]:
# Sample college documents for the join examples. Each 'name' value is the
# kind of string stored in a student's 'college' field.
college1 = {
    "_id": 1,
    "name": "a",
    "nStudents": 300,
}

college2 = {
    "_id": 2,
    "name": "b",
    "nStudents": 400,
}

college3 = {
    "_id": 3,
    "name": "c",
    "nStudents": 500,
}

college4 = {
    "_id": 4,
    "name": "d",
    "nStudents": 600,
}

In [22]:
# Empty the collection first so the cell can be re-run: the documents carry
# fixed '_id' values.
colleges_coll = mydb.Colleges
colleges_coll.drop()
# Insert all four college documents in one call; the result object is the
# cell's displayed value.
colleges_coll.insert_many([college1, college2, college3, college4])

<pymongo.results.InsertManyResult at 0x7fe9781551b8>

In [23]:
# List the student documents again for reference before joining. A
# parenthesised single-argument print gives the same output on Python 2 and
# also runs on Python 3.
for student in mydb.Students2.find():
    print(student)

{u'name': u'John', u'hometown': u'Gainesville, Florida', u'age': 20, u'courses': [u'a', u'b', u'c'], u'college': u'a', u'department': u'a', u'_id': 1}
{u'name': u'Mike', u'hometown': u'Gainesville, Georgia', u'age': 23, u'courses': [u'b', u'a', u'c'], u'college': u'b', u'department': u'b', u'_id': 2}
{u'name': u'Tom', u'hometown': u'Miami, Florida', u'age': 19, u'courses': [u'c', u'b', u'a'], u'college': u'b', u'department': u'b', u'_id': 3}
{u'name': u'Joe', u'hometown': u'Jacksonville, Florida', u'age': 19, u'courses': [u'd', u'e'], u'college': u'c', u'department': u'a', u'_id': 4}


In [24]:
# List the college documents. A parenthesised single-argument print gives the
# same output on Python 2 and also runs on Python 3.
for col in mydb.Colleges.find():
    print(col)

{u'nStudents': 300, u'_id': 1, u'name': u'a'}
{u'nStudents': 400, u'_id': 2, u'name': u'b'}
{u'nStudents': 500, u'_id': 3, u'name': u'c'}
{u'nStudents': 600, u'_id': 4, u'name': u'd'}


In [25]:
# Join from the Colleges side: for each college, attach as 'students' the
# Students2 documents whose 'college' equals this college's 'name'.
# Colleges without a matching student are kept with an empty list.
res = mydb.Colleges.aggregate([
    {
        '$lookup': {
            'from': 'Students2',
            'localField': 'name',
            'foreignField': 'college',
            'as': 'students'
        }
    }
])
# print(...) with one argument behaves the same on Python 2 and Python 3.
for r in res:
    print(r)

{u'students': [{u'name': u'John', u'hometown': u'Gainesville, Florida', u'age': 20, u'courses': [u'a', u'b', u'c'], u'college': u'a', u'department': u'a', u'_id': 1}], u'nStudents': 300, u'_id': 1, u'name': u'a'}
{u'students': [{u'name': u'Mike', u'hometown': u'Gainesville, Georgia', u'age': 23, u'courses': [u'b', u'a', u'c'], u'college': u'b', u'department': u'b', u'_id': 2}, {u'name': u'Tom', u'hometown': u'Miami, Florida', u'age': 19, u'courses': [u'c', u'b', u'a'], u'college': u'b', u'department': u'b', u'_id': 3}], u'nStudents': 400, u'_id': 2, u'name': u'b'}
{u'students': [{u'name': u'Joe', u'hometown': u'Jacksonville, Florida', u'age': 19, u'courses': [u'd', u'e'], u'college': u'c', u'department': u'a', u'_id': 4}], u'nStudents': 500, u'_id': 3, u'name': u'c'}
{u'students': [], u'nStudents': 600, u'_id': 4, u'name': u'd'}


In [26]:
# Join from the Students2 side: for each student, attach as 'colleges' the
# Colleges documents whose 'name' equals the student's 'college'.
res = mydb.Students2.aggregate([
    {
        '$lookup': {
            'from': 'Colleges',
            'localField': 'college',
            'foreignField': 'name',
            'as': 'colleges'
        }
    }
])
# print(...) with one argument behaves the same on Python 2 and Python 3.
# Note that this loop consumes the cursor held in `res`.
for r in res:
    print(r)

{u'colleges': [{u'nStudents': 300, u'_id': 1, u'name': u'a'}], u'name': u'John', u'hometown': u'Gainesville, Florida', u'age': 20, u'courses': [u'a', u'b', u'c'], u'college': u'a', u'department': u'a', u'_id': 1}
{u'colleges': [{u'nStudents': 400, u'_id': 2, u'name': u'b'}], u'name': u'Mike', u'hometown': u'Gainesville, Georgia', u'age': 23, u'courses': [u'b', u'a', u'c'], u'college': u'b', u'department': u'b', u'_id': 2}
{u'colleges': [{u'nStudents': 400, u'_id': 2, u'name': u'b'}], u'name': u'Tom', u'hometown': u'Miami, Florida', u'age': 19, u'courses': [u'c', u'b', u'a'], u'college': u'b', u'department': u'b', u'_id': 3}
{u'colleges': [{u'nStudents': 500, u'_id': 3, u'name': u'c'}], u'name': u'Joe', u'hometown': u'Jacksonville, Florida', u'age': 19, u'courses': [u'd', u'e'], u'college': u'c', u'department': u'a', u'_id': 4}


In [27]:
import pandas as pd

# NOTE(review): `res` is the cursor that the print loop in the previous cell
# already iterated, so presumably no rows are left here and the DataFrame
# comes out empty. Re-run the aggregation to get a fresh cursor.
remaining_rows = list(res)
pd.DataFrame(remaining_rows)

In [28]:
# Run the student-to-college join again to obtain a fresh cursor, then
# materialise all of its documents into a DataFrame (one row per student).
college_lookup_stage = {
    '$lookup': {
        'from': 'Colleges',
        'localField': 'college',
        'foreignField': 'name',
        'as': 'colleges'
    }
}
res = mydb.Students2.aggregate([college_lookup_stage])
joined_rows = list(res)
pd.DataFrame(joined_rows)

Unnamed: 0,_id,age,college,colleges,courses,department,hometown,name
0,1,20,a,"[{u'nStudents': 300, u'_id': 1, u'name': u'a'}]","[a, b, c]",a,"Gainesville, Florida",John
1,2,23,b,"[{u'nStudents': 400, u'_id': 2, u'name': u'b'}]","[b, a, c]",b,"Gainesville, Georgia",Mike
2,3,19,b,"[{u'nStudents': 400, u'_id': 2, u'name': u'b'}]","[c, b, a]",b,"Miami, Florida",Tom
3,4,19,c,"[{u'nStudents': 500, u'_id': 3, u'name': u'c'}]","[d, e]",a,"Jacksonville, Florida",Joe


## Import JSON files to MongoDB

In [29]:
# Drop any existing 'tweets' collection so a re-run starts from an empty one.
mydb.tweets.drop()
# IPython shell escape: run the mongoimport command-line tool to load the
# JSON file tweet.json (relative path) into collection 'tweets' of database
# 'phc7065'.
!mongoimport --db phc7065 --collection tweets --type json --file tweet.json

2018-02-15T15:38:35.014-0500	connected to: localhost
2018-02-15T15:38:35.338-0500	imported 3207 documents


In [30]:
# Fetch a single imported tweet to inspect the document structure.
sample_tweet = mydb.tweets.find_one()
sample_tweet

{u'_id': ObjectId('5a85efcbd42fa31c50979466'),
 u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Aug 23 12:56:46 +0000 2016',
 u'entities': {u'hashtags': [],
  u'symbols': [],
  u'urls': [],
  u'user_mentions': [{u'id': 1367531,
    u'id_str': u'1367531',
    u'indices': [72, 80],
    u'name': u'Fox News',
    u'screen_name': u'FoxNews'}]},
 u'favorite_count': 7826,
 u'favorited': False,
 u'geo': None,
 u'id': 768069472464666624L,
 u'id_str': u'768069472464666624',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'is_quote_status': False,
 u'lang': u'en',
 u'place': None,
 u'retweet_count': 2547,
 u'retweeted': False,
 u'source': u'<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 u'text': u'I am now in Texas doing a big fundraiser for the Republican Party and a @FoxNews Special on the BORDER and with victim