# Parse data stored in a JSON file and save it as a collection in MongoDB using pymongo

The format of "reviews_electronics.16.json" is:

• reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B

• asin - ID of the product, e.g. 0000013714

• reviewerName - name of the reviewer

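• helpful - helpfulness rating of the review, e.g. 2/3 (stored as a two-element array; the code below reads it as helpful[0] and helpful[1], the second number being the number of votes)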

• reviewText - text of the review

• overall - rating of the product

• summary - summary of the review

• unixReviewTime - time of the review


In [1]:
import json

import pymongo
from pymongo import MongoClient

In [4]:
# Client for the MongoDB server addressed by the URI below (localhost, port 27017).
MONGO_URI = "mongodb://localhost:27017/"
client = MongoClient(MONGO_URI)

In [29]:
# Handle to the MongoDB database named "amazon".
database = client.amazon
# Handle to the collection "reviews" inside the database "amazon".
collection = database.reviews

### Read the JSON file and upload each review as a separate document to the collection "reviews" in the DB "amazon".

In [13]:
# Read the JSON file and upload each review as a separate document to the
# collection "reviews" in the DB "amazon".
#
# The file is consumed line by line and every non-blank line is parsed as one
# JSON object. The `with` block guarantees the file handle is closed even if
# parsing or an insert fails part-way through.
with open("reviews_electronics.16.json", encoding="utf-8") as json_file:
    for element in json_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not element.strip():
            continue
        element_dict = json.loads(element)
        collection.insert_one(element_dict)

### Use MongoDB's map-reduce function to build a new collection "avg_scores" that averages review scores by product ("asin")

In [33]:
# Build the collection "avg_scores": one document per product ("asin") whose
# value is the mean of the number emitted for each of its reviews.
#
# MongoDB may call the reduce function several times for the same key, feeding
# earlier reduce outputs back in as inputs, so reduce must return the same
# shape it receives and must not average (a mean of partial means is not the
# overall mean). Map therefore emits a {sum, count} pair, reduce adds pairs
# together, and the division happens exactly once per key in finalize.
#
# NOTE(review): the number being averaged is helpful[0], not the "overall"
# rating, although the section heading speaks of review scores - confirm which
# field is intended.
map_fxn = """function(){emit(this.asin, {sum: this.helpful[0], count: 1});};"""
reduce_fxn = """function(key, values){
    var total = {sum: 0, count: 0};
    values.forEach(function(value) {
        total.sum += value.sum;
        total.count += value.count;
    });
    return total;
};"""
# finalize runs once per key (also for keys that never went through reduce),
# so every output document ends up with a plain number as its value.
finalize_fxn = """function(key, reduced){return reduced.sum / reduced.count;};"""
database.reviews.map_reduce(map_fxn, reduce_fxn, "avg_scores", finalize=finalize_fxn)
# Print the first 100 entries of "avg_scores" to screen.
for record in database.avg_scores.find().limit(100):
    print(record)

{'_id': 'B00HVRB6V2', 'value': 0.0}
{'_id': 'B00FCRH4LE', 'value': 1.0}
{'_id': 'B00HPMCN8S', 'value': 6.666666666666667}
{'_id': 'B00HNXZXYU', 'value': 0.0}
{'_id': 'B00IOQOABM', 'value': 0.0}
{'_id': 'B00EZ8CWVS', 'value': 0.0}
{'_id': 'B00EPDMDZ8', 'value': 10.294117647058824}
{'_id': 'B00I4YTC1M', 'value': 0.0}
{'_id': 'B00I0MKCOO', 'value': 0.6666666666666666}
{'_id': 'B00GZAOAVE', 'value': 0.0}
{'_id': 'B00HUZTX4W', 'value': 1.6666666666666667}
{'_id': 'B00J3UL4QW', 'value': 0.0}
{'_id': 'B00IIP37HC', 'value': 0.0}
{'_id': 'B00F71CQ9K', 'value': 0.0}
{'_id': 'B00FEDQGN8', 'value': 1.0}
{'_id': 'B00JI7QMJ4', 'value': 0.0}
{'_id': 'B00HK8VB0S', 'value': 3.0}
{'_id': 'B00HHZTCEQ', 'value': 0.3333333333333333}
{'_id': 'B00FKCNMSK', 'value': 0.0}
{'_id': 'B00HDPEEH0', 'value': 0.0}
{'_id': 'B00HYKF9XM', 'value': 0.875}
{'_id': 'B00H06VXGC', 'value': 0.0}
{'_id': 'B00EQFIBTW', 'value': 0.5}
{'_id': 'B00GFXTBNS', 'value': 0.8}
{'_id': 'B00HNP59FQ', 'value': 0.3333333333333333}
{'_id': '

### Use MongoDB's map-reduce function to build a new collection "weighted_avg_scores" that averages review scores by product ("asin"), with each review weighted by its number of votes + 1 (the second number of "helpful" + 1).

In [55]:
# Build the collection "weighted_avg_scores": one document per product
# ("asin") holding the weighted mean of helpful[0], where each review's weight
# is helpful[1] + 1.
#
# Map emits the weighted numerator and the weight for a single review.
# NOTE(review): the value being averaged is helpful[0], not the "overall"
# rating, although the section heading speaks of review scores - confirm which
# field is intended.
map_fxn = """function(){emit(this.asin, {num:(this.helpful[0]*(this.helpful[1]+1)),deno:(this.helpful[1]+1)})};"""

# Reduce only accumulates and returns the same {num, deno} shape that map
# emits. MongoDB may feed reduce outputs back into reduce for the same key, and
# it skips reduce entirely for keys with a single emitted value, so dividing
# here would yield NaN on a re-reduce and leave single-review products with a
# raw {num, deno} value.
reduce_fxn = """function reduce(key, values) {
    var num = 0;
    var deno = 0;

    values.forEach(function(value) {
        num += value.num;
        deno += value.deno;
    });

    return {num: num, deno: deno};
};"""

# Finalize runs once per key after all reducing is done; the division lives
# here so every output document has the shape {weighted_avg_scores: <number>}.
finalize_fxn = """function finalize(key, reduced) {
    return {weighted_avg_scores: reduced.num / reduced.deno};
};"""

database.reviews.map_reduce(
    map_fxn, reduce_fxn, "weighted_avg_scores", finalize=finalize_fxn
)

# Print the first 100 entries of "weighted_avg_scores" to screen.
for record in database.weighted_avg_scores.find().limit(100):
    print(record)

{'_id': 'B00IZ4LXXG', 'value': {'weighted_avg_scores': 1.0}}
{'_id': 'B00FP3X8GA', 'value': {'weighted_avg_scores': 2.0}}
{'_id': 'B00EOX4T74', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00GURPEVC', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00F07T7OI', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00GWLLRMQ', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00EEEPPJO', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00F3I7HNM', 'value': {'weighted_avg_scores': 1.625}}
{'_id': 'B00E9JLFX4', 'value': {'weighted_avg_scores': 0.6}}
{'_id': 'B00GN940GM', 'value': {'weighted_avg_scores': 0.6666666666666666}}
{'_id': 'B00G3R2XR2', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00KMVJR60', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00EQ1V4XQ', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00GLLAUIO', 'value': {'weighted_avg_scores': 0.0}}
{'_id': 'B00HXE4H28', 'value': {'weighted_avg_scores': 6.8}}
{'_id': 'B00J4QLBXG', 'value': {'weighted_avg_scores': 0.0}}
{'_id':