In [34]:
import os
import json
import pymongo
from pymongo import InsertOne, DeleteMany
from pymongo.errors import BulkWriteError
from bz2 import BZ2File as bzopen

In [2]:
# Database and collection names for the persistence data set.
database_name = "persistence_db"
persistence_files = "persistence_files"
persistence_revisions = "persistence_revisions"
persistence_file_pages = "persistence_file_pages"

# Open a client against the local MongoDB server and take one handle per collection.
client = pymongo.MongoClient("localhost", 27017)
db = client[database_name]
persistence_revisions_collection = db[persistence_revisions]
persistence_files_collection = db[persistence_files]
persistence_file_pages_collection = db[persistence_file_pages]

In [None]:
# persistence_revisions_collection.count()

In [None]:
# persistence_revisions_collection.find_one({ '_id': 363038388 })

## Loading Wikipedia articles into MongoDB

In [68]:
# NOTE: this rebinds `database_name`, `client` and `db`; from here on `db`
# refers to the "enwiki" database.
database_name = "enwiki"
pages_collection_name = 'pages'
client = pymongo.MongoClient("localhost", 27017)
db = client[database_name]
pages_collection = db[pages_collection_name]

In [86]:
def process_file(file_name):
    """Load one bz2-compressed JSON-lines file and store its articles.

    The file name is printed, then every non-blank line of the decompressed
    file is parsed as a JSON document. The parsed documents are passed to
    ``save_bulk`` as a single batch.

    Args:
        file_name: Path of the bz2 file to read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print(file_name)
    with bzopen(file_name, "r") as bzfin:
        # Blank / whitespace-only lines carry no document; skip them rather
        # than letting json.loads fail on them.
        articles = [json.loads(line) for line in bzfin if line.strip()]
    save_bulk(articles)
        
def save_bulk(articles):
    """Insert a batch of parsed articles into ``pages_collection``.

    Each article becomes one document whose ``_id`` is the article's ``id``
    converted to ``int``; only the ``title``, ``text`` and ``url`` fields are
    copied alongside it. Errors raised by the bulk write are printed, not
    re-raised.

    Args:
        articles: List of dicts, each with ``id``, ``title``, ``text`` and
            ``url`` keys.

    Raises:
        KeyError: If an article lacks one of the required keys.
        ValueError: If an article's ``id`` cannot be converted to ``int``.
    """
    if not articles:
        # bulk_write rejects an empty request list, so report the empty
        # batch directly instead of triggering (and printing) that error.
        print('Total articles: 0\tTotal inserted: 0')
        return

    requests = [
        InsertOne({'_id': int(article['id']),
                   'title': article['title'],
                   'text': article['text'],
                   'url': article['url']})
        for article in articles
    ]
    try:
        result = pages_collection.bulk_write(requests)
        print('Total articles: {}\tTotal inserted: {}'.format(len(articles), result.inserted_count))
    except BulkWriteError as bwe:
        # Per-operation failures (e.g. duplicate _id) are reported in details.
        print(bwe.details)
    except Exception as err:
        print(err)

In [87]:
# Walk the dump directory tree and load every file found in it.
for dir_path, _sub_dirs, file_names in os.walk(r'D:/data/enwiki'):
    for name in file_names:
        full_path = os.path.join(dir_path, name)
        process_file(file_name=full_path)

D:/data/enwiki\AA\wiki_00.bz2
Total articles: 228	Total inserted: 228
D:/data/enwiki\AA\wiki_01.bz2
Total articles: 371	Total inserted: 371
D:/data/enwiki\AA\wiki_02.bz2
Total articles: 326	Total inserted: 326
D:/data/enwiki\AA\wiki_03.bz2
Total articles: 411	Total inserted: 411
D:/data/enwiki\AA\wiki_04.bz2
Total articles: 321	Total inserted: 321
D:/data/enwiki\AA\wiki_05.bz2
Total articles: 270	Total inserted: 270
D:/data/enwiki\AA\wiki_06.bz2
Total articles: 300	Total inserted: 300
D:/data/enwiki\AA\wiki_07.bz2
Total articles: 272	Total inserted: 272
D:/data/enwiki\AA\wiki_08.bz2
Total articles: 271	Total inserted: 271
D:/data/enwiki\AA\wiki_09.bz2
Total articles: 309	Total inserted: 309
D:/data/enwiki\AA\wiki_10.bz2
Total articles: 316	Total inserted: 316
D:/data/enwiki\AA\wiki_11.bz2
Total articles: 358	Total inserted: 358
D:/data/enwiki\AA\wiki_12.bz2
Total articles: 303	Total inserted: 303
D:/data/enwiki\AA\wiki_13.bz2
Total articles: 284	Total inserted: 284
D:/data/enwiki\AA\wi

In [85]:
# An empty filter matches every document, so this empties the collection.
pages_collection.delete_many({})

<pymongo.results.DeleteResult at 0x24f16e38948>

In [83]:
# Collection.count() is deprecated (removed in PyMongo 4); count_documents
# with an empty filter returns the number of documents in the collection.
pages_collection.count_documents({})

0

In [96]:
# Fetch the page stored under _id 1000 and show the first 1000 characters of its text.
result = pages_collection.find_one(filter={'_id': 1000})
text_preview = result['text'][:1000]
print(text_preview)

Hercule Poirot

Hercule Poirot (, ; ) is a fictional Belgian detective, created by Agatha Christie. Poirot is one of Christie's most famous and long-lived characters, appearing in 33 novels, one play ("Black Coffee"), and more than 50 short stories published between 1920 and 1975.

Poirot has been portrayed on radio, in film and on television by various actors, including Austin Trevor, John Moffatt, Albert Finney, Sir Peter Ustinov, Sir Ian Holm, Tony Randall, Alfred Molina, Orson Welles, David Suchet and Sir Kenneth Branagh.

Poirot's name was derived from two other fictional detectives of the time: Marie Belloc Lowndes' Hercule Popeau and Frank Howel Evans' Monsieur Poiret, a retired Belgian police officer living in London.

A more obvious influence on the early Poirot stories is that of Arthur Conan Doyle. In "An Autobiography", Christie states, "I was still writing in the Sherlock Holmes tradition – eccentric detective, stooge assistant, with a Lestrade-type Scotland Yard detective

In [112]:
# Same lookup, but project only the title field (plus _id).
pages_collection.find_one(filter={"_id": 1000}, projection={"title": 1})

{'_id': 1000, 'title': 'Hercule Poirot'}

In [117]:
%%time

links = []
counter = 1
while counter < 1000:
    links.append({'ns': 0, 'title': '{}'.format(counter)})
    counter += 1

test_collection = db.get_collection(name='test')
test_collection.insert_one({'_id': 123, 'links': links})

Wall time: 12 ms


In [120]:
# Read the test document back and display its embedded links array.
stored_doc = test_collection.find_one(filter={"_id": 123})
stored_doc['links']


[{'ns': 0, 'title': '1'},
 {'ns': 0, 'title': '2'},
 {'ns': 0, 'title': '3'},
 {'ns': 0, 'title': '4'},
 {'ns': 0, 'title': '5'},
 {'ns': 0, 'title': '6'},
 {'ns': 0, 'title': '7'},
 {'ns': 0, 'title': '8'},
 {'ns': 0, 'title': '9'},
 {'ns': 0, 'title': '10'},
 {'ns': 0, 'title': '11'},
 {'ns': 0, 'title': '12'},
 {'ns': 0, 'title': '13'},
 {'ns': 0, 'title': '14'},
 {'ns': 0, 'title': '15'},
 {'ns': 0, 'title': '16'},
 {'ns': 0, 'title': '17'},
 {'ns': 0, 'title': '18'},
 {'ns': 0, 'title': '19'},
 {'ns': 0, 'title': '20'},
 {'ns': 0, 'title': '21'},
 {'ns': 0, 'title': '22'},
 {'ns': 0, 'title': '23'},
 {'ns': 0, 'title': '24'},
 {'ns': 0, 'title': '25'},
 {'ns': 0, 'title': '26'},
 {'ns': 0, 'title': '27'},
 {'ns': 0, 'title': '28'},
 {'ns': 0, 'title': '29'},
 {'ns': 0, 'title': '30'},
 {'ns': 0, 'title': '31'},
 {'ns': 0, 'title': '32'},
 {'ns': 0, 'title': '33'},
 {'ns': 0, 'title': '34'},
 {'ns': 0, 'title': '35'},
 {'ns': 0, 'title': '36'},
 {'ns': 0, 'title': '37'},
 {'ns': 0,

In [113]:
# An empty filter matches every document, so this empties the test collection.
test_collection.delete_many({})

<pymongo.results.DeleteResult at 0x24f1b1fa2c8>

In [119]:
# Collection.count() is deprecated (removed in PyMongo 4); count_documents
# with an empty filter returns the number of documents in the collection.
test_collection.count_documents({})

1