# Parse fields in (cleaned) updated data
Author: Daheng Wang  
Last modified: 20170425

## Road Map
1. Parse necessary fields in cleaned updated data
2. Build a new collection of unique users information

### Initialization

In [1]:
import pymongo
import codecs
import os
import json
from pymongo import IndexModel, ASCENDING, DESCENDING
import importlib
from pprint import pprint
import multiprocessing
import logging
import datetime
import shelve

import mongodb # module for setting up connection with (local) MongoDB database
import multiprocessing_workers # module for splitting workloads between processes
import utilities # module for various custom utility functions

In [2]:
# Names of the MongoDB database and collections referenced by the cells of this notebook.
DB_NAME = 'tweets_ek' # database for tweets collected on expanded keywords
RAW_COLLECTION_NAME = 'c1' # collection for raw data
UPDATED_COLLECTION_NAME = 'c2' # collection for updated data

# Handle for the updated-data collection, obtained through the project's mongodb helper module.
updated_data = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COLLECTION_NAME)

# NOTE(review): the disabled lines below reference EN_UPDATED_COLLETION_NAME / NONEN_UPDATED_COLLETION_NAME,
# which are not defined in this cell -- define them before re-enabling.
# en_updated_data = mongodb.initialize(db_name=DB_NAME, collection_name=EN_UPDATED_COLLETION_NAME)
# nonen_updated_data = mongodb.initialize(db_name=DB_NAME, collection_name=NONEN_UPDATED_COLLETION_NAME)
# db = mongodb.initialize_db(db_name=DB_NAME)

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


### Parse necessary fields in cleaned updated data

#### Parse 'created_at' field

The 'created_at' field of tweets received from the Twitter API contains a fixed-format string representing the datetime information  
Example: ```Tue Feb 07 04:59:37 +0000 2017```  
This default string representation of datetime cannot be processed efficiently by MongoDB, especially in aggregation operations. We parse it into a universal datetime representation: a Unix timestamp (in seconds, as expected by `datetime.fromtimestamp`), which Step 3 converts back into a native datetime object when importing into MongoDB.

_Step 1_ check how many tweets have 'timestamp_ms' field

In [3]:
# One-off check, disabled by the always-false guard; change the condition to re-run it.
if 0 == 1:
    # number of documents in the updated collection
    total_n = updated_data.count()
    # count only documents that carry a 'timestamp_ms' field; %time reports how long the query takes
    %time timestamp_ms_n = updated_data.count(filter={'timestamp_ms': {'$exists': True}})
    # NOTE(review): the ratio divides by total_n, so an empty collection would raise ZeroDivisionError
    print("Tweets with 'timestamp_ms' field: {} ({})".format(timestamp_ms_n, timestamp_ms_n / total_n))

**NOTE Tweets received from Streaming API contain 'timestamp_ms' field. Tweets queried from REST API do not have 'timestamp_ms' field.**

_Step 2_ use multiple ~~threads~~ (see http://www.dabeaz.com/GIL/ and https://jeffknupp.com/blog/2013/06/30/pythons-hardest-problem-revisited/ for Python GIL problem) **processes** to compute concurrently

Besides, worker functions are wrapped in multiprocessing_workers.py file (see https://pymotw.com/3/multiprocessing/basics.html)

In [4]:
%%time
# Fan the 'created_at' parsing out over several worker processes; each worker
# receives its batch index, the total number of batches and the intermediate
# file it should write to. Disabled by the always-false guard below.
# `inter_files` is defined unconditionally because the import cell reads it.
inter_files = []
if 0 == 1:
    procedure_name = 'parse_{}_created_at'.format(UPDATED_COLLECTION_NAME)

    multiprocessing.log_to_stderr(logging.DEBUG)
    # Leave one CPU free, but never drop to zero workers: on a single-CPU host
    # cpu_count() - 1 would be 0 and nothing would be processed at all.
    process_n = max(1, multiprocessing.cpu_count() - 1)
    suffix = 'json'
    inter_files = utilities.gen_inter_filenames_list(procedure_name, process_n, suffix)

    jobs = [
        multiprocessing.Process(target=multiprocessing_workers.worker_parse_created_at,
                                args=(DB_NAME, UPDATED_COLLECTION_NAME, batch_i, process_n, inter_files[batch_i]),
                                name='Process-{}/{}'.format(batch_i, process_n))
        for batch_i in range(process_n)
    ]

    # start every worker before waiting on any of them so they run concurrently
    for job in jobs:
        job.start()

    for job in jobs:
        job.join()

    # A worker that crashed leaves a partial intermediate file behind; fail
    # loudly here rather than letting the import step ingest incomplete data.
    failed_jobs = [job.name for job in jobs if job.exitcode != 0]
    if failed_jobs:
        raise RuntimeError('Worker process(es) failed: {}'.format(', '.join(failed_jobs)))

[INFO/Process-0/11] child process calling self.run()
[INFO/Process-1/11] child process calling self.run()
[INFO/Process-2/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


[INFO/Process-3/11] child process calling self.run()


Process0/11 handling documents 0 to 458506...


[INFO/Process-4/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


[INFO/Process-5/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


[INFO/Process-6/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


[INFO/Process-7/11] child process calling self.run()
[INFO/Process-8/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!


[INFO/Process-9/11] child process calling self.run()
[INFO/Process-10/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
Process1/11 handling documents 458507 to 917013...
Process2/11 handling documents 917014 to 1375520...
Process3/11 handling documents 1375521 to 1834027...
Process4/11 handling documents 1834028 to 2292534...
Process5/11 handling documents 2292535 to 2751041...
Process6/11 handling documents 2751042 to 3209548...
Process7/11 handling documents 3209549 to 3668055...
Process8/11 handling documents 3668056 to 4126562...
Process9/11 handling documents 4126563 to 4585069...
Process10/11 handling documents 4585070 to 5043587...


[INFO/Process-0/11] process shutting down
[DEBUG/Process-0/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-0/11] running the remaining "atexit" finalizers
[INFO/Process-0/11] process exiting with exitcode 0
[INFO/Process-1/11] process shutting down
[DEBUG/Process-1/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-1/11] running the remaining "atexit" finalizers
[INFO/Process-1/11] process exiting with exitcode 0
[INFO/Process-2/11] process shutting down
[DEBUG/Process-2/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-2/11] running the remaining "atexit" finalizers
[INFO/Process-2/11] process exiting with exitcode 0
[INFO/Process-3/11] process shutting down
[DEBUG/Process-3/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-3/11] running the remaining "atexit" finalizers
[INFO/Process-3/11] process exiting with exitcode 0
[INFO/Process-4/11] process shutting down
[DEBUG/Process-4/11] running all "atexit" 

CPU times: user 96 ms, sys: 92 ms, total: 188 ms
Wall time: 2min 39s


_Step 3_ Import all parsed data back into a new database collection

In [8]:
%%time
# Name of the collection that receives the parsed 'created_at' values.
PARSED_CREATED_AT_COLLECTION = 'c2_parsed_created_at'
# Load every intermediate file listed in `inter_files` (populated by the
# multiprocessing cell above) and bulk-insert its records into the collection.
# Disabled by the always-false guard below.
if 0 == 1:
    parsed_created_at_col = mongodb.initialize(db_name=DB_NAME, collection_name=PARSED_CREATED_AT_COLLECTION)
    for inter_file in inter_files:
        print('Reading {}...'.format(inter_file), end=' ')
        # Read inside a context manager so the file handle is always closed,
        # and skip blank lines, which json.loads would reject.
        with open(inter_file, 'r') as f:
            parsed_jsons = [json.loads(line) for line in f if line.strip()]

        # it's important to reconstruct datetime.datetime obj back
        # otherwise, the 'created_at_parsed' field cannot be imported into MongoDB
        # http://api.mongodb.com/python/1.3/tutorial.html
        # NOTE(review): fromtimestamp() returns a naive datetime in the machine's local
        # time zone -- confirm this matches how the worker produced the timestamp.
        reconstructed_jsons = [{'id': int(parsed_json['id']),
                                'created_at_parsed': datetime.datetime.fromtimestamp(parsed_json['created_at_parsed'])}
                               for parsed_json in parsed_jsons]
        print('Importing into {}.{}...'.format(DB_NAME, PARSED_CREATED_AT_COLLECTION))
        # insert_many rejects an empty document list, so skip empty intermediate files
        if reconstructed_jsons:
            parsed_created_at_col.insert_many(reconstructed_jsons)
    print('Done')

MongoDB on localhost:27017/tweets_ek.c2_parsed_created_at connected successfully!
Reading inter/parse_c2_created_at-0.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-1.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-2.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-3.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-4.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-5.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-6.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-7.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-8.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/parse_c2_created_at-9.json...
Importing into tweets_ek.c2_parsed_created_at
Reading inter/pars

Check the new collection size and print a sample.

In [11]:
# Sanity check of the freshly imported collection: report its document count
# and pretty-print one document. Disabled by the always-false guard.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases;
# count_documents({}) is the replacement if the driver is upgraded.
if 0 == 1:
    parsed_created_at_col = mongodb.initialize(
        db_name=DB_NAME,
        collection_name=PARSED_CREATED_AT_COLLECTION,
    )
    collection_size = parsed_created_at_col.count()
    print('Collection {} size: {}'.format(PARSED_CREATED_AT_COLLECTION, collection_size))
    print('Sample document:')
    sample_document = parsed_created_at_col.find_one()
    pprint(sample_document)

MongoDB on localhost:27017/tweets_ek.c2_parsed_created_at connected successfully!
Collection c2_parsed_created_at size: 5043587
Sample document:
{'_id': ObjectId('58fe568cfe57a16f94c04911'),
 'created_at_parsed': datetime.datetime(2017, 3, 6, 1, 8, 5),
 'id': 838632256252088320}




#### Parse 'user.description' field for keyword 'ibm'
Test whether the keyword 'ibm' exists in the 'user.description' field. If yes, we consider that the user shows explicit affiliation with IBM; if no, we do not know whether the user is affiliated with IBM.

#### Parse 'root.text' field for different topics (keywords)

### Build a new collection of unique users information

_Step 1_ Get the set of unique user ids

In [3]:
# Path of the shelve file used to persist the set of unique user ids; it is
# written by the next cell and also passed to the worker processes in Step 2.
# NOTE(review): depending on the dbm backend, shelve may append its own suffix
# to this filename -- confirm the actual on-disk name.
unique_user_ids_shl = os.path.join('data', 'unique_user_ids.db')
# Key under which the id set is stored inside the shelve.
unique_user_ids_key = 'unique_user_ids'

In [4]:
# Collect the 'user.id' of every document in the updated collection, reduce the
# ids to a set and persist that set in a shelve file so later cells (and the
# worker processes) can load it. Disabled by the always-false guard.
if 0 == 1:
    print('Querying MongoDB for unique user ids...')
    # project only the nested user id to keep the transferred documents small
    cursor = updated_data.find(projection={'_id': 0, 'user.id': 1})
    unique_user_ids_int64_list = [int(document['user']['id']) for document in cursor]

    print('Building unique user ids set from list...')
    unique_user_ids_int64_set = set(unique_user_ids_int64_list)

    # write out to shelve
    print('Writing out user ids set to shelve {} size {}'.format(unique_user_ids_shl, len(unique_user_ids_int64_set)))
    with shelve.open(unique_user_ids_shl) as shelf:
        # assigning to an existing key replaces whatever was stored there before
        shelf[unique_user_ids_key] = unique_user_ids_int64_set
    print('Done')

Querying MongoDB for unique user ids...
Building unique user ids set from list...
Writing out user ids set to shelve data/unique_user_ids.db...
Done
CPU times: user 1min, sys: 1.12 s, total: 1min 1s
Wall time: 4min 14s


_Step 2_ For each unique user id in the set, (multiprocessing) query database and write out to intermediate files

In [4]:
%%time
# Query the database for each unique user id in parallel: every worker process
# receives its batch index, the total number of batches, the intermediate file
# to write and the shelve path/key holding the id set. Disabled by the
# always-false guard below.
# `inter_files` is defined unconditionally because the import cell reads it.
inter_files = []
if 0 == 1:
    # generate intermediate filenames
    procedure_name = 'get_{}_unique_user_ids'.format(UPDATED_COLLECTION_NAME)

    # Leave one CPU free, but never drop to zero workers: on a single-CPU host
    # cpu_count() - 1 would be 0 and nothing would be processed at all.
    process_n = max(1, multiprocessing.cpu_count() - 1)
    suffix = 'json'
    inter_files = utilities.gen_inter_filenames_list(procedure_name, process_n, suffix)

    jobs = [
        multiprocessing.Process(target=multiprocessing_workers.worker_get_unique_user,
                                args=(DB_NAME, UPDATED_COLLECTION_NAME,
                                      batch_i, process_n, inter_files[batch_i],
                                      unique_user_ids_shl, unique_user_ids_key),
                                name='Process-{}/{}'.format(batch_i, process_n))
        for batch_i in range(process_n)
    ]

    # start every worker before waiting on any of them so they run concurrently
    for job in jobs:
        job.start()

    for job in jobs:
        job.join()

    # A worker that crashed leaves a partial intermediate file behind; fail
    # loudly here rather than letting the import step ingest incomplete data.
    failed_jobs = [job.name for job in jobs if job.exitcode != 0]
    if failed_jobs:
        raise RuntimeError('Worker process(es) failed: {}'.format(', '.join(failed_jobs)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
Process0/11 querying users 0 to 76788...
Process1/11 querying users 76788 to 153576...
Process2/11 querying users 153576 to 230364...
Process3/11 querying users 230364 to 307152...
Process4/11 querying users 307152 to 383940...
Process5/11 querying users 383940 to 460728...
Process6/11 querying 

_Step 3_ Import all unique user data into a new database collection

In [5]:
%%time
# Name of the collection that stores one document per unique user.
USERS_COLLECTION = 'c2_users'
# Read each intermediate file listed in `inter_files` (one JSON document per
# line) and bulk-insert its documents into the users collection.
# Disabled by the always-false guard below.
if 0 == 1:
    user_col = mongodb.initialize(db_name=DB_NAME, collection_name=USERS_COLLECTION)
    for inter_file in inter_files:
        print('Reading {}...'.format(inter_file), end=' ')
        with open(inter_file, 'r') as f:
            parsed_jsons = [json.loads(line) for line in f]
        print('Importing into {}.{}...'.format(DB_NAME, USERS_COLLECTION))
        user_col.insert_many(parsed_jsons)
    print('Done')

MongoDB on localhost:27017/tweets_ek.c2_users connected successfully!
Reading inter/get_c2_unique_user_ids-0.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-1.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-2.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-3.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-4.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-5.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-6.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-7.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-8.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-9.json... Importing into tweets_ek.c2_users...
Reading inter/get_c2_unique_user_ids-10.json... Importing into tweets_ek.c2_users...
Done


Check the new collection size and print a sample.

In [6]:
# Sanity check of the users collection: report its document count and
# pretty-print one document. Disabled by the always-false guard.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases;
# count_documents({}) is the replacement if the driver is upgraded.
if 0 == 1:
    user_col = mongodb.initialize(
        db_name=DB_NAME,
        collection_name=USERS_COLLECTION,
    )
    collection_size = user_col.count()
    print('Collection {} size: {}'.format(USERS_COLLECTION, collection_size))
    print('Sample document:')
    sample_document = user_col.find_one()
    pprint(sample_document)

MongoDB on localhost:27017/tweets_ek.c2_users connected successfully!
Collection c2_users size: 844675
Sample document:
{'_id': ObjectId('58fed783fe57a10b2393c51e'),
 'contributors_enabled': False,
 'created_at': 'Tue Mar 21 20:50:14 +0000 2006',
 'default_profile': False,
 'default_profile_image': False,
 'description': '',
 'entities': {'description': {'urls': []}},
 'favourites_count': 16835,
 'follow_request_sent': False,
 'followers_count': 4028041,
 'following': False,
 'friends_count': 2677,
 'geo_enabled': True,
 'has_extended_profile': True,
 'id': 12,
 'id_str': '12',
 'is_translation_enabled': False,
 'is_translator': False,
 'lang': 'en',
 'listed_count': 27165,
 'location': 'California, USA',
 'name': 'jack',
 'notifications': False,
 'profile_background_color': 'EBEBEB',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme7/bg.gif',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme7/bg.gif',
 'profile_background_tile':