# Clean updated data and build index
Author: Daheng Wang  
Last modified: 20170422

## Road map
1. Clean updated data
2. Build necessary indexes

### Initialization

In [24]:
import pymongo
import codecs
import os
import json
from pymongo import IndexModel, ASCENDING, DESCENDING
import importlib
from pprint import pprint

import mongodb # module for setting up connection with (local) MongoDB database

In [2]:
# Database and collection names used by this notebook
DB_NAME = 'tweets_ek'  # tweets collected on expanded keywords
RAW_COLLECTION_NAME = 'c1'  # raw data
UPDATED_COLLECTION_NAME = 'c2'  # updated data

# Handle to the updated-data collection, obtained through the local `mongodb` helper
# (presumably a pymongo Collection, judging by the methods called on it below).
updated_data = mongodb.initialize(
    db_name=DB_NAME,
    collection_name=UPDATED_COLLECTION_NAME,
)

MongoDB on localhost:27017 connected successfully!


### Clean updated data
#### Clean tweets with no user field (probably due to server error)
_Step 1_ check how many tweets in the updated dataset have no 'user' field, i.e. no author information

In [3]:
if 0 == 1:
    %time no_user_n = updated_data.count(filter={'user': {'$exists': False}})
    print("Tweets with no 'user' field: {}".format(no_user_n))

CPU times: user 80 ms, sys: 20 ms, total: 100 ms
Wall time: 3min 15s
Tweets with no 'user' field: 0


~~_Step 2_ delete all tweets identified in _Step 1_~~

#### Clean server side messages mixed in tweets
_Step 1_ check how many server-side messages are mixed with tweets in updated database  
Example server side message:
```
{'_id': ObjectId('58be5f60122f5048614f8a2b'), 'limit': {'track': 3, 'timestamp_ms': '1486554726779'}}
```
Unlike tweets, server-side messages have no 'id' or 'id_str' field

In [4]:
if 0 == 1:
    %time messages_n = updated_data.count(filter={'id': {'$exists': False}})
    print('Server side messages: {}'.format(messages_n))

CPU times: user 44 ms, sys: 0 ns, total: 44 ms
Wall time: 1min 59s
Server side messages: 0


~~_Step 2_ delete all server-side messages identified in _Step 1_~~

#### ~~Clean~~ Select out non-English tweets
_Step 1_ check how many tweets are in English/non-English in updated dataset  
English tweets have the 'lang' field equal to 'en'. Non-English tweets have the 'lang' field equal to null/'und'/other_lang_identifier.  
See https://dev.twitter.com/overview/api/tweets

In [5]:
if 0 == 1:
    total_tweets_n = updated_data.count()
    %time en_tweets_n = updated_data.count(filter={'lang': {'$eq': 'en'}})
    print('Total tweets: {}'.format(total_tweets_n))
    print('English tweets: {} ({})'.format(en_tweets_n, en_tweets_n / total_tweets_n))
    non_en_tweets_n = total_tweets_n - en_tweets_n
    print('non-English tweets: {} ({})'.format(non_en_tweets_n, non_en_tweets_n / total_tweets_n))

CPU times: user 32 ms, sys: 12 ms, total: 44 ms
Wall time: 1min 45s
Total tweets: 5043587
English tweets: 3754627 (0.7444358548786806)
non-English tweets: 1288960 (0.2555641451213194)


_Step 2_ separate English and non-English tweets into two new collections in MongoDB database

In [6]:
# Names of the collections that receive the English / non-English split of the updated data
EN_UPDATED_COLLECTION_NAME = 'c2_en'
NONEN_UPDATED_COLLECTION_NAME = 'c2_nonen'

# The original identifiers were misspelled ("COLLETION"); they are kept as aliases
# so that cells still referring to them continue to work.
EN_UPDATED_COLLETION_NAME = EN_UPDATED_COLLECTION_NAME
NONEN_UPDATED_COLLETION_NAME = NONEN_UPDATED_COLLECTION_NAME

In [8]:
if 0 == 1:
    # select out English tweets into a new collection
    en_match_dic = {'$match': {'lang': {'$eq': 'en'}}}
    en_out_dic = {'$out': EN_UPDATED_COLLETION_NAME}
    en_pipeline_list = [en_match_dic, en_out_dic]
    %time updated_data.aggregate(pipeline=en_pipeline_list)

CPU times: user 156 ms, sys: 52 ms, total: 208 ms
Wall time: 10min 30s


Check new collection for English updated tweets

In [13]:
# Open the English collection and report how many documents it holds
en_updated_data = mongodb.initialize(
    db_name=DB_NAME,
    collection_name=EN_UPDATED_COLLETION_NAME,
)
en_updated_n = en_updated_data.count()
en_report = '{} English tweets in new collection: {}'.format(en_updated_n, EN_UPDATED_COLLETION_NAME)
print(en_report)

MongoDB on localhost:27017 connected successfully!
3754627 English tweets in new collection: c2_en


In [10]:
if 0 == 1:
    # select out non-English tweets into another new collection
    nonen_match_dic = {'$match': {'lang': {'$ne': 'en'}}}
    nonen_out_dic = {'$out': NONEN_UPDATED_COLLETION_NAME}
    nonen_pipeline_list = [nonen_match_dic, nonen_out_dic]
    %time updated_data.aggregate(pipeline=nonen_pipeline_list)

CPU times: user 104 ms, sys: 36 ms, total: 140 ms
Wall time: 3min 59s


Check new collection for non-English updated tweets

In [14]:
# Open the non-English collection and report how many documents it holds
nonen_updated_data = mongodb.initialize(
    db_name=DB_NAME,
    collection_name=NONEN_UPDATED_COLLETION_NAME,
)
nonen_updated_n = nonen_updated_data.count()
nonen_report = '{} non-English tweets in new collection: {}'.format(nonen_updated_n, NONEN_UPDATED_COLLETION_NAME)
print(nonen_report)

MongoDB on localhost:27017 connected successfully!
1288960 non-English tweets in new collection: c2_nonen


### Build necessary indexes
Build the necessary indexes on the updated collections to speed up queries.  
This takes quite a long time, so make sure to set ```background=True``` to build the indexes in the background.

In [16]:
# IndexModel instances for tweets
id_index = IndexModel([('id', ASCENDING)], background=True)
id_str_index = IndexModel([('id_str', ASCENDING)], background=True)

# IndexModel instances for users
user_id_index = IndexModel([('user.id', ASCENDING)], background=True)
user_id_str_index = IndexModel([('user.id_str', ASCENDING)], background=True)
user_screen_name_index = IndexModel([('user.screen_name', ASCENDING)], background=True)

indexes_list = [id_index, id_str_index, user_id_index, user_id_str_index, user_screen_name_index]

#### Build indexes on cleaned updated data collection

In [None]:
# Always-false guard: the body is skipped; change the condition to build the indexes.
if False:
    # Create every index in indexes_list on the updated-data collection
    updated_data.create_indexes(indexes=indexes_list)

#### Build indexes on English cleaned updated data collection

In [None]:
# Always-false guard: the body is skipped; change the condition to build the indexes.
if False:
    # Create every index in indexes_list on the English collection
    en_updated_data.create_indexes(indexes=indexes_list)

#### Build indexes on non-English cleaned updated data collection

In [None]:
# Always-false guard: the body is skipped; change the condition to build the indexes.
if False:
    # Create every index in indexes_list on the non-English collection
    nonen_updated_data.create_indexes(indexes=indexes_list)

#### Make sure indexes are created successfully
Check current MongoDB operations on database level

In [None]:
# Always-false guard: the body is skipped; change the condition to inspect running operations.
if False:
    # Left commented out in the original: reloads the local `mongodb` module.
    # m = importlib.import_module('mongodb')
    # importlib.reload(m)

    # Fetch the database handle and pretty-print what current_op() reports
    db = mongodb.initialize_db(db_name=DB_NAME)
    current_ops = db.current_op()
    pprint(current_ops)

List indexes for each collection

In [None]:
# Always-false guard: the body is skipped; change the condition to list the indexes.
if False:
    # Pretty-print the index information of each collection, in the same order as before
    for collection in (updated_data, en_updated_data, nonen_updated_data):
        pprint(collection.index_information())