# Identify user's affiliation with IBM 
Author: Daheng Wang  
Last modified: 2017-05-26

# Roadmap
1. Build collections of user information
2. Identify affiliation based on the 'description' field of user
3. Check results
4. Pickle results into local files

# Steps

In [1]:
"""
Initialization
"""

'''
Data analysis modules: pandas, matplotlib, numpy, and etc.
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

'''
Standard modules, MongoDB modules
'''
import os, sys, json, datetime, pickle
from pprint import pprint

import pymongo
from pymongo import IndexModel, ASCENDING, DESCENDING

'''
Custom tool modules
'''
import mongodb  # module for setting up connection with (local) MongoDB database
import multiprocessing_workers  # module for splitting workloads between processes
import utilities  # module for various custom utility functions
from config import * # import all global configuration variables

NB_NAME = '20170503-user_affiliation'

## Build collections of user information

In [4]:
%%time
"""
Build a new collection for unique users of all tweets (based on TW_RAW_COL)
Register in config:
    USER_RAW_COL = 'user_raw'
"""
if 1 == 1:
    '''
    Load in user ids list of all tweets from pickle
    '''
    total_unique_user_ids_lst = []
    
    print('Loading user ids list...')
    with open (USER_RAW_IDS_LST_PKL, 'rb') as f:
        total_unique_user_ids_lst = pickle.load(f)
    print('Number of total users: {}'.format(len(total_unique_user_ids_lst)))
    
    '''
    Extract user information and write to new collection
    '''
    
    db = mongodb.initialize_db(DB_NAME)
    print('Querying and updating database...')
    for unique_user_id in total_unique_user_ids_lst:
        doc = db[TW_RAW_COL].find_one(filter={'user.id': unique_user_id},
                                      projection={'_id': 0, 'user': 1},
                                      sort=[('_id', pymongo.DESCENDING)]) # return the latest user information
        user_dict = doc['user']
        db[USER_RAW_COL].insert_one(user_dict) # insert into user collection
    
    user_count = db[USER_RAW_COL].count()
    print('Collection size: {}'.format(user_count))

Loading user ids list...
Number of total users: 1469738
MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Querying and updating database...
Collection size: 1469738
CPU times: user 23min 31s, sys: 1min 58s, total: 25min 30s
Wall time: 5h 30min 50s


In [2]:
%%time
"""
Build a new collection for unique users of native tweets (based on TW_NT_COL)
Register in config:
    USER_NT_COL = 'user_nt'
"""
if 0 == 1:
    '''
    Load in user ids list of native tweets from pickle
    '''
    nt_unique_user_ids_lst = []
    
    print('Loading user ids list...')
    with open (USER_NT_IDS_LST_PKL, 'rb') as f:
        nt_unique_user_ids_lst = pickle.load(f)
    print('Number of unique users: {}'.format(len(nt_unique_user_ids_lst)))
    
    '''
    Extract user information and write to new collection
    '''
    db = mongodb.initialize_db(DB_NAME)
    print('Querying and updating database...')
    for nt_user_id in nt_unique_user_ids_lst:
        doc = db[TW_NT_COL].find_one(filter={'user.id': nt_user_id},
                                      projection={'_id': 0, 'user': 1},
                                      sort=[('_id', pymongo.DESCENDING)]) # return the latest user information
        user_dict = doc['user']
        db[USER_NT_COL].insert_one(user_dict) # insert into user collection
    
    user_count = db[USER_NT_COL].count()
    print('Collection size: {}'.format(user_count))

Loading user ids list...
Number of unique users: 609799
MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Querying and updating database...
Collection size: 609799
CPU times: user 8min 30s, sys: 46.4 s, total: 9min 16s
Wall time: 31min 20s


In [2]:
%%time
"""
Build a new collection for unique users of retweets (based on TW_RT_COL)
Register in config:
    USER_RT_COL = 'user_rt'
"""
if 1 == 1:
    '''
    Load in user ids list of native tweets from pickle
    '''
    rt_unique_user_ids_lst = []
    
    print('Loading user ids list...')
    with open (USER_RT_IDS_LST_PKL, 'rb') as f:
        rt_unique_user_ids_lst = pickle.load(f)
    print('Number of unique users: {}'.format(len(rt_unique_user_ids_lst)))
    
    '''
    Extract user information and write to new collection
    '''
    db = mongodb.initialize_db(DB_NAME)
    print('Querying and updating database...')
    for rt_user_id in rt_unique_user_ids_lst:
        doc = db[TW_RT_COL].find_one(filter={'user.id': rt_user_id},
                                      projection={'_id': 0, 'user': 1},
                                      sort=[('_id', pymongo.DESCENDING)]) # return the latest user information
        user_dict = doc['user']
        db[USER_RT_COL].insert_one(user_dict) # insert into user collection
    
    user_count = db[USER_RT_COL].count()
    print('Collection size: {}'.format(user_count))

Loading user ids list...
Number of unique users: 1036781
MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Querying and updating database...
Collection size: 1036781
CPU times: user 15min 30s, sys: 1min 11s, total: 16min 41s
Wall time: 1h 54min 50s


## Identify affiliation based on the 'description' field on user

**Idea**: if the keyword 'ibm' exists in a user's 'description' field, we say the user is _directly affiliated_ with IBM.  
**Problems**:
 - not all users have 'description' field filled
 - cannot distinguish between different types of user (individuals, official accounts, media outlet...)

In [2]:
%%time
"""
Build a new collection for the binary results of whether keyword 'ibm' appears in users' 'description' field.
Register in config:
    USER_NT_DESC_TAG_COL = 'user_nt_desc_tag'
"""
if 1 == 1:
    '''
    For each user in the "USER_NT_COL" collection, extract the "id", "followers_count", and "description" field.
    '''
    db = mongodb.initialize_db(db_name=DB_NAME)
    user_nt_col = db[USER_NT_COL]
    cursor = user_nt_col.find(projection={'_id': 0, 'id': 1, 'followers_count': 1, 'description': 1}, # minimize I/O bandwith
                              sort=[('_id', pymongo.ASCENDING)])
    
    
    '''
    For each extracted user, tag whether keyword "ibm" exists in its "description" field.
    Store the bool results in a new field "X_desc_tag".
    '''
    keyword = 'ibm' # the keyword indicates affiliation with IBM
    data_lst = []
    
    print("Processing 'description' field of users...")
    for doc in cursor:
        data_lst.append(doc)
        current_user = data_lst[-1] # get the current user (doc)
        # add a new field 'X_desc_tag' for bool result of whether the keyword exists in 'description' field
        current_user['X_desc_tag'] = utilities.simple_test_keyword_in_text(text=current_user['description'], keyword=keyword)
    print('Processed users: {}'.format(len(data_lst)))
    
    '''
    Output into new collection USER_NT_DESC_TAG_COL
    '''
    print('Creating new collection: "{}"...'.format(USER_NT_DESC_TAG_COL))
    if USER_NT_DESC_TAG_COL in db.collection_names():
        print('\tAlready exist! Dropping...')
        db[USER_NT_DESC_TAG_COL].drop()
    
    user_nt_desc_tag_col = db[USER_NT_DESC_TAG_COL]
    print('Inserting into "{}"...'.format(USER_NT_DESC_TAG_COL))
    user_nt_desc_tag_col.insert_many(data_lst)
    
    count = user_nt_desc_tag_col.count()
    print('New collection size: {}'.format(count))

MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Processing 'description' field of users...
Processed users: 609799
Creating new collection: "user_nt_desc_tag"...
Inserting into "user_nt_desc_tag"...
New collection size: 609799
CPU times: user 28.9 s, sys: 1.71 s, total: 30.6 s
Wall time: 42.3 s


## Check results

In [3]:
db = mongodb.initialize_db(db_name=DB_NAME)
user_nt_col = db[USER_NT_COL]
user_nt_desc_tag_col = db[USER_NT_DESC_TAG_COL]

MongoDB on localhost:27017/tweets_ek-2 connected successfully!


In [4]:
'''
Number of unqiue users of native tweets
'''
user_nt_num = user_nt_col.count()
print('Unique users of native tweets: {}'.format(user_nt_num))

Unique users of native tweets: 609799


In [9]:
'''
Number of unqiue users of native tweets with non-empty "description" field
'''
user_nt_nonem_desc_num = user_nt_col.count(filter={'description': {'$ne': ''}})
print('Unique users of native tweets with non-empty "description" field: {} ({:.2%})'.format(user_nt_num, (user_nt_nonem_desc_num / user_nt_num)))

Unique users of native tweets with non-empty "description" field: 609799 (100.00%)


In [10]:
'''
Number of IBM users
'''
user_nt_ibm_desc_num = user_nt_desc_tag_col.count(filter={'X_desc_tag': {'$eq': True}})
print('IBM users: {} ({:.2%} out of total, {:.2%} out of nonempty desc field users)'.format(user_nt_ibm_desc_num, 
                                                                                            (user_nt_ibm_desc_num / user_nt_num),
                                                                                            (user_nt_ibm_desc_num / user_nt_nonem_desc_num)))

IBM users: 6271 (1.03% out of total, 1.03% out of nonempty desc field users)


## Pickle results into local files

In [3]:
"""
Make two new global pickle files: lists if IBM/non-IBM user ids
Register in config:
    USER_NT_IBM_DESC_IDS_LST_PKL
    USER_NT_NONIBM_DESC_IDS_LST_PKL
"""
if 1 == 1:
    '''
    pickle ids list of IBM users
    '''
    db = mongodb.initialize_db(db_name=DB_NAME)
    user_nt_desc_tag_col = db[USER_NT_DESC_TAG_COL]
    cursor = user_nt_desc_tag_col.find(filter={'X_desc_tag': {'$eq': True}}, projection={'_id': 0, 'id': 1})
    
    print('Building ids list of IBM users...')
    user_nt_ibm_desc_ids_lst = []
    for doc in cursor:
        user_nt_ibm_desc_id = int(doc['id'])
        user_nt_ibm_desc_ids_lst.append(user_nt_ibm_desc_id)
    print('List length: {}'.format(len(user_nt_ibm_desc_ids_lst)))
    
    print('Making pickle: {}'.format(USER_NT_IBM_DESC_IDS_LST_PKL))
    with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'wb') as f:
        pickle.dump(user_nt_ibm_desc_ids_lst, f)
    print('Done')

if 1 == 1:
    '''
    pickle ids list of non-IBM users
    '''
    db = mongodb.initialize_db(db_name=DB_NAME)
    user_nt_desc_tag_col = db[USER_NT_DESC_TAG_COL]
    cursor = user_nt_desc_tag_col.find(filter={'X_desc_tag': {'$eq': False}}, projection={'_id': 0, 'id': 1})
    
    print('Building ids list of non-IBM users...')
    user_nt_nonibm_desc_ids_lst = []
    for doc in cursor:
        user_nt_nonibm_desc_id = int(doc['id'])
        user_nt_nonibm_desc_ids_lst.append(user_nt_nonibm_desc_id)
    print('List length: {}'.format(len(user_nt_nonibm_desc_ids_lst)))  
    
    print('Making pickle: {}'.format(USER_NT_NONIBM_DESC_IDS_LST_PKL))
    with open(USER_NT_NONIBM_DESC_IDS_LST_PKL, 'wb') as f:
        pickle.dump(user_nt_nonibm_desc_ids_lst, f)
    print('Done')

MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Building ids list of IBM users...
List length: 6271
Making pickle: ./data/user_nt_ibm_desc_ids.lst.pkl
Done
MongoDB on localhost:27017/tweets_ek-2 connected successfully!
Building ids list of non-IBM users...
List length: 603528
Making pickle: ./data/user_nt_nonibm_desc_ids.lst.pkl
Done


## Check whehter accounts @Natasha_D_G and @jameskobielus are included.

One special task, we check two accounts for two known IBM affiliated individual.
1. @Natasha_D_G (Natasha Bishop): https://twitter.com/Natasha_D_G?lang=en
2. @jameskobielus (James Kobielus): https://twitter.com/jameskobielus?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor

In [6]:
"""
Check if these two individuals exist in database
"""
if 0 == 1:
    user_nt_col = mongodb.initialize(db_name=DB_NAME, collection_name=USER_NT_COL)
    
    screen_name_1 = 'Natasha_D_G'
    doc = user_nt_col.find_one(filter={'screen_name': screen_name_1})
    if doc:
        print('User found: {}'.format(screen_name_1))
        pprint(doc)
    else:
        print('User not found: {}'.format(screen_name_1))
        
    screen_name_2 = 'jameskobielus'
    doc = user_nt_col.find_one(filter={'screen_name': screen_name_2})
    if doc:
        print('User found: {}'.format(screen_name_2))
        pprint(doc)
    else:
        print('User not found: {}'.format(screen_name_2))

MongoDB on localhost:27017/tweets_ek-2.user_nt connected successfully!
User found: Natasha_D_G
{'_id': ObjectId('592725f0fe57a1210e4ade3a'),
 'contributors_enabled': False,
 'created_at': 'Tue May 12 02:37:34 +0000 2009',
 'default_profile': False,
 'default_profile_image': False,
 'description': 'Digital #Marketing & Public Sector Lead for #IBM. Top 100 for '
                '#Bigdata & #IoT  http://ibm.co/1PgNsAk Tweets = mine & any '
                'and all #sports get me excited!',
 'favourites_count': 7048,
 'follow_request_sent': None,
 'followers_count': 4904,
 'following': None,
 'friends_count': 3569,
 'geo_enabled': False,
 'id': 39413322,
 'id_str': '39413322',
 'is_translator': False,
 'lang': 'en',
 'listed_count': 1154,
 'location': 'U.S.A.',
 'name': 'Natasha Bishop',
 'notifications': None,
 'profile_background_color': '709397',
 'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/862350354/02bd13aad1c0532aefcb818ec943a6ca.jpeg',
 'profile_b

We see that both individuals were captured and exist in our database.
1. @Natasha_D_G (Natasha Bishop). 'id': 39413322
2. @jameskobielus (James Kobielus): 'id': 14072398

Besides, we also see that @Natasha_D_G has keyword 'ibm' in her description while @jameskobielus does not. We expect to see that @Natasha_D_G appear in our identified IBM-user set here.

In [8]:
if 1 == 1:
    id_1 = 39413322
    id_2 = 14072398
    
    user_nt_ibm_desc_ids_lst = []
    with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'rb') as f:
        user_nt_ibm_desc_ids_lst = pickle.load(f)
        
    print('User {} exists in IBM-user set? {}'.format(screen_name_1, id_1 in set(user_nt_ibm_desc_ids_lst)))
    print('User {} exists in IBM-user set? {}'.format(screen_name_2, id_2 in set(user_nt_ibm_desc_ids_lst)))

User Natasha_D_G exists in IBM-user set? True
User jameskobielus exists in IBM-user set? False


# Notes