In [1]:
# Data analysis modules: pandas, matplotlib, numpy, etc.
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Standard modules, MongoDB modules
import os, sys, json, datetime, pickle, multiprocessing, logging
from pprint import pprint

import pymongo
from pymongo import IndexModel, ASCENDING, DESCENDING

# Custom tool modules
import mongodb  # module for setting up connection with (local) MongoDB database
import multiprocessing_workers  # module for splitting workloads between processes
import utilities  # module for various custom utility functions
from config import * # import all global configuration variables

In [2]:
# Load the pickled user-id lists produced by method_1 and method_2.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were written by this project.
m1_ibm_user_ids_lst, m2_ibm_user_ids_lst = [], []

# Both lists were reset just above, so each emptiness check passes and the
# corresponding pickle is read on every run of this cell.
if len(m1_ibm_user_ids_lst) == 0:
    with open(M1_IBM_USER_IDS_PKL, mode='rb') as m1_pkl:
        m1_ibm_user_ids_lst = pickle.load(m1_pkl)

if len(m2_ibm_user_ids_lst) == 0:
    with open(M2_IBM_USER_IDS_PKL, mode='rb') as m2_pkl:
        m2_ibm_user_ids_lst = pickle.load(m2_pkl)

In [3]:
# Sample tweets that carry a 'quoted_status' sub-document and print each
# tweet's own 'retweet_count' next to the 'retweet_count' stored inside its
# 'quoted_status'. (The filter is on 'quoted_status', not 'retweeted_status'.)
match_dict = {'$match': {'quoted_status': {'$exists': True}}}

# A small sample is enough for eyeballing the values.
limit_dict = {'$limit': 100}

project_dict = {'$project': {'_id': 0,
                             'id': 1,
                             'retweet_count': 1,
                             'quoted_status.retweet_count': 1}}

# Alternative per-user aggregation, currently disabled:
# group_dict = {'$group': {'_id': '$user.id', # user's id
#                          'retweet_tweets_num': {'$sum': 1}, # total number of retweet tweets belongs to the user
#                          'retweet_tweets_total_retweet_count': {'$sum': '$retweet_count'} # sum of retweet counts of all retweet tweets belong to the user
#                          }}
# project_dict = {'$project': {'_id': 0,
#                             'user_id': '$_id',
#                             'retweet_tweets_num': 1,
#                             'retweet_tweets_total_retweet_count': 1}}

# $limit runs before $project so that only the sampled documents get reshaped.
# $project maps documents one-to-one, so the printed result is the same as
# with the stages in the opposite order.
ppl_lst = [match_dict, limit_dict, project_dict]

updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

# allowDiskUse lets memory-hungry stages spill to disk. It is presumably a
# leftover from the disabled $group variant, which failed with: "Exceeded
# memory limit for $group, but didn't allow external sort. Pass
# allowDiskUse:true to opt in."
cursor = updated_col.aggregate(pipeline=ppl_lst,
                               allowDiskUse=True)
for doc in cursor:
    print(doc)

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
{'id': 838632147929923584, 'retweet_count': 0, 'quoted_status': {'retweet_count': 56}}
{'id': 838632255127891968, 'retweet_count': 0, 'quoted_status': {'retweet_count': 12}}
{'id': 838632334823985154, 'retweet_count': 0, 'quoted_status': {'retweet_count': 0}}
{'id': 838632128946503680, 'retweet_count': 3, 'quoted_status': {'retweet_count': 20}}
{'id': 838632575518277633, 'retweet_count': 1, 'quoted_status': {'retweet_count': 168}}
{'id': 838632878292492288, 'retweet_count': 26, 'quoted_status': {'retweet_count': 113}}
{'id': 838632847166550016, 'retweet_count': 0, 'quoted_status': {'retweet_count': 0}}
{'id': 838633114448592896, 'retweet_count': 0, 'quoted_status': {'retweet_count': 1}}
{'id': 838633132370890752, 'retweet_count': 0, 'quoted_status': {'retweet_count': 18}}
{'id': 838633585733165056, 'retweet_count': 0, 'quoted_status': {'retweet_count': 2}}
{'id': 838633769766510592, 'retweet_count': 0, 'quoted_status': {'r

In [2]:
# Report how many documents in the collection carry a 'retweeted_status'
# field, both as an absolute number and as a share of all documents.
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed
# in 4.0 (replacement: count_documents / estimated_document_count). It is kept
# here so the cell still runs on the driver version this notebook was written
# against -- confirm the installed version before migrating.
total = updated_col.count() # total number of tweets in database
count = updated_col.count(filter={'retweeted_status': {'$exists': True}})

# An empty collection would otherwise raise ZeroDivisionError.
retweet_share = (count / total) if total else 0.0
print('{} ({:.2%}) tweets are "retweet tweets"'.format(count, retweet_share))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
2465388 (48.88%) tweets are "retweet tweets"


In [4]:
# Count the distinct values found under 'retweeted_status.id' by streaming
# the matching documents and de-duplicating client-side.
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

# Only the nested id is projected, so each returned document stays tiny.
cursor = updated_col.find(
    filter={'retweeted_status': {'$exists': True}},
    projection={'_id': 0, 'retweeted_status.id': 1},
)

unique_retweet_tweet_id_set = {tweet['retweeted_status']['id'] for tweet in cursor}
print('{} unique "retweet tweets"'.format(len(unique_retweet_tweet_id_set)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
555635 unique "retweet tweets"


In [5]:
# Cross-check the number of distinct 'retweeted_status.id' values, this time
# letting the server de-duplicate through distinct().
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

unique_retweet_tweet_id_lst = updated_col.distinct(
    'retweeted_status.id',
    filter={'retweeted_status': {'$exists': True}},
)

print('{} unique "retweet tweets"'.format(len(unique_retweet_tweet_id_lst)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
555635 unique "retweet tweets"


In [6]:
# Count the distinct values found under 'retweeted_status.user.id' by
# streaming the matching documents and de-duplicating client-side.
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

# Only the nested user id is projected to keep the transferred documents small.
cursor = updated_col.find(
    filter={'retweeted_status': {'$exists': True}},
    projection={'_id': 0, 'retweeted_status.user.id': 1},
)

retweet_tweet_user_id_set = {tweet['retweeted_status']['user']['id'] for tweet in cursor}
print('{} authors of "retweet tweets"'.format(len(retweet_tweet_user_id_set)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
143949 authors of "retweet tweets"


In [7]:
# double check how many authors of 'retweet tweets' there are, this time
# letting the server de-duplicate through distinct()
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

# distinct() returns the de-duplicated 'retweeted_status.user.id' values.
retweet_tweet_user_id_lst = updated_col.distinct('retweeted_status.user.id',
                                                 filter={'retweeted_status': {'$exists': True}})

# This cell counts authors, so the message says so. It previously printed
# 'unique "retweet tweets"', copied from the unique-tweet check.
print('{} authors of "retweet tweets"'.format(len(retweet_tweet_user_id_lst)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
143949 unique "retweet tweets"


In [3]:
"""
Manually check that the 'retweet_count' field in the tweets_ek-2:tw_nt collection was updated correctly by comparing
its distribution against that of the tweets_ek:c2 collection
"""

import pandas as pd
import mongodb


def _extract_retweet_counts(collection, query_filter=None):
    """Return the 'retweet_count' of every matching document as a list of ints.

    Args:
        collection: collection object exposing a pymongo-style ``find``.
        query_filter: filter passed to ``find``; ``None`` selects every document.

    Returns:
        list of int, one entry per document returned by the query.

    Raises:
        KeyError: if a returned document has no 'retweet_count' field.
    """
    # Project only the field that is needed so the cursor stays lightweight.
    cursor = collection.find(filter=query_filter,
                             projection={'_id': 0, 'retweet_count': 1})
    return [int(doc['retweet_count']) for doc in cursor]


# Extract all the "retweet_count" of native tweets in tweets_ek:c2 collection
print('Building old retweet_count list...')
tweets_ek_c2 = mongodb.initialize(db_name='tweets_ek', collection_name='c2')
old_retweet_count_lst = _extract_retweet_counts(
    tweets_ek_c2,
    {'retweeted_status': {'$exists': False}},  # native tweets only
)
print('List length: {}'.format(len(old_retweet_count_lst)))

# Extract all the "retweet_count" of native tweets in tweets_ek-2:tw_nt
print('Building new retweet_count list...')
tweets_ek_2_tw_nt = mongodb.initialize(db_name='tweets_ek-2', collection_name='tw_nt')
# No filter: all tweets in this collection are native tweets.
new_retweet_count_lst = _extract_retweet_counts(tweets_ek_2_tw_nt)
print('List length: {}'.format(len(new_retweet_count_lst)))

# One Series per collection so the two distributions can be compared.
s1 = pd.Series(data=old_retweet_count_lst)
s2 = pd.Series(data=new_retweet_count_lst)

Building old retweet_count list...
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
List length: 2578199
Building new retweet_count list...
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
List length: 5812824


In [4]:
# Summary statistics of s1, the Series built from old_retweet_count_lst
# (retweet_count values read from tweets_ek:c2).
s1.describe()

count    2.578199e+06
mean     9.551819e-01
std      2.017118e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.749600e+04
dtype: float64

In [5]:
# Summary statistics of s2, the Series built from new_retweet_count_lst
# (retweet_count values read from tweets_ek-2:tw_nt).
s2.describe()

count    5.812824e+06
mean     9.637095e-01
std      1.541557e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.845800e+04
dtype: float64

In [6]:
# Print the size of the user collection, followed by the number of documents
# matching {'description': {'$ne': None}}.
tmp_col = mongodb.initialize(db_name=DB_NAME, collection_name=USER_NT_COL)
total = tmp_col.count()
doc = tmp_col.count(filter={'description': {'$ne': None}})  # a count, despite the name
print(total, doc, sep='\n')

MongoDB on localhost:27017/tweets_ek-2.user_nt connected successfully!
609799
527938


In [3]:
# Read in the simple_influence pickle and clean it in one chain: missing
# values become 0, and every -1 becomes 0 as well.
# NOTE(review): replace() is applied to the whole frame, although the only
# known -1 is a single bad 'fo' value -- confirm that no other column can
# legitimately contain -1.
df = (
    pd.read_pickle(SIMPLE_INFLUENCE_PKL)
    .fillna(0)
    .replace(to_replace=-1, value=0)  # one data error with 'fo' = -1
)

# Load the pickled list of IBM user ids.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were written by this project.
with open(USER_NT_IBM_DESC_IDS_LST_PKL, mode='rb') as f:
    user_nt_ibm_desc_ids_lst = pickle.load(f)

# A set makes the membership test inside isin() cheap.
user_nt_ibm_desc_ids_set = set(user_nt_ibm_desc_ids_lst)
user_nt_ibm_desc_cond = df['uid'].isin(user_nt_ibm_desc_ids_set)

# Keep only the rows whose 'uid' is one of the loaded ids.
df_user_nt_ibm_desc = df.loc[user_nt_ibm_desc_cond]

In [4]:
# Summary statistics with decile rows (10% ... 90%).
# The percentiles are derived from integers so that each one is the exact
# float for k/10. np.arange(0.1, 1, 0.1) accumulates rounding error
# (0.30000000000000004), which surfaces as a stray '30.0%' row label.
df_user_nt_ibm_desc.describe(percentiles=np.arange(1, 10) / 10)

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
count,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0
mean,5.613556e+16,1575.794291,12.924573,19.817573,1.156594,1.162972,0.234093,0.373784,11.566417,18.290384
std,2.025977e+17,12024.192435,44.55551,243.465643,7.475342,15.70417,6.039138,16.889678,42.561823,240.657298
min,11426.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10%,18724360.0,37.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20%,42178930.0,80.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
30.0%,103631400.0,133.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40%,237353600.0,197.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
50%,389068300.0,280.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0
60%,612407700.0,399.0,4.0,2.0,0.0,0.0,0.0,0.0,3.0,2.0


In [18]:
# Rows of df_user_nt_ibm_desc with 'fo' above 5000, largest 'fo' first.
# NOTE(review): 'fo' is presumably the follower count -- confirm.
(
    df_user_nt_ibm_desc
    .loc[lambda users: users['fo'] > 5000]
    .sort_values(by='fo', ascending=False)
)

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
109424,47336979,576741,1,9,0.0,0.0,0.0,0.0,1.0,9.0
45344,17880018,440270,1,4,1.0,4.0,0.0,0.0,0.0,0.0
51822,18994444,347515,149,8474,3.0,23.0,15.0,5.0,133.0,8447.0
83117,29735775,164355,305,13985,1.0,31.0,1.0,0.0,303.0,13954.0
36865,16528347,152753,101,439,0.0,0.0,0.0,0.0,101.0,439.0
239675,267283568,147126,347,4501,5.0,26.0,2.0,14.0,340.0,4461.0
233799,253663760,135541,27,87,3.0,8.0,1.0,0.0,23.0,79.0
284383,408898240,131881,68,1438,30.0,558.0,34.0,625.0,11.0,284.0
35482,16319797,120759,57,1085,3.0,47.0,3.0,5.0,51.0,1033.0
211690,201846344,111972,127,2875,5.0,63.0,1.0,0.0,121.0,2812.0


In [5]:
# Rows of df_user_nt_ibm_desc with a positive 'fo', largest 'fo' first.
(
    df_user_nt_ibm_desc
    .loc[lambda users: users['fo'] > 0]
    .sort_values(by='fo', ascending=False)
)

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
109424,47336979,576741,1,9,0.0,0.0,0.0,0.0,1.0,9.0
45344,17880018,440270,1,4,1.0,4.0,0.0,0.0,0.0,0.0
51822,18994444,347515,149,8474,3.0,23.0,15.0,5.0,133.0,8447.0
83117,29735775,164355,305,13985,1.0,31.0,1.0,0.0,303.0,13954.0
36865,16528347,152753,101,439,0.0,0.0,0.0,0.0,101.0,439.0
239675,267283568,147126,347,4501,5.0,26.0,2.0,14.0,340.0,4461.0
233799,253663760,135541,27,87,3.0,8.0,1.0,0.0,23.0,79.0
284383,408898240,131881,68,1438,30.0,558.0,34.0,625.0,11.0,284.0
35482,16319797,120759,57,1085,3.0,47.0,3.0,5.0,51.0,1033.0
211690,201846344,111972,127,2875,5.0,63.0,1.0,0.0,121.0,2812.0


In [8]:
# Collect the 'uid' of every row in df_user_nt_ibm_desc whose 'fo' is positive.
target_user_ids_lst = list(
    df_user_nt_ibm_desc.loc[df_user_nt_ibm_desc['fo'] > 0, 'uid']
)

In [9]:
# Drop two specific accounts from the target list.
# NOTE(review): the reason for excluding these ids is not recorded in this
# notebook -- TODO document it.
excluded_user_ids = {47336979, 17880018}

# Filtering keeps the cell re-runnable: list.remove() raises ValueError on a
# second run because the ids are already gone. Unlike remove(), this drops
# every occurrence of an excluded id and ignores ids that are absent.
# Slice assignment mutates the existing list object, as .remove() did.
target_user_ids_lst[:] = [
    user_id for user_id in target_user_ids_lst
    if user_id not in excluded_user_ids
]

In [12]:
# Dump the target user ids to ./target_uids.txt, one id per line.
with open(os.path.join('.', 'target_uids.txt'), 'w') as uid_file:
    uid_file.writelines(str(user_id) + '\n' for user_id in target_user_ids_lst)

In [5]:
# Fetch a single user document by its numeric 'id' and pretty-print it.
# Ids noted as protected accounts:
#   1: 392814683
#   2: 400175272
user_nt_col = mongodb.initialize(db_name=DB_NAME, collection_name=USER_NT_COL)
doc = user_nt_col.find_one({'id': 400175272})
pprint(doc)

MongoDB on localhost:27017/tweets_ek-2.user_nt connected successfully!
{'_id': ObjectId('59272627fe57a1210e4b524c'),
 'contributors_enabled': False,
 'created_at': 'Fri Oct 28 16:25:19 +0000 2011',
 'default_profile': False,
 'default_profile_image': False,
 'description': 'Views expressed are personal; Software Engineer at IBM; '
                'Previously Graduate student at IIIT Hyderabad in Data Mining, '
                'NLP, Machine Learning',
 'favourites_count': 440,
 'follow_request_sent': None,
 'followers_count': 153,
 'following': None,
 'friends_count': 558,
 'geo_enabled': True,
 'id': 400175272,
 'id_str': '400175272',
 'is_translator': False,
 'lang': 'en',
 'listed_count': 3,
 'location': 'India',
 'name': 'Ayushi Dalmia',
 'notifications': None,
 'profile_background_color': '9AE4E8',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme16/bg.gif',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme16/bg.gif',
 'profi