In [1]:
# Data analysis modules: pandas, matplotlib, numpy, and etc.
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # render double resolution plot output for Retina screens 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Standard modules, MongoDB modules
import os, sys, json, datetime, pickle, multiprocessing, logging
from pprint import pprint

import pymongo
from pymongo import IndexModel, ASCENDING, DESCENDING

# Custom tool modules
import mongodb  # module for setting up connection with (local) MongoDB database
import multiprocessing_workers  # module for splitting workloads between processes
import utilities  # module for various custom utility functions
from config import * # import all global configuration variables

In [2]:
# read in method_1, method_2 results
m1_ibm_user_ids_lst = []
m2_ibm_user_ids_lst = []

if not m1_ibm_user_ids_lst:
    with open(M1_IBM_USER_IDS_PKL, 'rb') as f:
        m1_ibm_user_ids_lst = pickle.load(f)

if not m2_ibm_user_ids_lst:
    with open(M2_IBM_USER_IDS_PKL, 'rb') as f:
        m2_ibm_user_ids_lst = pickle.load(f)

NameError: name 'M1_IBM_USER_IDS_PKL' is not defined

In [3]:
# query only retweet tweets
match_dict = {'$match': {'quoted_status': {'$exists': True}}}

limit_dict = {'$limit': 100}

project_dict = {'$project': {'_id': 0,
                             'id': 1,
                             'retweet_count': 1,
                             'quoted_status.retweet_count': 1}}

# group_dict = {'$group': {'_id': '$user.id', # user's id
#                          'retweet_tweets_num': {'$sum': 1}, # total number of retweet tweets belongs to the user
#                          'retweet_tweets_total_retweet_count': {'$sum': '$retweet_count'} # sum of retweet counts of all retweet tweets belong to the user
#                          }}
# project_dict = {'$project': {'_id': 0,
#                             'user_id': '$_id',
#                             'retweet_tweets_num': 1,
#                             'retweet_tweets_total_retweet_count': 1}}

ppl_lst = [match_dict, project_dict, limit_dict]

updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)
cursor = updated_col.aggregate(pipeline=ppl_lst,
                               allowDiskUse=True) # Exceeded memory limit for $group, but didn't allow external sort. Pass allowDiskUse:true to opt in.
for doc in cursor:
    print(doc)

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
{'id': 838632147929923584, 'retweet_count': 0, 'quoted_status': {'retweet_count': 56}}
{'id': 838632255127891968, 'retweet_count': 0, 'quoted_status': {'retweet_count': 12}}
{'id': 838632334823985154, 'retweet_count': 0, 'quoted_status': {'retweet_count': 0}}
{'id': 838632128946503680, 'retweet_count': 3, 'quoted_status': {'retweet_count': 20}}
{'id': 838632575518277633, 'retweet_count': 1, 'quoted_status': {'retweet_count': 168}}
{'id': 838632878292492288, 'retweet_count': 26, 'quoted_status': {'retweet_count': 113}}
{'id': 838632847166550016, 'retweet_count': 0, 'quoted_status': {'retweet_count': 0}}
{'id': 838633114448592896, 'retweet_count': 0, 'quoted_status': {'retweet_count': 1}}
{'id': 838633132370890752, 'retweet_count': 0, 'quoted_status': {'retweet_count': 18}}
{'id': 838633585733165056, 'retweet_count': 0, 'quoted_status': {'retweet_count': 2}}
{'id': 838633769766510592, 'retweet_count': 0, 'quoted_status': {'r

In [2]:
# check how many tweets are 'retweet tweets'
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

total = updated_col.count() # total number of tweets in database
count = updated_col.count(filter={'retweeted_status': {'$exists': True}})
print('{} ({:.2%}) tweets are "retweet tweets"'.format(count, (count / total)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
2465388 (48.88%) tweets are "retweet tweets"


In [4]:
# check how many 'retweet tweets' unique
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

unique_retweet_tweet_id_set = set()

cursor = updated_col.find(filter={'retweeted_status': {'$exists': True}},
                          projection={'_id': 0, 'retweeted_status.id': 1})

for doc in cursor:
    unique_retweet_tweet_id = doc['retweeted_status']['id']
    unique_retweet_tweet_id_set.add(unique_retweet_tweet_id)
print('{} unique "retweet tweets"'.format(len(unique_retweet_tweet_id_set)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
555635 unique "retweet tweets"


In [5]:
# double check how many 'retweet tweets' unique
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

unique_retweet_tweet_id_lst = []

unique_retweet_tweet_id_lst = updated_col.distinct('retweeted_status.id',
                                                   filter={'retweeted_status': {'$exists': True}})

print('{} unique "retweet tweets"'.format(len(unique_retweet_tweet_id_lst)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
555635 unique "retweet tweets"


In [6]:
# check how many authors of 'retweet tweets'
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

retweet_tweet_user_id_set = set()

cursor = updated_col.find(filter={'retweeted_status': {'$exists': True}},
                          projection={'_id': 0, 'retweeted_status.user.id': 1})

for doc in cursor:
    retweet_tweet_user_id = doc['retweeted_status']['user']['id']
    retweet_tweet_user_id_set.add(retweet_tweet_user_id)
print('{} authors of "retweet tweets"'.format(len(retweet_tweet_user_id_set)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
143949 authors of "retweet tweets"


In [7]:
# double check how many 'retweet tweets' unique
updated_col = mongodb.initialize(db_name=DB_NAME, collection_name=UPDATED_COL)

retweet_tweet_user_id_lst = []

retweet_tweet_user_id_lst = updated_col.distinct('retweeted_status.user.id',
                                                 filter={'retweeted_status': {'$exists': True}})

print('{} unique "retweet tweets"'.format(len(retweet_tweet_user_id_lst)))

MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
143949 unique "retweet tweets"


In [3]:
"""
Check the correctness of update 'retweet_count' field manually in tweets_ek-2:tw_nt collection by comparing its distribution against
the tweets_ek:c2 collection
"""

import pandas as pd
import mongodb


'''
Extract all the "retweet_count" of native tweets in tweets_ek:c2 collection
'''
old_retweet_count_lst = []
print('Building old retweet_count list...')

tweets_ek_c2 = mongodb.initialize(db_name='tweets_ek', collection_name='c2')

cursor = tweets_ek_c2.find(filter={'retweeted_status': {'$exists': False}}, # native tweets only
                           projection={'_id': 0, 'retweet_count': 1})

for doc in cursor:
    retweet_count = int(doc['retweet_count'])
    old_retweet_count_lst.append(retweet_count)
print('List length: {}'.format(len(old_retweet_count_lst)))

'''
Extract all the "retweet_count" of native tweets in tweets_ek-2:tw_nt
'''
new_retweet_count_lst = []
print('Building new retweet_count list...')

tweets_ek_2_tw_nt = mongodb.initialize(db_name='tweets_ek-2', collection_name='tw_nt')

cursor = tweets_ek_2_tw_nt.find(projection={'_id': 0, 'retweet_count': 1}) # all tweets in this collection are native tweets
 
for doc in cursor:
    retweet_count = int(doc['retweet_count'])
    new_retweet_count_lst.append(retweet_count)
print('List length: {}'.format(len(new_retweet_count_lst)))

s1 = pd.Series(data=old_retweet_count_lst)
s2 = pd.Series(data=new_retweet_count_lst)

Building old retweet_count list...
MongoDB on localhost:27017/tweets_ek.c2 connected successfully!
List length: 2578199
Building new retweet_count list...
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
List length: 5812824


In [4]:
s1.describe()

count    2.578199e+06
mean     9.551819e-01
std      2.017118e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.749600e+04
dtype: float64

In [5]:
s2.describe()

count    5.812824e+06
mean     9.637095e-01
std      1.541557e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.845800e+04
dtype: float64

In [6]:
tmp_col = mongodb.initialize(DB_NAME, USER_NT_COL)
total = tmp_col.count()
doc = tmp_col.count(filter={'description': {'$ne': None}})
print(total)
print(doc)

MongoDB on localhost:27017/tweets_ek-2.user_nt connected successfully!
609799
527938


In [2]:
'''
Read in simple_influence pickle
'''
df = pd.read_pickle(SIMPLE_INFLUENCE_PKL)

# clean and preparations
df = df.fillna(0)

df = df.replace(to_replace=-1, value=0) # one data error with 'fo' = -1

'''
Load lists of IBM users
'''
user_nt_ibm_desc_ids_lst = []
with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'rb') as f:
    user_nt_ibm_desc_ids_lst = pickle.load(f)

user_nt_ibm_desc_ids_set = set(user_nt_ibm_desc_ids_lst)
user_nt_ibm_desc_cond = df['uid'].isin(user_nt_ibm_desc_ids_set)
df_user_nt_ibm_desc = df[user_nt_ibm_desc_cond]

In [3]:
df_user_nt_ibm_desc.describe()

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
count,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0,6271.0
mean,5.613556e+16,1575.794291,12.924573,19.817573,1.156594,1.162972,0.234093,0.373784,11.566417,18.290384
std,2.025977e+17,12024.192435,44.55551,243.465643,7.475342,15.70417,6.039138,16.889678,42.561823,240.657298
min,11426.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,65575860.0,107.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,389068300.0,280.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0
75%,2351117000.0,780.5,8.0,5.0,1.0,0.0,0.0,0.0,7.0,4.0
max,8.65201e+17,576741.0,1330.0,13985.0,435.0,1034.0,453.0,1179.0,1330.0,13954.0


In [18]:
df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 5000].sort_values(by='fo', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
109424,47336979,576741,1,9,0.0,0.0,0.0,0.0,1.0,9.0
45344,17880018,440270,1,4,1.0,4.0,0.0,0.0,0.0,0.0
51822,18994444,347515,149,8474,3.0,23.0,15.0,5.0,133.0,8447.0
83117,29735775,164355,305,13985,1.0,31.0,1.0,0.0,303.0,13954.0
36865,16528347,152753,101,439,0.0,0.0,0.0,0.0,101.0,439.0
239675,267283568,147126,347,4501,5.0,26.0,2.0,14.0,340.0,4461.0
233799,253663760,135541,27,87,3.0,8.0,1.0,0.0,23.0,79.0
284383,408898240,131881,68,1438,30.0,558.0,34.0,625.0,11.0,284.0
35482,16319797,120759,57,1085,3.0,47.0,3.0,5.0,51.0,1033.0
211690,201846344,111972,127,2875,5.0,63.0,1.0,0.0,121.0,2812.0


In [5]:
df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 0].sort_values(by='fo', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,q_n,q_src,p_n,p_src,nr_n,nr_src
109424,47336979,576741,1,9,0.0,0.0,0.0,0.0,1.0,9.0
45344,17880018,440270,1,4,1.0,4.0,0.0,0.0,0.0,0.0
51822,18994444,347515,149,8474,3.0,23.0,15.0,5.0,133.0,8447.0
83117,29735775,164355,305,13985,1.0,31.0,1.0,0.0,303.0,13954.0
36865,16528347,152753,101,439,0.0,0.0,0.0,0.0,101.0,439.0
239675,267283568,147126,347,4501,5.0,26.0,2.0,14.0,340.0,4461.0
233799,253663760,135541,27,87,3.0,8.0,1.0,0.0,23.0,79.0
284383,408898240,131881,68,1438,30.0,558.0,34.0,625.0,11.0,284.0
35482,16319797,120759,57,1085,3.0,47.0,3.0,5.0,51.0,1033.0
211690,201846344,111972,127,2875,5.0,63.0,1.0,0.0,121.0,2812.0


In [8]:
target_user_ids_lst = list(df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 0]['uid'])

In [9]:
target_user_ids_lst.remove(47336979)
target_user_ids_lst.remove(17880018)

In [12]:
with open(os.path.join('.', 'target_uids.txt'), 'w') as f:
    for target_user_id in target_user_ids_lst:
        f.write(str(target_user_id) + '\n')

In [5]:
user_nt_col = mongodb.initialize(DB_NAME, USER_NT_COL)
# protected account 1: 392814683
# protected account 2: 400175272
doc = user_nt_col.find_one(filter={'id': 400175272})
pprint(doc)

MongoDB on localhost:27017/tweets_ek-2.user_nt connected successfully!
{'_id': ObjectId('59272627fe57a1210e4b524c'),
 'contributors_enabled': False,
 'created_at': 'Fri Oct 28 16:25:19 +0000 2011',
 'default_profile': False,
 'default_profile_image': False,
 'description': 'Views expressed are personal; Software Engineer at IBM; '
                'Previously Graduate student at IIIT Hyderabad in Data Mining, '
                'NLP, Machine Learning',
 'favourites_count': 440,
 'follow_request_sent': None,
 'followers_count': 153,
 'following': None,
 'friends_count': 558,
 'geo_enabled': True,
 'id': 400175272,
 'id_str': '400175272',
 'is_translator': False,
 'lang': 'en',
 'listed_count': 3,
 'location': 'India',
 'name': 'Ayushi Dalmia',
 'notifications': None,
 'profile_background_color': '9AE4E8',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme16/bg.gif',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme16/bg.gif',
 'profi

In [13]:
"""
Move 500 queried uid from dir1 to dir2
"""
import os, glob, shutil

if 1 == 1:
    '''
    Followers information of M1 IBM users queried from Twitter API are stored in 2 separate folders
    '''
    followers_uids_dir1 = '/home/dwang8/Documents/workspace/ibm_tweets_projecct-collector-4/follower_ids'
    followers_objs_dir1 = '/home/dwang8/Documents/workspace/ibm_tweets_projecct-collector-4/follower_objs'
    
    followers_uids_dir2 = '/home/dwang8/Documents/workspace-2/ibm_tweets_projecct-collector-4/follower_ids'
    followers_objs_dir2 = '/home/dwang8/Documents/workspace-2/ibm_tweets_projecct-collector-4/follower_objs'
    
    '''
    Read in list of queried uids from dir1
    '''
    queried_uids_lst = []
    for txt_path in sorted(glob.glob(os.path.join(followers_uids_dir1, '*.txt'))):
        txt_name = txt_path.split('/')[-1]
        queried_uid = int(txt_name.split('.')[0])
        queried_uids_lst.append(queried_uid)
    
    '''
    Remove already hydrated user ids from list of queried user ids
    '''
    cleaned_queried_uids = queried_uids_lst
    removed_count = 0
    for json_path in sorted(glob.glob(os.path.join(followers_objs_dir1, '*.json'))):
        json_name = json_path.split('/')[-1]
        hydrated_uid = int(json_name.split('.')[0])
        '''
        Avoid ValueError if user id of file name not in cleaned_queried_uids
        '''
        if hydrated_uid in cleaned_queried_uids:
            cleaned_queried_uids.remove(hydrated_uid)
            removed_count += 1
    print('Clear {} queried user ids from list'.format(removed_count))
    print('Remain un-hydrated user ids: {}'.format(len(cleaned_queried_uids)))
    
#     '''
#     Copy 500 un-hydrated user ids from dir1 to dir2
#     '''
#     move_queried_uids = cleaned_queried_uids[-500:]
#     print('Move un-hydrated user ids from dir1 to dir2: {}'.format(len(move_queried_uids)))
#     for move_queried_uid in move_queried_uids:
#         print('Handling {}'.format(move_queried_uid))
#         '''
#         Copy follower_ids file from dir1 to dir2
#         '''
#         source_file = os.path.join(followers_uids_dir1, '{}.txt'.format(move_queried_uid))
#         target_file = os.path.join(followers_uids_dir2, '{}.txt'.format(move_queried_uid))
#         shutil.copyfile(source_file, target_file)
        
#         '''
#         Rename follower_ids file in dir1
#         '''
#         os.rename(source_file, os.path.join(followers_uids_dir1, '{}.txt.moved'.format(move_queried_uid)))
#     print('Done')

Clear 2109 queried user ids from list
Remain un-hydrated user ids: 440


In [6]:
if 1 == 1:
    col = mongodb.initialize(db_name=DB_NAME, collection_name=TW_RT_IBM_TW_COL)
    lst = col.distinct('user.id')
    print('{}'.format(len(lst)))
    
    with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'rb') as f:
        m1_ibm_ids_lst = pickle.load(f)
    print('m1 ibm users: {}'.format(len(m1_ibm_ids_lst)))
    
    inter = set(lst).intersection(set(m1_ibm_ids_lst))
    print(len(inter))

MongoDB on localhost:27017/tweets_ek-2.tw_rt_ibm_tw connected successfully!
36028
m1 ibm users: 6271
3181


In [6]:
if 1 == 1:
    df = pd.read_pickle(SIMPLE_INFLUENCE_PKL)
    df = df.fillna(0)
    
    '''
    Load lists of IBM users
    '''
    user_nt_ibm_desc_ids_lst = []
    with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'rb') as f:
        user_nt_ibm_desc_ids_lst = pickle.load(f)

    user_nt_ibm_tw_prop_2_ids_lst = []
    with open(USER_NT_IBM_TW_PROP_2_IDS_LST_PKL, 'rb') as f:
        user_nt_ibm_tw_prop_2_ids_lst = pickle.load(f)     
    
    user_nt_ibm_desc_ids_set = set(user_nt_ibm_desc_ids_lst)
    user_nt_ibm_tw_prop_2_ids_set = set(user_nt_ibm_tw_prop_2_ids_lst)
    
    '''
    build dfs
    '''
    user_nt_ibm_desc_cond = df['uid'].isin(user_nt_ibm_desc_ids_set)
    df_user_nt_ibm_desc = df[user_nt_ibm_desc_cond]
    df_user_nt_nonibm_desc = df[~ user_nt_ibm_desc_cond]
    
    user_nt_ibm_tw_prop_2_cond = df['uid'].isin(user_nt_ibm_tw_prop_2_ids_set)
    df_user_nt_ibm_tw_prop_2 = df[user_nt_ibm_tw_prop_2_cond]
    df_user_nt_nonibm_tw_prop_2 = df[~ user_nt_ibm_tw_prop_2_cond]

    print('Done')

Done


In [7]:
if 1 == 1:
    """
    Simple influence of M1
    """
    m1_ibm_norm_s = df_user_nt_ibm_desc['fo'] * df_user_nt_ibm_desc['n_n']
    m1_ibm_norm_s = m1_ibm_norm_s.replace(to_replace=0, value=1) # clean 0 values since its denominator
    m1_ibm_simple_influence = df_user_nt_ibm_desc['n_src'] / m1_ibm_norm_s
    df_user_nt_ibm_desc = df_user_nt_ibm_desc.assign(simple_inf=m1_ibm_simple_influence)
    
    """
    Compound influence of M1
    """
    m1_ibm_arc = df_user_nt_ibm_desc['n_src'] / df_user_nt_ibm_desc['n_n']
    m1_ibm_cleaned_fo = df_user_nt_ibm_desc['fo'].replace(to_replace=0, value=1)
    m1_ibm_compound_influence = m1_ibm_arc * np.log(m1_ibm_cleaned_fo)
    df_user_nt_ibm_desc = df_user_nt_ibm_desc.assign(compound_inf=m1_ibm_compound_influence)
    
    """
    Compute average retweet count for each M1 IBM user
    """
    m1_ibm_avg_rt = df_user_nt_ibm_desc['n_src'] / df_user_nt_ibm_desc['n_n']
    df_user_nt_ibm_desc = df_user_nt_ibm_desc.assign(avg_rt=m1_ibm_avg_rt)
    
    """
    Simple influence of M2
    """
    m2_ibm_norm_s = df_user_nt_ibm_tw_prop_2['fo'] * df_user_nt_ibm_tw_prop_2['n_n']
    m2_ibm_norm_s = m2_ibm_norm_s.replace(to_replace=0, value=1) # clean 0 values since its denominator
    m2_ibm_simple_influence = df_user_nt_ibm_tw_prop_2['n_src'] / m2_ibm_norm_s
    df_user_nt_ibm_tw_prop_2 = df_user_nt_ibm_tw_prop_2.assign(simple_inf=m2_ibm_simple_influence)
    
    """
    Compound influence of M2
    """
    m2_ibm_arc = df_user_nt_ibm_tw_prop_2['n_src'] / df_user_nt_ibm_tw_prop_2['n_n']
    m2_ibm_cleaned_fo = df_user_nt_ibm_tw_prop_2['fo'].replace(to_replace=0, value=1)
    m2_ibm_compound_influence = m2_ibm_arc * np.log(m2_ibm_cleaned_fo)
    df_user_nt_ibm_tw_prop_2 = df_user_nt_ibm_tw_prop_2.assign(compound_inf=m2_ibm_compound_influence)
    
    """
    Compute average retweet count for each M2 IBM user
    """
    m2_ibm_avg_rt = df_user_nt_ibm_tw_prop_2['n_src'] / df_user_nt_ibm_tw_prop_2['n_n']
    df_user_nt_ibm_tw_prop_2 = df_user_nt_ibm_tw_prop_2.assign(avg_rt=m2_ibm_avg_rt)
    
    print('Done')

Done


In [8]:
"""
M1 simple inf top incluencers
"""
if 1 == 1:
    df_m1_simple_inf_top = df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 0]
    df_m1_simple_inf_top = df_m1_simple_inf_top[['uid', 'fo', 'n_n', 'n_src', 'simple_inf']]

In [9]:
df_m1_simple_inf_top.sort_values(by='simple_inf', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,simple_inf
557692,795512917831524352,1,1,2,2.000000
608895,862358912921731074,1,2,3,1.500000
533216,744606437092036609,3,1,4,1.333333
579306,830014520324128769,3,4,8,0.666667
595581,847209334136299521,2,3,3,0.500000
296637,468611376,4,1,2,0.500000
547516,776052044578848768,2,1,1,0.500000
525795,728731567800094721,12,6,32,0.444444
506274,4780720626,6,2,4,0.333333
497036,4157525007,11,4,14,0.318182


In [6]:
"""
M2 simple inf top incluencers
"""
if 1 == 1:
    df_m2_simple_inf_top = df_user_nt_ibm_tw_prop_2[df_user_nt_ibm_tw_prop_2['fo'] > 0]
    df_m2_simple_inf_top = df_m2_simple_inf_top[['uid', 'fo', 'n_n', 'n_src', 'simple_inf']]

In [7]:
df_m2_simple_inf_top.sort_values(by='simple_inf', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,simple_inf
590821,842444159332028416,1,20,60,3.000000
607366,859154934159245313,1,7,12,1.714286
492995,3928442897,4,13,77,1.480769
608612,861618662184189952,2,5,14,1.400000
565006,808846575522807808,4,38,154,1.013158
594474,846124178805850112,1,15,15,1.000000
416308,2450374874,1,7,7,1.000000
552726,786121114950901760,16,8,106,0.828125
579306,830014520324128769,3,4,8,0.666667
608638,861668884184694784,2,3,4,0.666667


In [9]:
"""
M1 compound inf top incluencers
"""
if 1 == 1:
    df_m1_compound_inf_top = df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 0]
    df_m1_compound_inf_top = df_m1_compound_inf_top[['uid', 'fo', 'n_n', 'n_src', 'simple_inf', 'compound_inf', 'avg_rt']]

In [10]:
df_m1_compound_inf_top.sort_values(by='compound_inf', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,simple_inf,compound_inf,avg_rt
475,51063,5331,11,1034,0.017633,806.641647,94.000000
51822,18994444,347515,149,8474,0.000164,725.611166,56.872483
21071,14474054,999,1,81,0.081081,559.447137,81.000000
83117,29735775,164355,305,13985,0.000279,550.678129,45.852459
48723,18431747,3585,1,54,0.015063,441.963743,54.000000
318629,587488658,3451,1,37,0.010722,301.417515,37.000000
13705,11957802,11345,2,57,0.002512,266.091173,28.500000
211690,201846344,111972,127,2875,0.000202,263.187101,22.637795
284383,408898240,131881,68,1438,0.000160,249.316534,21.147059
18244,14216447,728,1,35,0.048077,230.660537,35.000000


In [12]:
"""
M2 compound inf top incluencers
"""
if 1 == 1:
    df_m2_compound_inf_top = df_user_nt_ibm_tw_prop_2[df_user_nt_ibm_tw_prop_2['fo'] > 0]
    df_m2_compound_inf_top = df_m2_compound_inf_top[['uid', 'fo', 'n_n', 'n_src', 'simple_inf', 'compound_inf', 'avg_rt']]

In [13]:
df_m2_compound_inf_top.sort_values(by='compound_inf', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,simple_inf,compound_inf,avg_rt
12948,11348282,22401049,5,2095,0.000019,7091.415087,419.000000
4122,3857121,10301,4,1930,0.046840,4458.298194,482.500000
158771,101584084,239490,27,4254,0.000658,1951.525169,157.555556
63214,21558596,126608,79,9940,0.000994,1478.273148,125.822785
507988,4833878343,565,5,1078,0.381593,1366.219628,215.600000
34364,16145086,215555,181,19222,0.000493,1304.225590,106.198895
66074,22330739,592562,5,482,0.000163,1281.369120,96.400000
28711,15357143,25574,34,4109,0.004726,1226.576561,120.852941
233976,254107028,18334,11,1071,0.005311,955.771357,97.363636
543562,767960339547774976,25601,3,282,0.003672,954.136349,94.000000


In [5]:
"""
M1 simple vs compound inf 
"""
if 1 == 1:
    df_m1_simple_compound_inf = df_user_nt_ibm_desc[df_user_nt_ibm_desc['fo'] > 0]
    df_m1_simple_compound_inf = df_m1_simple_compound_inf[['uid', 'fo', 'n_n', 'n_src', 'simple_inf', 'compound_inf', 'avg_rt']]

NameError: name 'df_user_nt_ibm_desc' is not defined

In [15]:
df_m1_simple_compound_inf.sort_values(by='avg_rt', ascending=False)

Unnamed: 0,uid,fo,n_n,n_src,simple_inf,compound_inf,avg_rt
475,51063,5331,11,1034,0.017633,806.641647,94.000000
21071,14474054,999,1,81,0.081081,559.447137,81.000000
51822,18994444,347515,149,8474,0.000164,725.611166,56.872483
48723,18431747,3585,1,54,0.015063,441.963743,54.000000
83117,29735775,164355,305,13985,0.000279,550.678129,45.852459
318629,587488658,3451,1,37,0.010722,301.417515,37.000000
18244,14216447,728,1,35,0.048077,230.660537,35.000000
13705,11957802,11345,2,57,0.002512,266.091173,28.500000
71697,24227230,11518,9,217,0.002093,225.479065,24.111111
57560,20144015,276,1,23,0.083333,129.269220,23.000000


In [3]:
col = mongodb.initialize(DB_NAME, TW_NT_COL)
# uid_int = 18994444
uid_int = 1899444
count = col.count(filter={'user.id': uid_int})
print('Num of native tweets: {}'.format(count))

MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
Num of native tweets: 0


In [6]:
"""
Count quote tweets posted by M1 IBM users
"""
if 1 == 1:
    '''
    Load in M1 IBM users
    '''
    user_nt_ibm_desc_ids_lst = []
    with open(USER_NT_IBM_DESC_IDS_LST_PKL, 'rb') as f:
        user_nt_ibm_desc_ids_lst = pickle.load(f)
    
    '''
    Count frequency of quote tweets authored by M1 IBM users
    '''
    lst_arg = np.array(user_nt_ibm_desc_ids_lst, dtype=np.int64).tolist() # IMPORTANT force int64 type
    
    tw_nt_qt_col = mongodb.initialize(DB_NAME, TW_NT_QT_COL)
    
    #match_dict = {'$match': {'user.id': {'$in': lst_arg}}} # select out retweets authored by IBM users
    obj = tw_nt_qt_col.count(filter={'user.id': {'$in': lst_arg}})
    print(obj)

MongoDB on localhost:27017/tweets_ek-2.tw_nt_qt connected successfully!
7253


In [1]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import re

In [2]:
%%time
"""
Test sentiment score by using TextBlob library
"""

test_str = "6 @Google #ML use cases (and more coming)\n#GoogleNext17 #GoogleNext17AR https://t.co/d1yUtbYcTF"
test_str2 = "@antoniopironti anything you like? https://t.co/40HuYrnG1b"

def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
    
lst_str = ['RT @JohnGGalt: Amazing—after years of attacking Donald Trump the media managed to turn #InaugurationDay into all about themselves.',
          'RT @vooda1: CNN Declines to Air White House Press Conference Live YES! ',
          'Another #DearDonaldTrump Letter worth a read @AJEnglish ',
          'RT @RVAwonk: Trump, Sean Spicer, et al. lie for a reason. ',
          'RT @KomptonMusic: Me: I hate corn ',
          'Me: https://t.co/GPgy8R8HB5',
          "6 @Google #ML use cases (and more coming)\n#GoogleNext17 #GoogleNext17AR https://t.co/d1yUtbYcTF"]

#print(clean_tweet(test_str))
for sentence in lst_str:
    stm = TextBlob(sentence, analyzer=NaiveBayesAnalyzer()).sentiment
    print(stm)

Sentiment(classification='pos', p_pos=0.653716605984609, p_neg=0.346283394015392)
Sentiment(classification='pos', p_pos=0.5052503891079454, p_neg=0.49474961089205277)
Sentiment(classification='neg', p_pos=0.4894199584282709, p_neg=0.5105800415717289)
Sentiment(classification='neg', p_pos=0.14358381483298796, p_neg=0.856416185167013)
Sentiment(classification='neg', p_pos=0.4604190919674041, p_neg=0.5395809080325963)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.6247019743259888, p_neg=0.37529802567401127)
CPU times: user 53.4 s, sys: 1.66 s, total: 55.1 s
Wall time: 57 s
