# Process Tweet Data
* Process raw tweet data collected from tweepy streaming api
* Data is a csv with **status** column encoded as a JSON string and additional columns recording search criteria
* LDA topic modeling via mallet on WWBP server - results in output folder

In [1]:
from collections import Counter
import datetime
import time

import pandas as pd
import math
import numpy as np
import scipy as sp

import re
import os
import sys
import json

pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',200)
pd.set_option('precision',4)

%matplotlib inline

## Program Parameters

In [2]:
# Paths
BASE_PATH = '/Users/arthurpelullo/Desktop/code/CDH/covid/twitter/'
DATA_PATH = BASE_PATH + 'data/'
RAW_DATA_PATH = DATA_PATH + 'raw_data/'
MASTER_DATA_PATH = DATA_PATH + 'master_data/'
SUMMARY_DATA_PATH = DATA_PATH + 'summary_data/'

In [3]:
# Credentials
consumer_key="REDACTED"
consumer_secret="REDACTED"
access_key="REDACTED"
access_secret="REDACTED"

# authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
# Constants
GENERAL_TERMS = ['COVID','COVID19','covid_19','COVIDー19','COVID2019','Covid19us','2019-nCoV','COVD19','CODVID19','Covid-19',
 'nCoV2019','covid2020','Covid19pandemic','covid19out','covidkindness','covidiot','covidiots','Knowcovid','SARSCoV2',
 'corona','coronavirus','coronovirus','coronaviruspandemic','Coronaeffect','coronavirususa','coronavirusoutbreak',
 'coronavirusupdate','Coronapocalypse','coronapocolypse','CoronaOutbreak','the \’rona','the roni','virus','Pandemic',
 'MyPandemicSurvivalPlan','PreventEpidemics','publichealth','flattenthecurve','Quarantine','Quarantinelife',
 'quarantineactivities','Quarentineandchill','QuarantineAndChill','SocialDistancing','socialdistancingnow',
 'selfisolating','HarmReduction','LockDownSA','lockdowneffect','lockdownextension','lockdowndiaries','Stayhome',
 'istayhome','istayathome','Stayathome','StayTheFHome','stayhomestaysafe','StayAtHomeSaveLives','stayhomesavelives',
 'iwillsurvivechallenge','StayAtHomeChallenge','ViewFromMyWindow','TogetherAtHome','Withme','Alonetogether',
 'inthistogether','untiltomorrow','ImDoingFineBecause','Staysafe','see10send10','seeapupsendapup','Safehands',
 'handwashing','Handwashing','washyourhands','workfromhome','Peoplehavetowork','essentialworkers',
 'ThanksHealthHeroes','healthcareheroes','GetMePPE','Mask','facemasks','Pdoh','Sdoh','hiap']
GENERAL_TERMS = [word.lower() for word in GENERAL_TERMS]

FOOD_TERMS = ['TooSmallToFail','SaveAmericanHospitality','SaveRestaurants','RestaurantRecovery',
              'ReliefForRestaurants','RallyForRestaurants','SupportLocalRestaurants','SupportLocal',
              'SupportLocalBusiness','TheGreatAmericanTakeout','CarryOut','OrderIn','CurbSide','CurbSidePickup',
              'DineLocal','StillOpen','WereOpen']
FOOD_TERMS = [word.lower() for word in FOOD_TERMS]

POLITICAL_TERMS = ['trumpownseverydeath','Trumpliedpeopledied','Trumpliesamericansdie','trumpliespeopledie',
                   'Wisconsinpandemicvoting','Trumpgenocide','trumppandemic','gopgenocide']
POLITICAL_TERMS = [word.lower() for word in POLITICAL_TERMS]

STIGMA_TERMS = ['HateIsAVirus','WashTheHate','RacismIsAVirus','IAmNotCOVID19']
STIGMA_TERMS = [word.lower() for word in STIGMA_TERMS]

CONSPIRACY_TERMS = ['filmyourhospital','filmyourhospitals','filmyourhospitalchallenge','emptyhospital','dempanic',
                    'plandemic','5gkills','5gconspiracy']
CONSPIRACY_TERMS = [word.lower() for word in CONSPIRACY_TERMS]

VACCINE_TERMS = ['vaccine','vaccines','vaccine','vaccines','vaccination','vaccinations','moderna','pfizer',
                 'CoronavirusVaccine','antivaxx','antivax','antivaccine','ProSafeVaccine','vaccin','CashingInOnCovid',
                 'MyBodyMyChoice','VaccineSafety','NoVaccines','Vax','NoVaccine','vaccineswork','vaccineinjury',
                 'vaccinefree','vaccineinjuryawareness','vaccinesharm','vaccinescauseadults','vaccineawareness',
                 'vaccinesdontcauseautism','vaccinehoax','DoctorsSpeakUp']
VACCINE_TERMS = [word.lower() for word in VACCINE_TERMS]

SEARCH_TERMS = GENERAL_TERMS+FOOD_TERMS+POLITICAL_TERMS+STIGMA_TERMS+CONSPIRACY_TERMS+VACCINE_TERMS
SEARCH_TERMS_LOWER = [word.lower() for word in SEARCH_TERMS]
SEARCH_STRING = '|'.join(SEARCH_TERMS).lower()

In [5]:
# Column definitions
place_cols = ['place_type','place_country','place_state','place_city','place_name',
              'bb_sw_lon','bb_sw_lat','bb_nw_lon','bb_nw_lat','bb_ne_lon','bb_ne_lat','bb_se_lon','bb_se_lat']
user_cols = ['screen_name','name','description','location','created_at','protected','verified','geo_enabled','default_profile',
'statuses_count','favourites_count','friends_count','followers_count','listed_count',
'url','profile_image_url_https','profile_background_image_url_https']

quote_cols = ['created_at','favorite_count','retweet_count','quote_count','reply_count']
quote_display_cols = ['text','created_at','favorite_count','retweet_count','quote_count','reply_count']

message_cols = ['created_at','favorite_count','retweet_count','quote_count','reply_count']
message_display_cols = ['user_id','place_id','quote_id','coordinates','text',
                        'created_at','favorite_count','retweet_count','quote_count','reply_count']

In [6]:
# Top level data entities (1 message:1 entity, 1 entity:N messages)
place_dict = dict()
user_dict = dict()
quote_dict = dict()
message_dict = dict()
# Message data (1 message:N entities, 1 entity:M messages)
hashtag_data = []
mention_data = []
keyword_data = []
# Aggregate data
hashtag_counter = Counter()

## Class and Function Definitions

### Helper Functions

In [7]:
# Find a tweet with data for the desired key
def get_example(data, query_key, shuffle=False):
    if shuffle:
        data = data.copy().sample(frac=1)
    
    for idx,row in data.iterrows():
        status = json.loads(row['status'])
        entity=None
        
        if type(query_key)==str:
            if query_key in status.keys() and status[query_key] not in [None,[]]:
                return status
        elif type(query_key)==list:
            if query_key[0] in status.keys() and status[query_key[0]] not in [None,[]]:
                entity=status[query_key[0]]
                for key in query_key[1:]:
                    if key in entity.keys() and entity[key] not in [None,[]]:
                        entity = entity[key]
                    else:
                        entity=None
                
        if entity!=None:
            return status
    
    
    print('Could not find example data!')
    return None

In [8]:
def get_values(data,query_key,return_keys=False):
    values = []
    for idx,row in data.iterrows():
        status = json.loads(row['status'])
        
        if type(query_key)==str:
            if query_key in status.keys() and status[query_key]!=None:
                if return_keys:
                    for item in status[query_key].keys():
                        values.append(item)
                else:
                    values.append(status[query_key])
        elif type(query_key)==list:
            if query_key[0] in status.keys() and status[query_key[0]]!=None:
                entity=status[query_key[0]]
                for key in query_key[1:]:
                    if key in entity.keys() and entity[key]!=None:
                        entity = entity[key]
                if type(entity)==list:
                    entity = tuple(entity)
                values.append(entity)
                
    return list(set(values))

In [9]:
def get_dict_items(data):

    atomic_keys = []
    composite_keys = []
    composite_dict = dict()
    second_level = []

    for idx,row in data.iterrows():
        status = json.loads(row['status'])
        keywords = row['keywords'].split(',')

        for key in status.keys():
            if type(status[key])==dict:
                composite_keys.append(key)
                entity = status[key]
                if key not in composite_dict.keys():
                    composite_dict[key] = list(entity.keys())
                else:
                    composite_dict[key] = list(set(composite_dict[key] + list(entity.keys())))
                for item in entity.keys():
                    if type(entity[item])==dict:
                        second_level.append((key,item))
            else:
                atomic_keys.append(key)

    atomic_keys = list(set(atomic_keys))
    composite_keys = list(set(composite_keys))
    second_level = list(set(second_level))
    
    return atomic_keys,composite_keys,composite_dict,second_level

### Processing Functions

In [10]:
def get_user_data(status):
    user_id = status['user']['id_str']
    if user_id not in user_dict.keys():
        # add user
        user_dict[user_id] = [status['user'][key] for key in user_cols]
    return user_id

In [11]:
def get_place_data(status):
    # check for place
    if 'place' in status.keys():
        place_id = status['place']['id']
        if place_id not in place_dict.keys():
            # add place
            place_state,place_city,place_name = '','',''
            place_type = status['place']['place_type']
            place_country = status['place']['country_code']
            if place_type == 'admin':
                place_state = status['place']['name']
            elif place_type == 'city':
                temp = [item.strip(' ') for item in status['place']['full_name'].split(',')]
                place_state = temp[1]
                place_city = temp[0]
            elif place_type == 'neighborhood':
                temp = [item.strip(' ') for item in status['place']['full_name'].split(',')]
                place_city = temp[1]
                place_name = temp[0]
            else:
                place_name = status['place']['full_name']
            # bounding box
            temp = status['place']['bounding_box']['coordinates'][0]
            bb_sw_lon,bb_sw_lat = temp[0][0],temp[0][1]
            bb_nw_lon,bb_nw_lat = temp[1][0],temp[1][1]
            bb_ne_lon,bb_ne_lat = temp[2][0],temp[2][1]
            bb_se_lon,bb_se_lat = temp[3][0],temp[3][1]
            # insert in dictionary
            place_dict[place_id] = [place_type,place_country,place_state,place_city,place_name,
                                    bb_sw_lon,bb_sw_lat,bb_nw_lon,bb_nw_lat,
                                    bb_ne_lon,bb_ne_lat,bb_se_lon,bb_se_lat]
    else:
        place_id = ''
        
    return place_id

In [12]:
def get_quote_data(status):
    # Check for quote tweet
    if 'quoted_status' in status.keys():
        quote = status['quoted_status']
        quote_id = quote['id_str']
        if quote_id not in quote_dict.keys():
            # add quote
            if 'extended_tweet' in quote.keys():
                text = quote['extended_tweet']['full_text']
            else:
                text = quote['text']
            quote_dict[quote_id] = [text] + [quote[key] for key in quote_cols]
            
    else:
        quote_id = ''
    
    return quote_id

In [13]:
def get_message_data(status,user_id,place_id,quote_id):
    message_id = status['id_str']
    if message_id not in message_dict.keys():
        # add message
        if 'extended_tweet' in status.keys():
            text = status['extended_tweet']['full_text']
        else:
            text = status['text']
            
        if status['coordinates'] == None:
            coordinates = np.nan
        else:
            coordinates = status['coordinates']['coordinates']
        message_dict[message_id] = [user_id,place_id,quote_id,coordinates,text] + [status[key] for key in message_cols]
    return message_id

In [14]:
def get_auxiliary_data(status,keywords):
    message_id = status['id_str']
    
    # Handle extended tweets
    if 'extended_tweet' in status.keys():
        hashtags = list(set(item['text'].lower() for item in status['extended_tweet']['entities']['hashtags']))
    else:
        hashtags = list(set(item['text'].lower() for item in status['entities']['hashtags']))
    # Process hashtags
    for item in hashtags:
        hashtag_data.append([message_id,item])
        hashtag_counter[item] += 1
    # process keywords
    for item in keywords:
        keyword_data.append([message_id,item])
    # process mentions
    # TODO
        
    return

In [15]:
def get_status_data(status,keywords):
    # User Data - add error check here and below
    user_id = get_user_data(status)
    
    # Place Data
    place_id = get_place_data(status)
    
    # Quote Data 
    quote_id = get_quote_data(status)
    
    # Message Data
    message_id = get_message_data(status,user_id,place_id,quote_id)
    
    # Auxiliary Message Data 
    get_auxiliary_data(status,keywords)
    
    return

In [16]:
def process_data(data):
    count = 0
    
    # Main loop
    for idx,row in data.iterrows():
        status = json.loads(row['status'])
        keywords = row['keywords'].split(',')
        
        # Process status - add error check here
        get_status_data(status,keywords)
          
    return

## Read Data

In [17]:
# Raw JSON data
data = pd.read_csv(RAW_DATA_PATH + 'raw_data_04202021.csv', names=['status','keywords'], usecols=[1,2])

In [18]:
print(len(data))
data.head(2)

310711


Unnamed: 0,status,keywords
0,"{""place"": {""url"": ""https://api.twitter.com/1.1...",corona
1,"{""place"": {""url"": ""https://api.twitter.com/1.1...",covid


## Main Program

In [19]:
process_data(data)

In [20]:
user_df = pd.DataFrame.from_dict(user_dict, orient='index', columns=user_cols)
print(len(user_df))
user_df.head(2)

43581


Unnamed: 0,screen_name,name,description,location,created_at,protected,verified,geo_enabled,default_profile,statuses_count,favourites_count,friends_count,followers_count,listed_count,url,profile_image_url_https,profile_background_image_url_https
879728956542472196,BobWitkowsky,BobCat,Disillusioned over the current political clima...,Pennsylvania,Tue Jun 27 15:51:42 +0000 2017,False,False,True,False,9496,6006,5118,5317,3,,https://pbs.twimg.com/profile_images/105944798...,https://abs.twimg.com/images/themes/theme1/bg.png
2239734750,SheilaShowPHL,Sheila Hess,CITY REPRESENTATIVE @phillymayor. @PhillyCityR...,Philadelphia,Tue Dec 10 20:35:25 +0000 2013,False,True,True,False,14166,35627,4690,4657,80,,https://pbs.twimg.com/profile_images/908894334...,https://abs.twimg.com/images/themes/theme14/bg...


In [21]:
place_df = pd.DataFrame.from_dict(place_dict, orient='index', columns=place_cols)
print(len(place_df))
place_df.head(2)

2378


Unnamed: 0,place_type,place_country,place_state,place_city,place_name,bb_sw_lon,bb_sw_lat,bb_nw_lon,bb_nw_lat,bb_ne_lon,bb_ne_lat,bb_se_lon,bb_se_lat
9b977bdde8553e88,city,US,PA,Horsham,,-75.1688,40.1569,-75.1688,40.2118,-75.1069,40.2118,-75.1069,40.1569
e4a0d228eb6be76b,city,US,PA,Philadelphia,,-75.2803,39.8718,-75.2803,40.1379,-74.9557,40.1379,-74.9557,39.8718


In [22]:
quote_df = pd.DataFrame.from_dict(quote_dict, orient='index', columns=quote_display_cols)
print(len(quote_df))
quote_df.head(2)

42254


Unnamed: 0,text,created_at,favorite_count,retweet_count,quote_count,reply_count
1245757284002627584,This is what the corona virus would look like ...,Thu Apr 02 16:57:36 +0000 2020,3183,430,47,296
1245799250602217473,Sending our love to everyone ❤️ #crushcovid ht...,Thu Apr 02 19:44:21 +0000 2020,4280,610,94,112


In [23]:
#### NEED TO REPULL DATA FOR ALL UNIQUE MESSAGE IDS TO GET UPDATED ENGAGEMENT COUNTS ####
message_df = pd.DataFrame.from_dict(message_dict, orient='index', columns=message_display_cols)
print(len(message_df))
message_df.head(2)

308631


Unnamed: 0,user_id,place_id,quote_id,coordinates,text,created_at,favorite_count,retweet_count,quote_count,reply_count
1245866248635727872,879728956542472196,9b977bdde8553e88,1245757284002627584,,The corona viruse fears catching the Kelly Ann...,Fri Apr 03 00:10:35 +0000 2020,0,0,0,0
1245866546926243840,2239734750,e4a0d228eb6be76b,1245799250602217473,,#CrushCovid you said it @bryceharper3! 🙌 And y...,Fri Apr 03 00:11:46 +0000 2020,0,0,0,0


In [24]:
# Geo data
print('Number tweets with geo data:',len(message_df))
print('Number tweets with precise coordinates:',len(message_df[message_df['coordinates'].notnull()]))

Number tweets with geo data: 308631
Number tweets with precise coordinates: 22693


In [25]:
# Hashtags
print('Total number of hashtags used:',len(hashtag_data))
print('Number of unique hashtags:',len(hashtag_counter))
print('Number of Tweets with hashtags:',len(set(item[0] for item in hashtag_data)))

Total number of hashtags used: 261579
Number of unique hashtags: 58581
Number of Tweets with hashtags: 84294


In [26]:
# Top hashtags
num=20
print('Top',num,'hashtags:')
hashtag_counter.most_common(num)

Top 20 hashtags:


[('covid19', 14638),
 ('coronavirus', 5292),
 ('traffic', 2857),
 ('covid', 2439),
 ('wearamask', 2044),
 ('trumpvirus', 1829),
 ('philadelphia', 1750),
 ('stayhome', 1728),
 ('covid_19', 1650),
 ('quarantine', 1509),
 ('socialdistancing', 1450),
 ('staysafe', 1372),
 ('quarantinelife', 1341),
 ('pdoh', 1255),
 ('philly', 1243),
 ('covidー19', 1187),
 ('sdoh', 1088),
 ('trump', 1060),
 ('pandemic', 1031),
 ('maskup', 878)]

### Update favorite and retweet counts

In [32]:
# Initialize parameters
CHUNK = 100
status_updates = []

In [34]:
# Get status updates
for i in range(len(status_updates),len(message_df),CHUNK):
    id_list = message_df.iloc[i:i+CHUNK,:].index.to_list()
    status_updates += api.statuses_lookup(id_list)
    time.sleep(2)
    if i%10000 == 0:
        print(i, 'statuses processed...')

0 statuses processed...
10000 statuses processed...
20000 statuses processed...
30000 statuses processed...
40000 statuses processed...
50000 statuses processed...
60000 statuses processed...
70000 statuses processed...
80000 statuses processed...
90000 statuses processed...
100000 statuses processed...
110000 statuses processed...
120000 statuses processed...
130000 statuses processed...
140000 statuses processed...
150000 statuses processed...
160000 statuses processed...
170000 statuses processed...
180000 statuses processed...
190000 statuses processed...
200000 statuses processed...
210000 statuses processed...
220000 statuses processed...
230000 statuses processed...
240000 statuses processed...
250000 statuses processed...
260000 statuses processed...
270000 statuses processed...
280000 statuses processed...
290000 statuses processed...
300000 statuses processed...


KeyboardInterrupt: 

In [42]:
# Get update counts and update message df
count = 0
for item in status_updates:
    key = item.id_str
    favorite = item.favorite_count
    retweet = item.retweet_count
    message_df.at[key,'favorite_count'] = favorite
    message_df.at[key,'retweet_count'] = retweet
    if count%10000 == 0:
        print(count, 'statuses processed...')
    count += 1

0 statuses processed...
10000 statuses processed...
20000 statuses processed...
30000 statuses processed...
40000 statuses processed...
50000 statuses processed...
60000 statuses processed...
70000 statuses processed...
80000 statuses processed...
90000 statuses processed...
100000 statuses processed...
110000 statuses processed...
120000 statuses processed...
130000 statuses processed...
140000 statuses processed...
150000 statuses processed...
160000 statuses processed...
170000 statuses processed...
180000 statuses processed...
190000 statuses processed...
200000 statuses processed...
210000 statuses processed...
220000 statuses processed...
230000 statuses processed...
240000 statuses processed...
250000 statuses processed...
260000 statuses processed...


#### Augment Keyword Data

In [44]:
keyword_dict = dict()
not_found = []

# Add keyword categories
for item in keyword_data:
    if item[1].lower() in GENERAL_TERMS:
        item.append('general')
    elif item[1].lower() in FOOD_TERMS:
        item.append('food')
    elif item[1].lower() in POLITICAL_TERMS:
        item.append('political')
    elif item[1].lower() in STIGMA_TERMS:
        item.append('stigma')
    elif item[1].lower() in CONSPIRACY_TERMS:
        item.append('conspiracy')
    elif item[1].lower() in VACCINE_TERMS:
        item.append('vaccine')
    else:
        item.append('none')
        not_found.append(item)

# Create keyword data structures
keyword_df = pd.DataFrame(keyword_data, columns=['status_id','keyword','category'])
for key,group in keyword_df.groupby(['status_id']):
    keyword_dict[key] = {'keywords':set(group.keyword),'categories':set(group.category)}
    
# Filtered keyword lists
keywords_general = [item for item in keyword_data if item[2] == 'general']
keywords_general_ids = set(item[0] for item in keywords_general)

In [45]:
print(len(set([tuple(item) for item in keyword_data])))
print(len(keyword_data))
print(len(keyword_df))
keyword_df.head()

442088
459006
459006


Unnamed: 0,status_id,keyword,category
0,1245866248635727872,corona,general
1,1245866546926243840,covid,general
2,1245866702316769281,corona,general
3,1245866951127093248,coronavirus,general
4,1245866951127093248,corona,general


#### Hashtag Data

In [46]:
hashtag_df = pd.DataFrame(hashtag_data, columns=['status_id','hashtag'])

In [47]:
print(len(set([tuple(item) for item in hashtag_data])))
print(len(hashtag_data))
print(len(hashtag_df))
hashtag_df.head()

258146
261579
261579


Unnamed: 0,status_id,hashtag
0,1245866546926243840,crushcovid
1,1245866546926243840,hungerrelief
2,1245866546926243840,thankyouphilly
3,1245866702316769281,coronapocalypse
4,1245866702316769281,day22


#### Get Top Tweets

In [48]:
# NOTE1: some keywords are pulling invalid tweets - for example "ppe" is pulling aPPEaling and haPPEning
    # need to fix PPE keyword - add space before - ' ppe' instead of 'ppe'
# NOTE2: repeat items are discarded for dict data but not for list data
    # for example, duplicate messages are not in message df but appear in hashtag df (see code below to fix)
# NOTE3: keyword and hashtag fixes and augmentations can be added to the processing functions above
    
# Filter data
use_ids = set(keyword_df[keyword_df['keyword']!='ppe'].status_id)
use_df = message_df[message_df.index.isin(use_ids)]

# Top 20 favorites
fav_name = SUMMARY_DATA_PATH + 'top20_favorite_filtered_' + str(datetime.datetime.now().date()).replace('-','') + '.csv'
fav_df = use_df.sort_values('favorite_count', ascending=False).iloc[0:20,:]
fav_df.to_csv(fav_name)

# Top 20 retweets
re_name = SUMMARY_DATA_PATH + 'top20_retweet_filtered_' + str(datetime.datetime.now().date()).replace('-','') + '.csv'
re_df = use_df.sort_values('retweet_count', ascending=False).iloc[0:20,:]
re_df.to_csv(re_name)

In [49]:
hashtag_df[hashtag_df['status_id'].isin(fav_df.index)]

Unnamed: 0,status_id,hashtag
176637,1314953231697616897,bidenwillcrushcovid


In [50]:
vaccine_ids = set(keyword_df[keyword_df['category']=='vaccine'].status_id)
vaccine_df = message_df[message_df.index.isin(vaccine_ids)]

In [51]:
print(len(vaccine_df))
vaccine_df.head()

16396


Unnamed: 0,user_id,place_id,quote_id,coordinates,text,created_at,favorite_count,retweet_count,quote_count,reply_count
1338608164183429131,16379909,7c6845d4f5897da3,1.3380740617385452e+18,,"Last sign in this: “No Science, No Shutdown” P...",Mon Dec 14 22:13:51 +0000 2020,1,0,0,0
1338608371679842305,2941293160,e4a0d228eb6be76b,,,Anybody worried about the virus should get the...,Mon Dec 14 22:14:40 +0000 2020,57,20,0,0
1338609203024367616,411627211,e4a0d228eb6be76b,,,Me after my vaccine enjoying a blacked out cig...,Mon Dec 14 22:17:58 +0000 2020,17,0,0,0
1338609357546803201,1186424808663388160,3f5897b87d2bf56c,,,@SenTedCruz sir there should be a resalution b...,Mon Dec 14 22:18:35 +0000 2020,0,0,0,0
1338611087244210176,1146608039442755584,28573a4eb6bb7ae3,,,@HillaryClinton Hillary...will you take the va...,Mon Dec 14 22:25:28 +0000 2020,0,0,0,0


In [52]:
print(len(message_df))
print(len(use_df))

308631
247430


In [53]:
# Save base data
user_df.to_csv(MASTER_DATA_PATH + 'users_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )
place_df.to_csv(MASTER_DATA_PATH + 'places_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )
quote_df.to_csv(MASTER_DATA_PATH + 'quotes_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )

message_df.to_csv(MASTER_DATA_PATH + 'messages_tristate_ppe_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )
use_df.to_csv(MASTER_DATA_PATH + 'messages_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )
vaccine_df.to_csv(MASTER_DATA_PATH + 'messages_tristate_vaccine_' + str(datetime.datetime.now().date()).replace('-','') + '.csv' )

In [54]:
# Save auxiliary data
keyword_df.to_csv(MASTER_DATA_PATH + 'keywords_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv')
hashtag_df.to_csv(MASTER_DATA_PATH + 'hashtags_tristate_' + str(datetime.datetime.now().date()).replace('-','') + '.csv')

# Misc.

## Gensim LDA Testing
* Use mallet on WWBP server

In [46]:
import gensim
from gensim.test.utils import common_corpus

In [47]:
lda = gensim.models.ldamodel.LdaModel(common_corpus, num_topics=10)

In [70]:
for i, row in enumerate(lda[common_corpus]):
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    print(i)
    print(row)
    print()
    for j, (topic_num, prop_topic) in enumerate(row):
        print(j)
        print((topic_num, prop_topic))
        wp = lda.show_topic(topic_num)
        break
    break

0
[(2, 0.774994), (5, 0.025002684), (4, 0.025001716), (9, 0.02500145), (0, 0.02500002), (1, 0.02500002), (3, 0.02500002), (7, 0.02500002), (6, 0.025000019), (8, 0.025000019)]

0
(2, 0.774994)


## kw-first vs bb-first filtering

In [16]:
atomic_keys,composite_keys,composite_dict,second_level = get_dict_items(data)

In [40]:
# Get examples
key = ['quoted_status']
value = None
found=False
count=0
while(not found):
    status = get_example(data=data,query_key=key,shuffle=True)
    if status != None:
        found=True
    elif count >= 5:
        found=True
        print('Search failed!')
    count +=1
status==None

False

In [35]:
status['geo']['coordinates']

[39.74808776, -75.21749677]

In [36]:
status['coordinates']['coordinates']

[-75.21749677, 39.74808776]

In [42]:
status['quoted_status']['extended_tweet'].keys()

dict_keys(['display_text_range', 'entities', 'full_text', 'extended_entities'])

In [46]:
status['quoted_status']['extended_tweet']['entities'].keys()

dict_keys(['media', 'symbols', 'hashtags', 'user_mentions', 'urls'])

In [44]:
status['quoted_status']['entities'].keys()

dict_keys(['symbols', 'hashtags', 'user_mentions', 'urls'])

In [38]:
# Get possible values
values = get_values(data,'source',return_keys=False)
sorted(values)

['<a href="http://foursquare.com" rel="nofollow">Foursquare</a>',
 '<a href="http://instagram.com" rel="nofollow">Instagram</a>',
 '<a href="http://itunes.apple.com/us/app/twitter/id409789998?mt=12" rel="nofollow">Twitter for Mac</a>',
 '<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>',
 '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',
 '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 '<a href="http://ubersocial.com" rel="nofollow">UberSocial for Android</a>',
 '<a href="http://www.echofon.com/" rel="nofollow">Echofon</a>',
 '<a href="http://www.recruitology.com" rel="nofollow">Recruitology</a>',
 '<a href="http://www.squarespace.com" rel="nofollow">Squarespace</a>',
 '<a href="http://www.tweetcaster.com" rel="nofollow">TweetCaste

In [None]:
# Regarding kw-first filtering and retweets:
# identify location/coordinate source for RT's (coordinates, place info, user defined loc: possibly only user-defined)
# if loc/coord info, determne if in bb
# else, determine if posted by user from bb (via bb-first filtering)

## Unused / Deprecated Code

#### Notes:
* geo = lat/lon, coordinates = lon/lat, identical otherwise
    * (on map) right-->less negative, up-->more positive
* place:
    * bb coordinate order: sw,nw,ne,se
    * place types = 'admin', 'city', 'neighborhood', 'poi'
        * admin: state
        * city: city,state
        * neighborhood: name,city
        * poi: name

In [None]:
"""

# Get update counts
for item in status_updates:
    key = item.id_str
    favorite = item.favorite_count
    retweet = item.retweet_count
    results[key] = {'favorite_count':favorite,'retweet_count':retweet}

# update message_df
for key in results:
    message_df.loc[key,'favorite_count'] = results[key]['favorite_count']
    message_df.loc[key,'retweet_count'] = results[key]['retweet_count']

"""