In [1]:
import os
import sys
import lzma
import ujson as json
import numpy as np
import pandas as pd
import multiprocessing as mp
from timeit import default_timer as timer

In [2]:
######################## Parameters #############################
# Version tag appended to every output file name.
version_tweets = 'v4'

# Selection strategy; exactly one of the two assignments below is active.
#how = 'by-geocoordinates'
how = 'by-account-location'

# For testing
# years = ['2016']
# months = ['12']
# Four-digit years 2012..2018 and zero-padded months 01..12, as strings,
# because they are compared against fields cut out of the input file names.
years = [str(year_number) for year_number in range(2012,2019)]
months = ['{:02d}'.format(month_number) for month_number in range(1,13)]

# Self-reported account locations to keep (exact string match).
locations = ['São Paulo']
locations.sort()
#############################################################

In [3]:
# Create Array of Input Files
def get_input_files(year=None,month=None):
    """Return the xz-compressed tweet files to process, in random order.

    The input directory is chosen from the current working directory (cluster
    checkout or local test checkout); from any other working directory the
    program is terminated through ``sys.exit``.

    File names are expected to look like ``tweets.json.2016-12-23.xz``: once
    dots are turned into dashes and the name is split on dashes, field 2 is
    the year and field 3 the month. Files whose name is too short to carry the
    requested field are filtered out instead of raising ``IndexError``.

    Parameters
    ----------
    year : str, optional
        Keep only files whose year field equals this string (e.g. ``'2016'``).
    month : str, optional
        Keep only files whose month field equals this string (e.g. ``'12'``).

    Returns
    -------
    list
        Input directory + file name for every selected file, shuffled with
        ``np.random.permutation``.
    """
    working_directory = os.getcwd()

    # Tweets Stored on Cluster
    if working_directory == '/home/sfraiberger/py':
        path_to_input_files  = '/net/twitter/gardenhose-data/json/'

    # For testing
    elif working_directory == '/Users/samuel.fraiberger/Dropbox/Work/Projects/twitter/ipynb':
        path_to_input_files  = '../data/decahose/json/'

    else:
        sys.exit('Incorrect Working Directory... Exiting.')

    def name_field(file_name, position):
        # Field `position` of the dot/dash separated file name, or None when
        # the name has too few fields (e.g. a stray 'notes.xz').
        fields = file_name.replace('.','-').split('-')
        return fields[position] if len(fields) > position else None

    # Select all files with .xz extensions
    input_files = [file for file in os.listdir(path_to_input_files) if file.endswith('.xz')]

    if year:
        input_files = [file for file in input_files if name_field(file, 2)==year]

    if month:
        input_files = [file for file in input_files if name_field(file, 3)==month]

    input_files = [path_to_input_files+file for file in input_files]

    # Randomize For Parallel Processing
    return list(np.random.permutation(input_files))

# Quick check: list every available input file (no year/month filter applied).
get_input_files(year=None,month=None)

['../data/decahose/json/tweets.json.2016-12-23.xz']

In [4]:
# Create Path to Output File
def get_output_file(version_tweets,year=None,month=None):
    """Build the path of the pickle holding the parsed tweets.

    The directory and the file name both embed the module-level selection
    strategy ``how``. The directory is created if it does not exist yet, so
    calling this function has a filesystem side effect.

    Parameters
    ----------
    version_tweets : str
        Version tag placed just before the ``.pkl`` extension.
    year, month : optional
        When truthy, appended to the name as ``-year-<year>`` and
        ``-month-<month>`` respectively.

    Returns
    -------
    str
        Output directory concatenated with the output file name.
    """
    output_directory = '../data/decahose/parsed/tweets/'+how+'/'
    os.makedirs(output_directory, exist_ok=True)

    # Assemble the file name piece by piece; optional pieces are skipped.
    name_pieces = ['tweets-'+how+'-from-decahose']

    if year:
        name_pieces.append('-year-'+str(year))

    if month:
        name_pieces.append('-month-'+str(month))

    name_pieces.append('-'+version_tweets+'.pkl')

    return output_directory + ''.join(name_pieces)

# Preview the output path built when no year/month is given; note that this
# call also creates the output directory as a side effect.
get_output_file(version_tweets,year=None,month=None)

'../data/decahose/parsed/tweets/by-account-location/tweets-by-account-location-from-decahose-v4.pkl'

In [5]:
def parse_tweets_by_geocoordinates(input_file):
    """Extract the geotagged tweets from one xz-compressed file of JSON lines.

    A tweet is kept only when its own top-level ``coordinates`` field is set;
    coordinates that appear solely in nested objects (retweets, quotes, ...)
    do not count. Lines that cannot be decoded or parsed, or that do not hold
    a JSON object, are skipped.

    Parameters
    ----------
    input_file : str
        Path to an ``.xz`` file with one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per kept tweet with the columns listed in ``columns`` below.
        Missing optional fields are stored as ``None``.
    """
    columns=[
    'TIME',
    'ID',
    'TEXT',
    'EXTENDED TEXT',
    'LANG',
    'LAT',
    'LON',
    'USER ID',
    'USER LOCATION',
    'USER UTC OFFSET',
    'USER TIME ZONE',
    'USER DESCRIPTION',
    'USER IMAGE URL',
    ]

    tweets  = []

    with lzma.open(input_file,'rb') as f:

        for line in f:

            # Cheap byte-level prefilter: skip the JSON parsing for lines that
            # cannot contain geocoordinates (Could Be in the Replies)
            if b'"coordinates":{' not in line:
                continue

            # Json Parsing Can Fail. Catch Exception rather than using a bare
            # except so KeyboardInterrupt / SystemExit still stop the worker.
            try:
                tweet = json.loads(line.decode("utf-8"))
            except Exception:
                continue

            # A valid JSON line is not necessarily an object.
            if not isinstance(tweet, dict):
                continue

            # Only Selects If Geocoordinates in the Original Tweet (Not RT etc.)
            coordinates = tweet.get('coordinates')
            if not coordinates:
                continue

            # `or {}` also covers keys that are present but null, which
            # `.get(key, {})` would return as None.
            user = tweet.get('user') or {}
            extended_tweet = tweet.get('extended_tweet') or {}

            tweets.append([
            tweet.get('created_at'),
            tweet.get('id_str'),
            tweet.get('text'),
            extended_tweet.get('full_text'),
            tweet.get('lang'),
            # The coordinate pair is stored as [LON, LAT].
            coordinates['coordinates'][1],
            coordinates['coordinates'][0],
            user.get('id_str'),
            user.get('location'),
            user.get('utc_offset'),
            user.get('time_zone'),
            user.get('description'),
            user.get('profile_image_url'),
            ])

    return pd.DataFrame(tweets, columns=columns)

In [6]:
def parse_tweets_by_account_location(input_file):
    """Extract tweets whose author's self-reported location is of interest.

    A tweet is kept when ``user.location`` is non-empty and is an exact member
    of the module-level ``locations`` list. Lines that cannot be parsed, or
    that do not hold a JSON object, are skipped.

    Parameters
    ----------
    input_file : str
        Path to an ``.xz`` file with one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per kept tweet, indexed by ``ID``, with columns ``TIME``,
        ``TEXT``, ``LANG``, ``LAT``, ``LON``, ``USER ID`` and
        ``USER LOCATION``. ``TEXT`` holds the extended full text when the
        tweet has one; ``LAT``/``LON`` are empty unless the tweet itself is
        geotagged.
    """
    # Other fields found in the records (place, user utc_offset, time_zone,
    # description, profile_image_url) are intentionally not collected here.
    columns = [
    'TIME',
    'ID',
    'TEXT',
    'LANG',
    'LAT',
    'LON',
    'USER ID',
    'USER LOCATION',
    ]

    tweets = []

    with lzma.open(input_file,'rb') as f:

        for line in f:

            # Cheap byte-level prefilter before the expensive JSON parsing.
            if b',"location":' not in line:
                continue

            # Json Parsing Can Fail. Catch Exception rather than using a bare
            # except so KeyboardInterrupt / SystemExit still stop the worker.
            try:
                # Encoding Seems to Be Automatically Detected.
                tweet = json.loads(line)
            except Exception:
                continue

            # A valid JSON line is not necessarily an object.
            if not isinstance(tweet, dict):
                continue

            # `or {}` also covers a "user" key that is present but null, which
            # `.get('user', {})` would return as None.
            user = tweet.get('user') or {}

            # Self-reported Account Location
            location = user.get('location')

            if not location or location not in locations:
                continue

            # Prefer the untruncated text when the tweet carries one.
            extended_tweet = tweet.get('extended_tweet') or {}
            text = extended_tweet.get('full_text') or tweet.get('text')

            lat = None
            lon = None
            coordinates = tweet.get('coordinates')
            if coordinates:
                # The coordinate pair is stored as [LON, LAT].
                lat = coordinates['coordinates'][1]
                lon = coordinates['coordinates'][0]

            tweets.append([
            pd.to_datetime(tweet.get('created_at')),
            tweet.get('id'),
            text,
            tweet.get('lang'),
            lat,
            lon,
            user.get('id'),
            location,
            ])

    return pd.DataFrame(tweets, columns=columns).set_index('ID')

In [7]:
def main():
    """Parse and save the tweets of every configured year/month.

    For each (year, month) pair from the module-level ``years`` and ``months``
    lists, the matching input files are parsed in parallel with the parser
    selected by ``how``, concatenated, and pickled (xz-compressed) to the path
    given by ``get_output_file``. A pair is skipped when it has no input file
    or when its output file already exists. An unknown ``how`` terminates the
    program through ``sys.exit`` once a pair actually needs parsing.

    Returns
    -------
    int
        Always 0.
    """
    # One parser per selection strategy; both take a file path and return a
    # DataFrame, so they can be dispatched the same way.
    parsers = {
        'by-geocoordinates': parse_tweets_by_geocoordinates,
        'by-account-location': parse_tweets_by_account_location,
    }

    for year in years:

        for month in months:

            start = timer()
            print('Year:', year)
            print('Month:', month)

            input_files = get_input_files(year,month)
            print('# Input Files:', len(input_files))

            output_file = get_output_file(version_tweets,year,month)
            print('Output File:', output_file)

            if not len(input_files):

                print('Skipped.')
                print()
                continue

            if os.path.exists(output_file):

                print('Output File Already Exists.')
                print()
                continue

            print('Parse Tweets', how.replace('-',' ').title())
            parse_tweets = parsers.get(how)
            if parse_tweets is None:
                sys.exit('Parsing Error... Exit')

            # One file per task; the pool is only needed while mapping.
            with mp.Pool() as pool:
                frames = pool.map(parse_tweets, input_files)

            tweets = pd.concat(frames)

            # Per-file positional indexes are meaningless once concatenated, so
            # they are renumbered. A named index (the tweet 'ID' set by the
            # account-location parser) carries data and must be kept: dropping
            # it would remove the tweet IDs from the saved file.
            if tweets.index.name is None:
                tweets = tweets.reset_index(drop=True)

            print('# Tweets:', tweets.shape[0])

            print('Save Tweets...')
            tweets.to_pickle(output_file,compression='xz')
            # Release the month's data before parsing the next one.
            del tweets
            print('Done!')

            end = timer()
            print('Computing Time:', round(end - start), 'sec')
            print()

    return 0

In [8]:
# Time the whole run (every configured year/month handled by main()).
start = timer()

# Only launch the pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()
    
end = timer()
# Elapsed seconds for the full run, rounded to the nearest integer.
print('Total Computing Time:', round(end - start), 'sec')

Year: 2016
Month: 12
# Input Files: 1
Output File: ../data/decahose/parsed/tweets/by-account-location/tweets-by-account-location-from-decahose-year-2016-month-12-v4.pkl
Parse Tweets By Account Location
# Tweets: 1470
Save Tweets...
Done!
Computing Time: 31 sec

Total Computing Time: 31 sec


In [14]:
# Load one saved month of geotagged tweets and show its first rows.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only open files
# produced by this pipeline.
pd.read_pickle(
    '../data/decahose/parsed/tweets/by-geocoordinates/'
    'tweets-by-geocoordinates-from-decahose-year-2016-month-12-v4.pkl',
    compression='xz',
).head()

Unnamed: 0,TIME,ID,TEXT,EXTENDED TEXT,LANG,LAT,LON,USER ID,USER LOCATION,USER UTC OFFSET,USER TIME ZONE,USER DESCRIPTION,USER IMAGE URL
0,Fri Dec 23 21:10:52 +0000 2016,812405135255158785,#Alleyway hey: #SantaClaus takes #BeverlyHills...,,en,34.101555,-118.338292,434359026,,,,,http://pbs.twimg.com/profile_images/6148176389...
1,Fri Dec 23 21:11:09 +0000 2016,812405206583414784,This #job might be a great fit for you: Sr Man...,,en,41.270982,-80.780541,100046855,"Akron, OH",-18000.0,Eastern Time (US & Canada),Follow this account for geo-targeted Hospitali...,http://pbs.twimg.com/profile_images/7007271713...
2,Fri Dec 23 21:11:18 +0000 2016,812405244298752005,"Goodbye blue, welcome back brunette 😍😍😍 #blue ...",,en,52.394,-0.535,70506221,"Crewe, England",0.0,London,Instagram & Snapchat: msalex0304,http://pbs.twimg.com/profile_images/7754360717...
3,Fri Dec 23 21:11:32 +0000 2016,812405303052419072,Want to work at J. Crew? We're #hiring in #Van...,,en,49.282729,-123.120738,28474308,Vancouver,-18000.0,Eastern Time (US & Canada),Follow this account for geo-targeted Retail jo...,http://pbs.twimg.com/profile_images/6858544067...
4,Fri Dec 23 21:11:33 +0000 2016,812405307213234176,Con mamá 😍 te amo 😍💞 @ San Clemente del Tuyú -...,,es,-36.36533,-56.716019,1155522932,San Clemente del Tuyú,-10800.0,Buenos Aires,Que sea rock !!,http://pbs.twimg.com/profile_images/8071016199...
