In [1]:
from glob import glob
import pandas as pd
import numpy as np
from timeit import default_timer as timer
from itertools import chain

In [2]:
def groupby_apply(df, func, sort=True):
    """Group a two-column frame by its first column and apply ``func`` per group.

    Lightweight alternative to a pandas groupby/apply, built on ``np.unique``
    and ``np.split`` (the original author reports roughly a factor-of-10
    speed-up over pandas groupby).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly two columns, in the order ``[name_key, name_value]``.
    func : callable
        Called once per distinct key with a 1-D NumPy array holding that
        key's values.
    sort : bool, default True
        Sort the rows by the key column before splitting. Pass ``False`` only
        when the rows are already ordered by key: the split points are the
        first-occurrence positions reported by ``np.unique``, which only
        delimit the groups when equal keys are contiguous and in sorted order.

    Returns
    -------
    pandas.Series
        One entry per distinct key, named after the value column and indexed
        by the sorted unique keys (index named after the key column).
    """
    name_key, name_value = df.columns

    if sort:
        df = df.sort_values(by=name_key)
    keys, values = df.values.T

    # Empty input: np.unique yields no keys, but np.split would still return a
    # single (empty) chunk, so the Series constructor would fail on a length
    # mismatch. Return an empty result with the same name/index labelling.
    if len(keys) == 0:
        return pd.Series([], name=name_value, index=pd.Index([], name=name_key), dtype=object)

    # Sorted unique keys plus the row position where each key first appears.
    unique_keys, first_rows = np.unique(keys, return_index=True)

    # Cut the value column at every key transition (the first group starts at 0).
    chunks = np.split(values, first_rows[1:])

    # Apply func to each array of values corresponding to a given key.
    return pd.Series(
        [func(chunk) for chunk in chunks],
        name=name_value,
        index=pd.Index(unique_keys, name=name_key),
    )

In [3]:
# Root folder of the parsed decahose data. Other cells build file paths by
# plain string concatenation (path_to_data + 'users/...'), so the trailing
# slash is required.
path_to_data = '../data/decahose/parsed/'

In [4]:
# Collect the per-block user pickles that main() iterates over.
# NOTE(review): the recorded run output lists files named
# 'users-from-decahose-partition-*-block-*.pkl.xz', which this pattern would
# not match -- the outputs may be stale relative to the code; confirm which
# naming scheme is current before re-running.
input_files = glob(path_to_data+'users/user-id-and-location-from-decahose*')
print('# Input Files:', len(input_files))

# Input Files: 2


In [5]:
# Location strings that were identified upstream; used below to filter users
# by the exact text of their 'USER LOCATION' field.
account_locations = pd.read_pickle(
    path_to_data + 'locations/account-locations-identified.pkl'
)['LOCATION'].tolist()

# Quick sanity check: how many locations there are and what they look like.
print('# Account Locations:', len(account_locations))
print('First 10:', ', '.join(account_locations[:10]), sep='\n')

# Account Locations: 39779
First 10:
Indonesia, London, Brasil, Jakarta, Philippines, İstanbul, istanbul, indonesia, Argentina, Bandung


In [6]:
def get_users_by_account_location(input_file):
    """Return, for one input file, the set of user ids seen at each location.

    Reads the xz-compressed pickle at ``input_file``, keeps the rows whose
    'USER LOCATION' appears in the notebook-level ``account_locations`` list,
    and returns a Series indexed by 'USER LOCATION' whose values are sets of
    'USER ID'.
    """
    users = pd.read_pickle(input_file, compression='xz')

    # Keep only users with an identified location, and only the two columns
    # groupby_apply expects (key first, value second).
    has_known_location = users['USER LOCATION'].isin(account_locations)
    located = users.loc[has_known_location, ['USER LOCATION', 'USER ID']]

    # Rows are ordered by key here, so groupby_apply can skip its own sort.
    located = located.sort_values(by='USER LOCATION')

    return groupby_apply(located, set, sort=False)

In [7]:
def main():
    """Merge the per-file location -> user-id sets and pickle the result.

    Processes ``input_files`` in sorted order. The first file seeds the
    running Series; every later file is concatenated onto it and immediately
    reduced again (union of the id sets per location), so at most one extra
    file's worth of rows is held at a time. The final Series is written to
    'users/users-by-account-location.pkl.xz' under ``path_to_data``.

    Returns
    -------
    int
        0 on success, 1 when ``input_files`` is empty and nothing was saved.
    """
    # None marks "no file processed yet"; it also lets us detect an empty
    # input list instead of failing on an unbound name at save time.
    all_locations = None

    for i, input_file in enumerate(sorted(input_files)):

        start = timer()

        print()
        print('File:', i, input_file)

        file_locations = get_users_by_account_location(input_file)

        if all_locations is None:

            all_locations = file_locations

        else:

            # sort_index puts equal locations next to each other, which is
            # what groupby_apply(sort=False) relies on; reset_index turns the
            # Series back into the two-column [key, value] frame it expects.
            all_locations = pd.concat(
                [all_locations, file_locations]).sort_index().reset_index()

            print('# All Locations after Concatenation:', all_locations.shape[0])

            # Each value is a set of ids; flatten the sets sharing a key into one.
            all_locations = groupby_apply(
                all_locations, lambda x: set(chain.from_iterable(x)), sort=False)

            print('# All Locations after Reduction:', all_locations.shape[0])

        print('# All Locations:', all_locations.shape[0])

        print("Done in", round(timer()-start), "sec")

    if all_locations is None:
        print('No input files found; nothing to save.')
        return 1

    print('Save:')
    start = timer()

    all_locations.to_pickle(path_to_data+'users/users-by-account-location.pkl.xz',compression='xz')

    print("Done in", round(timer()-start), "sec")

    return 0

In [8]:
# Run the whole merge and report the wall-clock time of the cell.
print("Concatenate Files...")
start = timer()

if __name__ == "__main__":
    main()

end = timer()
print('\nTotal Computing Time:', round(end - start), 'Sec')

Concatenate Files...

File: 0 ../data/decahose/parsed/users/users-from-decahose-partition-0-block-0.pkl.xz
# All Locations: 13972
Done in 1 sec

File: 1 ../data/decahose/parsed/users/users-from-decahose-partition-0-block-1.pkl.xz
# All Locations after Concatenation: 27944
# All Locations after Reduction: 13972
# All Locations: 13972
Done in 1 sec
Save:
Done in 1 sec

Total Computing Time: 3 Sec


In [9]:
# Reload the file written by main() and preview the first rows as a check.
# NOTE(review): the recorded output shows keys with leading whitespace and
# visually duplicated entries (' Brasil' appears twice) -- presumably the
# location strings differ only in whitespace; confirm whether they should be
# normalized before grouping.
pd.read_pickle(path_to_data+'users/users-by-account-location.pkl.xz',compression='xz').head()

USER LOCATION
 Argentina    {2174749663, 777320118133153792, 467394687, 16...
 Australia                      {37227993, 235161650, 78787098}
 Brasil       {766491773644439552, 3013048749, 7902763772578...
 Brasil                                             {320400603}
 Brazil                                              {58787130}
Name: USER ID, dtype: object