In [1]:
# read only the needed users
import pandas as pd
ID = 7
# skiprows receives each 0-based row index and skips the row when the callable
# returns True, so only the rows with index % 50 == ID are loaded
df_users = pd.read_csv(
    'users.txt',
    sep='\t',
    header=None,
    names=['user_id'],
    skiprows=lambda row_idx: row_idx % 50 != ID,
)
df_users.head()

Unnamed: 0,user_id
0,78
1,799
2,1389
3,1997
4,2768


In [2]:
# keep the sampled user ids in a set for easier (constant-time) lookups afterwards
USER_IDS = set(df_users[df_users.columns[0]].values)
print(USER_IDS)

{106496, 1957889, 262164, 1949718, 417820, 1835037, 843809, 434212, 540708, 909348, 1196069, 589866, 966698, 868400, 32821, 958519, 1450055, 1851467, 78, 1720399, 1990739, 1302613, 41046, 16474, 565340, 163933, 1089629, 622688, 901221, 401511, 475247, 819316, 57463, 884864, 917641, 2039946, 1343627, 1376399, 1466511, 254097, 1081490, 1040531, 213145, 934062, 327857, 975025, 1417393, 1564850, 196790, 1130679, 811192, 1474752, 803013, 950471, 1319113, 344267, 229581, 1786071, 1818845, 1351911, 172264, 1515754, 368875, 246002, 352503, 499960, 680185, 786679, 1220856, 532733, 663807, 385288, 745743, 360721, 1278225, 1917207, 49445, 1524010, 237870, 74034, 1294644, 1687861, 1630526, 876863, 524608, 1261888, 713026, 1024324, 1442118, 1810761, 1458506, 704855, 794968, 1433945, 1507682, 115044, 295269, 688484, 2105704, 278892, 573806, 1147248, 1114482, 557437, 1032580, 90503, 999819, 1876367, 188817, 319890, 1048979, 426391, 123297, 770477, 1982896, 25015, 459200, 893376, 1491405, 1737166, 180

In [3]:
def filter_chunk_checking(data: pd.DataFrame, col_name, col_values):
    """Return the rows of ``data`` whose ``col_name`` value is one of ``col_values``.

    Parameters:
        data: the frame (e.g. one chunk of a larger file) to filter.
        col_name: label of the column to test.
        col_values: collection of accepted values, passed to ``Series.isin``.

    Returns:
        The subset of ``data`` for which the membership test is true; the
        original index labels and columns are kept.
    """
    keep_mask = data[col_name].isin(col_values)
    return data.loc[keep_mask]

def read_checkings(file_name):
    """Read a tab-separated check-in file in chunks and keep only the selected users.

    The file is read 1,000,000 rows at a time with explicitly supplied column
    names; every chunk is reduced to the rows whose ``user_id`` is in the
    module-level ``USER_IDS`` set.

    Parameters:
        file_name: path of the check-in file to read.

    Returns:
        A DataFrame with the columns ``user_id``, ``venue_id``, ``utc_time`` and
        ``timezone_offset_mins`` holding all matching rows. If the reader
        yields no chunk at all, an empty frame with those columns is returned.
    """
    column_names = ['user_id', 'venue_id', 'utc_time', 'timezone_offset_mins']
    # create a generator for the file
    file_reader = pd.read_csv(file_name, sep='\t', chunksize=1000000, names=column_names, encoding='utf-8')
    # Filter each chunk and collect the pieces; concatenating once at the end
    # avoids re-copying the accumulated rows on every iteration and avoids
    # concatenating with an empty seed frame (deprecated in recent pandas).
    filtered_chunks = [
        filter_chunk_checking(chunk, col_name='user_id', col_values=USER_IDS)
        for chunk in file_reader
    ]
    if not filtered_chunks:
        # pd.concat raises on an empty list, so return an empty frame instead
        return pd.DataFrame(columns=column_names)
    return pd.concat(filtered_chunks)

my_checkings = read_checkings('checkins_anonymized.txt')
# sanity check: the users found in the filtered check-ins must be exactly the selected users
assert set(my_checkings['user_id'].value_counts().index) == USER_IDS

In [4]:
# number of check-in rows kept for the selected users
print(my_checkings.shape[0])

449664


In [5]:
# Optional subsampling (currently disabled): uncomment the next line to keep only the first 222222 check-ins
# my_checkings = my_checkings.iloc[:222222, :]

In [6]:
def filter_chunk_friends(data: pd.DataFrame):
    """Keep the friendship rows whose ``user_id`` and ``friend_id`` are both in ``USER_IDS``.

    Parameters:
        data: frame with ``user_id`` and ``friend_id`` columns.

    Returns:
        The subset of ``data`` where both ids belong to the module-level
        ``USER_IDS`` set.
    """
    user_selected = data['user_id'].isin(USER_IDS)
    friend_selected = data['friend_id'].isin(USER_IDS)
    return data[user_selected & friend_selected]

def read_friends(file_name):
    """Read a tab-separated friendship file in chunks and keep edges between selected users.

    The file is read 10 ** 6 rows at a time with the column names ``user_id``
    and ``friend_id``; each chunk is filtered with ``filter_chunk_friends``.

    Parameters:
        file_name: path of the friendship file to read.

    Returns:
        A DataFrame with the columns ``user_id`` and ``friend_id`` holding all
        rows that passed the filter. If the reader yields no chunk at all, an
        empty frame with those columns is returned.
    """
    column_names = ["user_id", "friend_id"]
    # create a generator for the file
    file_reader = pd.read_csv(file_name, sep='\t', chunksize=10 ** 6, names=column_names, encoding='utf-8')
    # Filter each chunk and collect the pieces; concatenating once at the end
    # avoids re-copying the accumulated rows on every iteration and avoids
    # concatenating with an empty seed frame (deprecated in recent pandas).
    filtered_chunks = [filter_chunk_friends(chunk) for chunk in file_reader]
    if not filtered_chunks:
        # pd.concat raises on an empty list, so return an empty frame instead
        return pd.DataFrame(columns=column_names)
    return pd.concat(filtered_chunks)
    
my_friends_before = read_friends('friendship_before.txt')

# sanity check: both endpoints of every kept edge must be selected users
assert all(
    set(my_friends_before[endpoint].value_counts().index).issubset(USER_IDS)
    for endpoint in ('user_id', 'friend_id')
)

my_friends_after = read_friends('friendship_after.txt')

# same check for the later friendship snapshot
assert all(
    set(my_friends_after[endpoint].value_counts().index).issubset(USER_IDS)
    for endpoint in ('user_id', 'friend_id')
)


In [7]:
# venue data: collect the ids of every venue that appears in the kept check-ins
VENUE_IDS = set(my_checkings.loc[:, 'venue_id'].values)

def filter_chunk_venues(data: pd.DataFrame):
    """Keep the rows of ``data`` whose ``venue_id`` is in the module-level ``VENUE_IDS`` set.

    Parameters:
        data: frame with a ``venue_id`` column.

    Returns:
        The subset of ``data`` whose ``venue_id`` passed the membership test.
    """
    venue_selected = data['venue_id'].isin(VENUE_IDS)
    return data[venue_selected]

def read_venues(file_name):
    """Read a tab-separated venue file in chunks and keep only the referenced venues.

    The file is read 10 ** 6 rows at a time with explicitly supplied column
    names; each chunk is filtered with ``filter_chunk_venues``.

    Parameters:
        file_name: path of the venue file to read.

    Returns:
        A DataFrame with the columns ``venue_id``, ``latitude``, ``longitude``,
        ``category`` and ``country`` holding all rows that passed the filter.
        If the reader yields no chunk at all, an empty frame with those
        columns is returned.
    """
    column_names = ["venue_id", "latitude", "longitude", "category", "country"]
    # create a generator for the file
    file_reader = pd.read_csv(file_name, sep='\t', chunksize=10 ** 6, names=column_names, encoding='utf-8')
    # Filter each chunk and collect the pieces; concatenating once at the end
    # avoids re-copying the accumulated rows on every iteration and avoids
    # concatenating with an empty seed frame (deprecated in recent pandas).
    filtered_chunks = [filter_chunk_venues(chunk) for chunk in file_reader]
    if not filtered_chunks:
        # pd.concat raises on an empty list, so return an empty frame instead
        return pd.DataFrame(columns=column_names)
    return pd.concat(filtered_chunks)

my_venues = read_venues('POIs.txt')

# sanity check: every venue referenced by a kept check-in was found, and nothing else
assert set(my_venues['venue_id'].value_counts().index) == VENUE_IDS

In [8]:
# parse the utc_time column into pandas datetime values
my_checkings['utc_time'] = my_checkings['utc_time'].pipe(pd.to_datetime)

In [9]:
# write every filtered table to its own tab-separated file, without the index column
for frame, out_path in [
    (df_users, 'my_users.tsv'),
    (my_checkings, 'my_checkins_anonymized.tsv'),
    (my_friends_before, 'my_friendship_before.tsv'),
    (my_friends_after, 'my_friendship_after.tsv'),
    (my_venues, 'my_POIs.tsv'),
]:
    frame.to_csv(out_path, sep='\t', index=False)
