In [17]:
import torch
from torch import Tensor
import pandas as pd
import numpy as np
from torch_geometric.nn import Node2Vec
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from tqdm import tqdm
from numpy.linalg import norm
# Record the installed PyTorch build alongside the notebook outputs.
print(torch.__version__)

# Locations of the Gowalla train / test interaction files read below.
train_pth, test_pth = (
    "./Data/gowalla/train.txt",
    "./Data/gowalla/test.txt",
)

2.0.1+cu118


In [18]:
def _count_items_per_line(path):
    """Return the number of item ids found on each non-blank line of ``path``.

    Every non-blank line is parsed as ``<uid> <item> <item> ...`` with
    whitespace-separated integer fields.

    Parameters
    ----------
    path : str
        Text file to read.

    Returns
    -------
    list of int
        One entry per non-blank line, in file order: the number of item ids
        that follow the user id on that line (0 when the line holds only a uid).

    Raises
    ------
    ValueError
        If any field on a non-blank line is not an integer.
    """
    counts = []
    with open(path) as interactions:
        # Iterate lazily instead of materialising the file with readlines().
        for line in interactions:
            # split() with no argument drops the newline and tolerates trailing
            # or repeated spaces, so a line such as "12 \n" parses as a user
            # with zero items instead of crashing on int('').
            fields = line.split()
            if not fields:
                # Blank line: nothing to count.
                continue
            int(fields[0])  # validate the user id, as the original parse did
            counts.append(len([int(token) for token in fields[1:]]))
    return counts


# Item counts per line of the train and test files, in file order.
len_train_items = _count_items_per_line(train_pth)
len_test_items = _count_items_per_line(test_pth)

In [19]:
# Rebind both count sequences as NumPy arrays so element-wise arithmetic works on them.
len_train_items, len_test_items = (
    np.array(counts) for counts in (len_train_items, len_test_items)
)

In [64]:
# Element-wise ratio of test item counts to train item counts (one value per file line).
# NOTE(review): this assumes both files list the same users in the same order and
# that no train count is zero -- confirm against the data files.
len_test_items/len_train_items

array([0.2519685 , 0.26530612, 0.26190476, ..., 0.27272727, 0.28125   ,
       0.25      ])

In [96]:
def save_dataframe(city, verbose=False, source = False, useSaved = True):
    """Build, persist and return the filtered check-in table for one city.

    When ``useSaved`` is true and a cached CSV for ``city`` already exists, the
    cached file is loaded with ``pd.read_csv`` and returned without any further
    processing. Otherwise the rows of the module-level ``df_merged`` frame whose
    ``city_name`` equals ``city`` are filtered, timestamped, re-indexed and
    written to disk.

    NOTE(review): ``df_merged`` is not defined in the visible part of this
    notebook; it must exist in the kernel before this function is called and
    must provide the columns ``city_name``, ``user_id``, ``venue_id``, ``utc``
    and ``time_zone``.

    Processing steps on a cache miss:

    1. Keep users with at least 2 rows, then venues with at least 20 rows
       (venue counts are taken after the user filter).
    2. ``local_time = utc + time_zone`` where ``time_zone`` is interpreted as
       an offset in minutes; rows whose ``local_time`` is missing are dropped.
    3. ``ts`` = whole hours elapsed since the earliest ``local_time``; rows are
       sorted by ``ts``.
    4. ``u`` = 1-based user code, ``i`` = venue code continuing after the
       largest ``u``, ``label`` = 1, ``idx`` = 1-based row number.
    5. Write ``ml_4square_<city>.csv`` (columns u, i, ts, label, idx), two
       all-zero feature arrays (``ml_4square_<city>.npy`` and
       ``ml_4square_<city>_node.npy``) and the full frame as the cache CSV.

    Parameters
    ----------
    city : str
        City name exactly as stored in ``df_merged['city_name']`` (e.g. "London").
    verbose : bool, default False
        Print intermediate shapes and the start time.
    source : bool, default False
        Currently unused; kept for backward compatibility with existing callers.
    useSaved : bool, default True
        Return the cached CSV when it exists instead of recomputing.

    Returns
    -------
    pandas.DataFrame
        The processed frame. On a cache hit this is whatever ``pd.read_csv``
        parses from the cached file, so dtypes and index may differ from the
        freshly computed frame.

    Raises
    ------
    ValueError
        If no rows remain for ``city`` after filtering.
    """
    cache_path = "/home/ttgn/tgn-master/data/TIST/df_" + city + ".csv"

    if useSaved and os.path.isfile(cache_path):
        return pd.read_csv(cache_path)

    # city_name must be in "London" format
    df_review = df_merged[df_merged['city_name'] == city]

    USER_MIN_CONNECTIONS = 2
    BUSINESSES_MIN_CONNECTIONS = 20
    FEATURE_DIM = 9  # width of the all-zero placeholder feature arrays

    print(USER_MIN_CONNECTIONS,BUSINESSES_MIN_CONNECTIONS)

    user_counts = df_review['user_id'].value_counts()
    active_users = user_counts[user_counts >= USER_MIN_CONNECTIONS].index
    df_review = df_review[df_review['user_id'].isin(active_users)]

    if verbose:
        print("After removing inactive users :",df_review.shape)

    # Venue counts are computed on the already user-filtered rows.
    venue_counts = df_review['venue_id'].value_counts()
    active_venues = venue_counts[venue_counts >= BUSINESSES_MIN_CONNECTIONS].index
    df_review = df_review[df_review['venue_id'].isin(active_venues)]

    if verbose:
        print("After removing inactive venues",df_review.shape)

    # assign() returns a new frame, so none of the columns below are written
    # onto a slice of df_merged (avoids SettingWithCopy problems).
    utc = pd.to_datetime(df_review['utc'], errors='coerce')
    tz_offset = pd.to_timedelta(df_review['time_zone'], unit='m')
    df_review = (
        df_review
        .assign(utc=utc, time_zone=tz_offset, local_time=utc + tz_offset)
        .dropna(subset=['local_time'])
    )

    if df_review.empty:
        # Previously this surfaced as an opaque "min() arg is an empty sequence".
        raise ValueError(f"No check-ins left for city {city!r} after filtering")

    start_time = df_review['local_time'].min()

    if verbose:
        print("start time", start_time)

    # Whole hours since the first check-in. Floor-dividing by a one-hour
    # Timedelta works on every pandas version, whereas
    # .astype('timedelta64[h]') raises on pandas >= 2.0.
    elapsed_hours = (df_review['local_time'] - start_time) // pd.Timedelta(hours=1)
    df_review = df_review.assign(ts=elapsed_hours.astype(int)).sort_values('ts')

    # Users get codes 1..n_users; venues continue numbering after the last user.
    user_codes = pd.factorize(df_review['user_id'])[0] + 1
    venue_codes = pd.factorize(df_review['venue_id'])[0] + 1 + user_codes.max()
    n_events = df_review.shape[0]
    df_review = df_review.assign(
        u=user_codes,
        i=venue_codes,
        label=1,
        idx=np.arange(1, n_events + 1),
    )

    df_review[['u','i','ts','label','idx']].to_csv("/home/ttgn/data/ml_4square_{}.csv".format(city),index=False)

    if verbose:
        print((n_events, FEATURE_DIM))

    # One all-zero feature row per event plus a leading all-zero row 0.
    feat = np.zeros((n_events + 1, FEATURE_DIM))

    if verbose:
        print(feat.shape)

    max_idx = max(df_review['u'].max(), df_review['i'].max())

    if verbose:
        print(max_idx)

    # One all-zero feature row per node code, including the unused code 0.
    rand_feat = np.zeros((max_idx + 1, FEATURE_DIM))

    if verbose:
        print(rand_feat.shape)

    np.save("/home/ttgn/data/ml_4square_{}.npy".format(city), feat)
    np.save("/home/ttgn/data/ml_4square_{}_node.npy".format(city), rand_feat)

    # Cache the full processed frame for later calls with useSaved=True.
    df_review.to_csv(cache_path)
    return df_review

In [117]:
def generate_data_ngcf(city_name, train_frac=0.10, test_frac=0.45):
    """Write ``train.txt`` / ``test.txt`` interaction files for one city.

    The frame returned by ``save_dataframe(city_name)`` is re-based so that
    both ``u`` and ``i`` start at 0. The first ``train_frac`` of its rows form
    the train split and the last ``test_frac`` of its rows form the test split;
    rows in between are not written to either file.

    Both files are written to ``./Data/four-square-<city_name>/`` and contain
    one line per user present anywhere in the frame, sorted by user id:
    ``<u> <i> <i> ...``. A user without items in a split gets a line holding
    only the user id.

    NOTE(review): items are written in row order and are not de-duplicated, so
    a venue visited several times appears several times on the user's line --
    confirm this is what the downstream loader expects.

    Parameters
    ----------
    city_name : str
        City passed through to ``save_dataframe``; also used in the folder name.
    train_frac : float, default 0.10
        Fraction of rows, taken from the head of the frame, used for training.
    test_frac : float, default 0.45
        Fraction of rows, taken from the tail of the frame, used for testing.

    Returns
    -------
    None
    """
    df_city = save_dataframe(city_name)
    df_city['u'] -= 1
    df_city['i'] -= df_city['i'].min()

    total_rows = len(df_city)
    df_train = df_city.head(int(train_frac * total_rows))
    df_test = df_city.tail(int(test_frac * total_rows))

    items_by_split = {
        'train.txt': df_train.groupby('u')['i'].apply(list).to_dict(),
        'test.txt': df_test.groupby('u')['i'].apply(list).to_dict(),
    }

    # Every user in the city gets a line in both files, even with no items.
    all_users = sorted(df_city['u'].unique().tolist())

    folder_path = f'./Data/four-square-{city_name}/'
    # makedirs also creates a missing ./Data parent and tolerates re-runs.
    os.makedirs(folder_path, exist_ok=True)

    for file_name, items_by_user in items_by_split.items():
        with open(os.path.join(folder_path, file_name), 'w') as split_file:
            for user in all_users:
                # Joining the uid together with its items leaves no trailing
                # space on item-less lines; a trailing space would parse as an
                # empty field and break split(' ')/int() based readers.
                fields = [user, *items_by_user.get(user, [])]
                split_file.write(' '.join(map(str, fields)) + '\n')

In [118]:
# Generate the train/test interaction files for each listed city in turn.
for city in (
    "London",
    "Brooklyn",
    "Toronto",
    "New York",
    "Madrid",
    "Los Angeles",
    "Barcelona",
    "Tokyo",
):
    generate_data_ngcf(city)
    
# "Brooklyn","Toronto","New York","Madrid","Los Angeles","Barcelona","Tokyo"

In [101]:
# def generate_data_ngcf(city_name):
    
#     df_city = save_dataframe(city_name)
#     df_city['u'] -= 1
#     df_city['i'] -= min(df_city['i'])
    
#     total_rows = len(df_city)
#     first_10_percent = int(0.10 * total_rows)
#     last_45_percent = int(0.45 * total_rows)
    
#     df_train = df_city.head(first_10_percent)
#     df_test = df_city.tail(last_45_percent)
    
#     result_train = df_train.groupby('u')['i'].unique().apply(list).to_dict()
#     result_test = df_test.groupby('u')['i'].unique().apply(list).to_dict()
    
#     print(len(result_train), len(result_test))
    
#     train_data = {}
#     test_data = {}

#     for key, values in result_train.items():
#         train_data[key] = values
        
#     for key, values in result_test.items():
#         test_data[key] = values


#     folder_path = f'./Data/four-square-{city_name}/'
        
#     if not os.path.exists(folder_path):
#         os.mkdir(folder_path)

#     # Writing to train.txt
#     with open(f'./Data/four-square-{city_name}/train.txt', 'w') as train_file:
#         for key in sorted(train_data.keys()):
#             train_file.write(f"{key} {' '.join(map(str, train_data[key]))}\n")

#     # Writing to test.txt
#     with open(f'./Data/four-square-{city_name}/test.txt',  'w') as test_file:
#         for key in sorted(test_data.keys()):
#             test_file.write(f"{key} {' '.join(map(str, test_data[key]))}\n")