# 1. Data Loader

In [None]:
import numpy as np
from datetime import datetime
import pandas as pd

## 1.1. Load Data

> This function is used by `preprocess.py`, which in turn generates the train, validation and test files.

### 1.1.1. Preparing for the File

In [None]:
# Read the whole tab-separated source file into memory.  The context manager
# closes the handle even when reading fails; `f` stays bound afterwards, so a
# later `f.close()` is a harmless no-op.
file = 'gtd.txt'
with open(file, 'r') as f:
    lines = f.readlines()

### 1.1.2. Variable Initializations

In [None]:
# Lookup tables: raw user id -> dense user id, raw location -> dense location id.
user2id = {}
poi2id = {}

# One entry per user in each split: the user id and that user's slice of
# times, latitudes, longitudes and location ids.
train_user, train_time, train_lat, train_lon, train_loc = ([] for _ in range(5))
valid_user, valid_time, valid_lat, valid_lon, valid_loc = ([] for _ in range(5))
test_user, test_time, test_lat, test_lon, test_loc = ([] for _ in range(5))

# Scratch buffers holding the records of the user currently being read.
user_time, user_lat, user_lon, user_loc = ([] for _ in range(4))

# Minimum number of records a user must have to be kept.
attack_threshold = 30

### 1.1.3. Select Eligible Users and Create `user2id` Dictionary

In [None]:
# Pass 1: count the records in each run of consecutive lines that share a raw
# user id, and give every user reaching `attack_threshold` a dense id.
# The file is expected to be grouped by user id.
prev_user = int(lines[0].split('\t')[0])
attack_cnt = 0

for line in lines:
    # The first tab-separated column is the raw user id.
    user = int(line.strip().split('\t')[0])
    if user == prev_user:
        attack_cnt += 1
        continue
    # A new user starts on this line: settle the user that just ended.
    # `setdefault` keeps an id that was already assigned, so ids stay dense
    # even if this code runs twice.
    if attack_cnt >= attack_threshold:
        user2id.setdefault(prev_user, len(user2id))
    prev_user = user
    attack_cnt = 1

# The loop settles a user only when the next one shows up, so the last user
# in the file has to be settled explicitly.
if attack_cnt >= attack_threshold:
    user2id.setdefault(prev_user, len(user2id))

### 1.1.4. Create the lists: _user, _lat, _lon, _loc

In [None]:
# Pass 2: gather the records of every eligible user and split each user's
# sequence into train (first 70%), validation (next 10%) and test (the rest).
def _store_user_split(uid, times, lats, lons, locs):
    """Append one user's 70/10/20 split to the train/valid/test lists."""
    train_threshold = int(len(times) * 0.7)
    valid_threshold = int(len(times) * 0.8)
    targets = (
        (slice(None, train_threshold),
         train_user, (train_time, train_lat, train_lon, train_loc)),
        (slice(train_threshold, valid_threshold),
         valid_user, (valid_time, valid_lat, valid_lon, valid_loc)),
        (slice(valid_threshold, None),
         test_user, (test_time, test_lat, test_lon, test_loc)),
    )
    for part, users, buckets in targets:
        users.append(uid)
        for bucket, values in zip(buckets, (times, lats, lons, locs)):
            bucket.append(values[part])


# Dense id of the user whose records are being accumulated (None: nobody yet).
# Only dense ids are compared here, never a raw id with a dense one.
prev_user = None
for line in lines:
    tokens = line.strip().split('\t')
    user = user2id.get(int(tokens[0]))
    # Users below the record threshold have no dense id and are skipped.
    if user is None:
        continue

    # Date as a number of days since 1970-01-01.
    time = (datetime.strptime(tokens[1], "%Y-%m-%d") - datetime(1970, 1, 1)).days
    # Coordinates are parsed to floats so they can be used numerically.
    lat, lon, location = float(tokens[2]), float(tokens[3]), tokens[4]

    # Rename location ids to dense ids in order of first appearance.
    if location not in poi2id:
        poi2id[location] = len(poi2id)
    loc = poi2id[location]

    if user != prev_user:
        # A new user begins: store the finished user's data under that
        # user's own id, then start fresh buffers.
        if prev_user is not None:
            _store_user_split(prev_user, user_time, user_lat, user_lon, user_loc)
        prev_user = user
        user_time, user_lat, user_lon, user_loc = [], [], [], []

    # Prepending reverses the file order of the records within a user.
    user_time.insert(0, time)
    user_lat.insert(0, lat)
    user_lon.insert(0, lon)
    user_loc.insert(0, loc)

# The last eligible user is not followed by another one, so store it here.
if prev_user is not None:
    _store_user_split(prev_user, user_time, user_lat, user_lon, user_loc)

f.close()

### 1.1.5. Wrapping Everything into a Function

In [None]:
def load_data(source_file):
    """Load per-user records and split them into train/valid/test parts.

    ``source_file`` is a tab-separated text file. Column 0 is the raw user id
    (int), column 1 a date in ``%Y-%m-%d`` format, columns 2-4 are latitude,
    longitude and location id, each parsed with ``float``. Blank lines are
    ignored. Lines must be grouped by user id.

    Only users with at least 30 records are kept; they receive dense ids in
    order of appearance. Locations of kept users also receive dense integer
    ids in order of first appearance. Within a user the records are stored in
    reverse file order, then split: first 70% train, next 10% validation,
    remainder test.

    Returns a 17-tuple::

        (user_count, poi2id,
         train_user, train_time, train_lat, train_lon, train_loc,
         valid_user, valid_time, valid_lat, valid_lon, valid_loc,
         test_user,  test_time,  test_lat,  test_lon,  test_loc)

    where every ``*_user`` list holds dense user ids and the other lists hold
    one list per user. Times are days since 1970-01-01. An empty file yields
    ``0``, an empty ``poi2id`` and empty lists.
    """
    # Local imports keep the function independent of whether the module-level
    # name `datetime` refers to the class or to the module.
    from datetime import datetime
    from itertools import groupby

    attack_threshold = 30
    epoch = datetime(1970, 1, 1)

    with open(source_file, 'r') as f:
        rows = [line.strip().split('\t') for line in f if line.strip()]

    user2id, poi2id = {}, {}
    train_user, train_time, train_lat, train_lon, train_loc = [], [], [], [], []
    valid_user, valid_time, valid_lat, valid_lon, valid_loc = [], [], [], [], []
    test_user, test_time, test_lat, test_lon, test_loc = [], [], [], [], []

    # Each run of consecutive rows with the same raw id is one user.  Handling
    # a whole run at once means the last user is treated like any other and
    # every split is stored under the id of the user it belongs to.
    for raw_user, group in groupby(rows, key=lambda tokens: int(tokens[0])):
        records = list(group)
        if len(records) < attack_threshold:
            continue
        user = user2id.setdefault(raw_user, len(user2id))

        user_time, user_lat, user_lon, user_loc = [], [], [], []
        for tokens in records:
            user_time.append((datetime.strptime(tokens[1], "%Y-%m-%d") - epoch).days)
            user_lat.append(float(tokens[2]))
            user_lon.append(float(tokens[3]))
            location = float(tokens[4])
            if location not in poi2id:
                poi2id[location] = len(poi2id)
            user_loc.append(poi2id[location])

        # Append-then-reverse gives the same order as repeated insert(0, ...)
        # without the quadratic cost.
        for seq in (user_time, user_lat, user_lon, user_loc):
            seq.reverse()

        train_threshold = int(len(user_time) * 0.7)
        valid_threshold = int(len(user_time) * 0.8)

        train_user.append(user)
        train_time.append(user_time[:train_threshold])
        train_lat.append(user_lat[:train_threshold])
        train_lon.append(user_lon[:train_threshold])
        train_loc.append(user_loc[:train_threshold])

        valid_user.append(user)
        valid_time.append(user_time[train_threshold:valid_threshold])
        valid_lat.append(user_lat[train_threshold:valid_threshold])
        valid_lon.append(user_lon[train_threshold:valid_threshold])
        valid_loc.append(user_loc[train_threshold:valid_threshold])

        test_user.append(user)
        test_time.append(user_time[valid_threshold:])
        test_lat.append(user_lat[valid_threshold:])
        test_lon.append(user_lon[valid_threshold:])
        test_loc.append(user_loc[valid_threshold:])

    return len(user2id), poi2id, \
           train_user, train_time, train_lat, train_lon, train_loc, \
           valid_user, valid_time, valid_lat, valid_lon, valid_loc, \
           test_user, test_time, test_lat, test_lon, test_loc

# 2. Preprocess

In [None]:
import os
import datetime
import csv
import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import data_loader

## 2.1. Define Parameters

In [None]:
# Tensor type aliases used when converting inputs.  Both are CUDA tensor
# types, so the code that uses them needs a GPU.
ftype = torch.cuda.FloatTensor
ltype = torch.cuda.LongTensor

## 2.2. Load Data

In [None]:
# Path of the raw input file that is handed to data_loader.load_data.
source_file = 'gtd.txt'

In [None]:
# Load every split in one call; the result is unpacked positionally, so the
# order of the names below must match the tuple returned by load_data.
print('Loading data...')
(user_cnt, poi2id,
 train_user, train_time, train_lati, train_longi, train_loc,
 valid_user, valid_time, valid_lati, valid_longi, valid_loc,
 test_user, test_time, test_lati, test_longi, test_loc) = data_loader.load_data(source_file)
print('Data loaded successfully!')

In [None]:
# Report how many users were kept and how many distinct locations they visited.
print('User /Location: {:d} / {:d}'.format(user_cnt, len(poi2id)))
print('=======================================================')

## 2.3 Define Hyperparameters for the Model

In [None]:
# Size of the embeddings and of the square transition matrices.
dim = 7
# Window width subtracted from a record's time in find_w_cap; times are day
# counts as produced by data_loader.load_data.
ww = 30
# Upper / lower bounds used by the time interpolation in get_location_vector.
up_time = 120
lw_time = 5   
# Upper / lower bounds used by the distance interpolation.
# NOTE(review): unit of the distance bounds is not visible here - confirm.
up_dist = 100   
lw_dist = 1

## 2.4 Define Training Parameters

In [None]:
# NOTE(review): none of these four values is read by the preprocessing code
# in this section; presumably kept for parity with the training script.
batch_size = 2
learning_rate = 0.001
momentum = 0.9
evaluate_every = 1

## 2.5 Define the ST-RNN Module

In [None]:
class STRNNModule(nn.Module):
    """Preprocessing helper that writes context windows to a text file.

    Despite being an ``nn.Module``, ``forward`` returns nothing: it writes a
    header line with the user id and then data lines to the module-level file
    object ``f``. Each data line has four tab-separated fields: time
    differences, distances and context location ids (each comma-separated),
    followed by the target location id.

    The class reads these module-level names: ``user_cnt``, ``dim``,
    ``poi2id``, ``ftype``, ``ww``, ``up_time``, ``lw_time``, ``up_dist``,
    ``lw_dist`` and ``f``.
    """

    def __init__(self):
        super(STRNNModule, self).__init__()

        # Embeddings
        self.user_weight = Variable(torch.randn(user_cnt, dim), requires_grad=False).type(ftype)
        self.h_0 = Variable(torch.randn(dim, 1), requires_grad=False).type(ftype)
        self.location_weight = nn.Embedding(len(poi2id), dim)
        self.perm_weight = nn.Embedding(user_cnt, dim)

        # Attributes
        self.time_upper = nn.Parameter(torch.randn(dim, dim).type(ftype))
        self.time_lower = nn.Parameter(torch.randn(dim, dim).type(ftype))
        self.dist_upper = nn.Parameter(torch.randn(dim, dim).type(ftype))
        self.dist_lower = nn.Parameter(torch.randn(dim, dim).type(ftype))
        self.C = nn.Parameter(torch.randn(dim, dim).type(ftype))

        # Modules
        self.sigmoid = nn.Sigmoid()

    def find_w_cap(self, times, i):
        """Return the index before ``i`` whose time is closest to ``times[i] - ww``.

        The scan walks backwards from ``i - 1``. It returns ``0`` when it runs
        out of earlier entries without passing the target time.
        """
        # Copy the times to host memory once instead of once per comparison.
        stamps = times.data.cpu().numpy()
        trg_t = stamps[i] - ww
        tmp_t = stamps[i]
        tmp_i = i - 1
        for idx in range(1, i + 1):
            t_w = stamps[i - idx]
            if t_w == trg_t:
                return i - idx
            elif t_w > trg_t:
                # Still inside the window: remember the closest candidate.
                tmp_t = t_w
                tmp_i = i - idx
            elif t_w < trg_t:
                # First entry beyond the window: pick whichever side is closer.
                if trg_t - t_w < tmp_t - trg_t:
                    return i - idx
                else:
                    return tmp_i
        return 0

    def return_h_tw(self, times, latis, longis, locs, idx):
        """Write the context window that ends at ``idx`` to ``f``.

        Recurses to the start of the window first, so lines appear in
        increasing index order. Returns ``self.h_0`` when the window reaches
        index 0 (nothing is written for that index) and ``None`` otherwise.
        """
        w_cap = self.find_w_cap(times, idx)
        # Value comparison; `is 0` only worked through CPython's int caching.
        if w_cap == 0:
            return self.h_0
        else:
            self.return_h_tw(times, latis, longis, locs, w_cap)

        lati = latis[idx] - latis[w_cap:idx]
        longi = longis[idx] - longis[w_cap:idx]
        td = times[idx] - times[w_cap:idx]
        ld = self.euclidean_dist(lati, longi)

        # Converting the whole id tensor at once works whether indexing a
        # tensor yields one-element tensors or scalars.
        loc_ids = locs.data.cpu().numpy()
        fields = [
            ','.join(str(e) for e in td.data.cpu().numpy()),
            ','.join(str(e) for e in ld.data.cpu().numpy()),
            ','.join(str(e) for e in loc_ids[w_cap:idx]),
            str(loc_ids[idx]),
        ]
        f.write('\t'.join(fields) + '\n')

    # get transition matrices by linear interpolation
    def get_location_vector(self, td, ld, locs):
        """Sum the interpolated time/distance transforms of each location embedding.

        Not called by any other method of this class.
        """
        tud = up_time - td
        tdd = td - lw_time
        lud = up_dist - ld
        ldd = ld - lw_dist
        loc_vec = 0
        # `range` instead of `xrange`, which is undefined on Python 3.
        for i in range(len(tud)):
            Tt = torch.div(torch.mul(self.time_upper, tud[i]) + torch.mul(self.time_lower, tdd[i]),
                            tud[i]+tdd[i])
            Sl = torch.div(torch.mul(self.dist_upper, lud[i]) + torch.mul(self.dist_lower, ldd[i]),
                            lud[i]+ldd[i])
            loc_vec += torch.mm(Sl, torch.mm(Tt, torch.t(self.location_weight(locs[i]))))
        return loc_vec

    def euclidean_dist(self, x, y):
        """Return ``sqrt(x**2 + y**2)`` element-wise."""
        return torch.sqrt(torch.pow(x, 2) + torch.pow(y, 2))

    def forward(self, user, times, latis, longis, locs, step):
        """Write the user id line and the user's context windows to ``f``.

        ``step`` is accepted but not used. Nothing is returned.
        """
        f.write(str(user.data.cpu().numpy()[0])+"\n")
        # positive sampling
        pos_h = self.return_h_tw(times, latis, longis, locs, len(times)-1)

## 2.6. Define the Run Function

In [None]:
def run(user, time, lati, longi, loc, step):
    """Convert one user's sequences to tensors and pass them to ``strnn_model``.

    ``user`` is wrapped in a one-element list and converted with ``ltype``,
    as is ``loc``; ``time``, ``lati`` and ``longi`` are converted with
    ``ftype``. The model's result is discarded and nothing is returned.
    """
    def as_variable(values, tensor_type):
        return Variable(torch.from_numpy(np.asarray(values))).type(tensor_type)

    user_var = as_variable([user], ltype)
    time_var = as_variable(time, ftype)
    lati_var = as_variable(lati, ftype)
    longi_var = as_variable(longi, ftype)
    loc_var = as_variable(loc, ltype)

    strnn_model(user_var, time_var, lati_var, longi_var, loc_var, step)

## 2.7 Run the Model and Write into the File

In [None]:
strnn_model = STRNNModule().cuda()

# `f` has to stay a module-level name because STRNNModule writes through it.
# Each `with` block closes its file even if a run raises part-way through.
# NOTE(review): the paths start with "/", i.e. the filesystem root, while the
# training script reads relative "prepro_*_50.txt" files - confirm the
# intended location and suffix.
print("Making train file...")
with open("/prepro_train_%s.txt"%lw_time, 'w') as f:
    # Training
    train_batches = list(zip(train_time, train_lati, train_longi, train_loc))
    for j, train_batch in enumerate(tqdm.tqdm(train_batches, desc="train")):
        batch_time, batch_lati, batch_longi, batch_loc = train_batch
        run(train_user[j], batch_time, batch_lati, batch_longi, batch_loc, step=1)

print("Making valid file...")
with open("/prepro_valid_%s.txt"%lw_time, 'w') as f:
    # Evaluating
    valid_batches = list(zip(valid_time, valid_lati, valid_longi, valid_loc))
    for j, valid_batch in enumerate(tqdm.tqdm(valid_batches, desc="valid")):
        batch_time, batch_lati, batch_longi, batch_loc = valid_batch
        run(valid_user[j], batch_time, batch_lati, batch_longi, batch_loc, step=2)

print("Making test file...")
with open("/prepro_test_%s.txt"%lw_time, 'w') as f:
    # Testing
    test_batches = list(zip(test_time, test_lati, test_longi, test_loc))
    for j, test_batch in enumerate(tqdm.tqdm(test_batches, desc="test")):
        batch_time, batch_lati, batch_longi, batch_loc = test_batch
        run(test_user[j], batch_time, batch_lati, batch_longi, batch_loc, step=3)

# 3. Treat Preprocess

> Note that this function works not only on the train file, but also on the valid and test files.

In [None]:
def treat_prepro(train, step):
    """Parse a preprocessed file into per-user sequences.

    The file alternates between header lines and data lines. A line with
    fewer than three tab-separated fields is a header whose first field is
    the user id. Any other line is a data line with four fields:
    comma-separated time differences, comma-separated distances,
    comma-separated location ids and one target location id. Blank lines are
    ignored. Data lines that precede the first header are attributed to
    user ``1``.

    ``train`` is the file path. ``step`` is accepted for backward
    compatibility and does not influence the result.

    Returns ``(users, td, ld, loc, dst)``: ``users`` lists the ids of users
    that have at least one data line; the other four lists hold, per user,
    a list of float arrays (``td``, ``ld``), a list of int arrays (``loc``)
    and a list of ints (``dst``).
    """
    with open(train, 'r') as train_f:
        lines = train_f.readlines()

    # Each group is (user, td, ld, loc, dst); a header line opens a new one.
    groups = [(1, [], [], [], [])]
    for line in lines:
        tokens = line.strip().split('\t')
        if tokens == ['']:
            continue
        if len(tokens) < 3:
            groups.append((int(tokens[0]), [], [], [], []))
            continue
        _, user_td, user_ld, user_loc, user_dst = groups[-1]
        user_td.append(np.array([float(t) for t in tokens[0].split(',')]))
        user_ld.append(np.array([float(t) for t in tokens[1].split(',')]))
        user_loc.append(np.array([int(t) for t in tokens[2].split(',')]))
        user_dst.append(int(tokens[3]))

    train_user, train_td, train_ld, train_loc, train_dst = [], [], [], [], []
    for user, user_td, user_ld, user_loc, user_dst in groups:
        # Users without any data line are dropped.
        if user_td:
            train_user.append(user)
            train_td.append(user_td)
            train_ld.append(user_ld)
            train_loc.append(user_loc)
            train_dst.append(user_dst)

    return train_user, train_td, train_ld, train_loc, train_dst

# 4. Model Training and Evaluation

In [None]:
import os
# Sets CUDA_LAUNCH_BLOCKING=1, which asks CUDA to run kernel launches
# synchronously; typically used so that a failing kernel is reported at the
# call that launched it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
#! /usr/bin/env python

import os
import datetime
import math
import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import data_loader

# Parameters
# ==================================================
# Tensor type aliases; both are CUDA types, so a GPU is required.
ftype = torch.cuda.FloatTensor
ltype = torch.cuda.LongTensor

# Data loading params
# NOTE(review): the preprocessing cell names its output after its own
# `lw_time` value - confirm these "_50" files are the ones it produced.
train_file = "prepro_train_50.txt"
valid_file = "prepro_valid_50.txt"
test_file = "prepro_test_50.txt"

# Model Hyperparameters
dim = 7    # dimensionality
ww = 30  # window width (the original note said 6h; NOTE(review): unit unclear - confirm)
up_time = 365  # day
lw_time = 0.
up_dist = 457.335   # km
lw_dist = 0.
reg_lambda = 0.1  # passed to the optimizer as weight_decay

# Training Parameters
batch_size = 2
num_epochs = 30
learning_rate = 0.001
momentum = 0.9
evaluate_every = 1  # run validation every this many epochs
# Initial hidden state fed to the first step of every sequence in run().
h_0 = Variable(torch.randn(dim, 1), requires_grad=False).type(ftype)

# Sizes of the user and location embedding tables.
# NOTE(review): hard-coded; presumably must match the preprocessed data.
user_cnt = 294 
loc_cnt = 28442


# Python 2/3 compatibility: fall back to `range` when `xrange` does not exist.
try:
    xrange
except NameError:
    xrange = range

# Data Preparation
# ===========================================================
# Load data
# Each call parses one preprocessed file into per-user lists of time
# differences (td), distances (ld), context locations (loc) and targets (dst).
print("Loading data...")
train_user, train_td, train_ld, train_loc, train_dst = data_loader.treat_prepro(train_file, step=1)
valid_user, valid_td, valid_ld, valid_loc, valid_dst = data_loader.treat_prepro(valid_file, step=2)
test_user, test_td, test_ld, test_loc, test_dst = data_loader.treat_prepro(test_file, step=3)

print("User/Location: {:d}/{:d}".format(user_cnt, loc_cnt))
print("==================================================================================")

class STRNNCell(nn.Module):
    """One recurrent step of the ST-RNN model plus its loss and ranking helpers.

    The constructor reads the module-level ``loc_cnt`` and ``user_cnt`` to
    size the location and user embedding tables.
    """

    def __init__(self, hidden_size):
        super(STRNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.weight_ih = nn.Parameter(torch.Tensor(hidden_size, hidden_size)) # C
        self.weight_th_upper = nn.Parameter(torch.Tensor(hidden_size, hidden_size)) # T
        self.weight_th_lower = nn.Parameter(torch.Tensor(hidden_size, hidden_size)) # T
        self.weight_sh_upper = nn.Parameter(torch.Tensor(hidden_size, hidden_size)) # S
        self.weight_sh_lower = nn.Parameter(torch.Tensor(hidden_size, hidden_size)) # S

        self.location_weight = nn.Embedding(loc_cnt, hidden_size)
        self.permanet_weight = nn.Embedding(user_cnt, hidden_size)

        self.sigmoid = nn.Sigmoid()

        # The parameters above are allocated uninitialised; fill them now.
        self.reset_parameters()

    def reset_parameters(self):
        """Draw every parameter uniformly from ``[-1/sqrt(h), 1/sqrt(h)]``."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, td_upper, td_lower, ld_upper, ld_lower, loc, hx):
        """Return the next hidden state, a ``hidden_size x 1`` tensor.

        For each location in ``loc`` a time matrix and a distance matrix are
        built by interpolating between the upper and lower weights, using the
        matching entries of ``td_*`` / ``ld_*`` as interpolation weights. The
        transformed location embeddings are summed, ``weight_ih @ hx`` is
        added and the result goes through a sigmoid.
        """
        loc_len = len(loc)
        Ttd = [((self.weight_th_upper*td_upper[i] + self.weight_th_lower*td_lower[i])
                /(td_upper[i]+td_lower[i])) for i in range(loc_len)]
        Sld = [((self.weight_sh_upper*ld_upper[i] + self.weight_sh_lower*ld_lower[i])
                /(ld_upper[i]+ld_lower[i])) for i in range(loc_len)]

        loc = self.location_weight(loc).view(-1,self.hidden_size,1)
        loc_vec = torch.sum(torch.cat([torch.mm(Sld[i], torch.mm(Ttd[i], loc[i]))
                .view(1,self.hidden_size,1) for i in range(loc_len)], dim=0), dim=0)
        usr_vec = torch.mm(self.weight_ih, hx)
        hx = loc_vec + usr_vec # hidden_size x 1
        return self.sigmoid(hx)

    def loss(self, user, td_upper, td_lower, ld_upper, ld_lower, loc, dst, hx):
        """Return ``log(1 + exp(-score))`` for the true destination ``dst``.

        The score is the dot product of the destination embedding with the
        new hidden state plus the user's embedding.
        """
        h_tq = self.forward(td_upper, td_lower, ld_upper, ld_lower, loc, hx)
        p_u = self.permanet_weight(user)
        q_v = self.location_weight(dst)
        output = torch.mm(q_v, (h_tq + torch.t(p_u)))

        # softplus(-x) equals log(1 + exp(-x)) but does not overflow to inf
        # when the score is very negative.
        return F.softplus(torch.neg(output))

    def validation(self, user, td_upper, td_lower, ld_upper, ld_lower, loc, dst, hx):
        """Return all location indices ordered from highest to lowest score.

        ``dst`` is accepted but not used.
        """
        # error exist in distance (ld_upper, ld_lower)
        h_tq = self.forward(td_upper, td_lower, ld_upper, ld_lower, loc, hx)
        p_u = self.permanet_weight(user)
        user_vector = h_tq + torch.t(p_u)
        ret = torch.mm(self.location_weight.weight, user_vector).data.cpu().numpy()
        return np.argsort(np.squeeze(-1*ret))

###############################################################################################
def parameters():
    """Return the trainable parameters of ``strnn_model`` as one flat list."""
    models = (strnn_model,)
    return [weight for model in models for weight in model.parameters()]

def print_score(batches, step):
    """Print recall@k for k in 1, 5, 10, 100, 1000 and 10000.

    Every element of ``batches`` is ``(user, td, ld, loc, dst)``. Sequences
    with fewer than three steps are skipped. For the others ``run`` returns a
    ranking and the target; a hit at ``k`` means the target is among the
    first ``k`` entries of the ranking. ``step`` is forwarded to ``run``.

    When no sequence qualifies, a notice is printed instead of dividing by
    zero.
    """
    cutoffs = (1, 5, 10, 100, 1000, 10000)
    hits = dict.fromkeys(cutoffs, 0.)
    iter_cnt = 0

    for batch in tqdm.tqdm(batches, desc="validation"):
        batch_user, batch_td, batch_ld, batch_loc, batch_dst = batch
        if len(batch_loc) < 3:
            continue
        iter_cnt += 1
        batch_o, target = run(batch_user, batch_td, batch_ld, batch_loc, batch_dst, step=step)

        for k in cutoffs:
            hits[k] += target in batch_o[:k]

    if iter_cnt == 0:
        print("No sequence with at least 3 steps; recall is undefined.")
        return

    for k in cutoffs:
        print("recall@{:d}: ".format(k), hits[k]/iter_cnt)

###############################################################################################
def run(user, td, ld, loc, dst, step):
    """Run the model over one user's sequence, then train or evaluate.

    All steps but the last update the hidden state, starting from ``h_0``.
    The last step is scored against ``dst[-1]``.

    With ``step > 1`` nothing is trained and the function returns
    ``(ranking, dst[-1])``, where ``ranking`` comes from
    ``strnn_model.validation``. Otherwise the loss is back-propagated, the
    optimizer takes one step and the loss is returned as a NumPy array.
    """
    def step_inputs(idx):
        """Build the interpolation weights and location ids for step ``idx``."""
        td_upper = Variable(torch.from_numpy(np.asarray(up_time-td[idx]))).type(ftype)
        td_lower = Variable(torch.from_numpy(np.asarray(td[idx]-lw_time))).type(ftype)
        ld_upper = Variable(torch.from_numpy(np.asarray(up_dist-ld[idx]))).type(ftype)
        ld_lower = Variable(torch.from_numpy(np.asarray(ld[idx]-lw_dist))).type(ftype)
        location = Variable(torch.from_numpy(np.asarray(loc[idx]))).type(ltype)
        return td_upper, td_lower, ld_upper, ld_lower, location

    optimizer.zero_grad()

    seqlen = len(td)
    user = Variable(torch.from_numpy(np.asarray([user]))).type(ltype)

    # Roll the hidden state forward over every step except the last one.
    # (The per-step debug print that used to sit here was removed.)
    rnn_output = h_0
    for idx in range(seqlen - 1):
        td_upper, td_lower, ld_upper, ld_lower, location = step_inputs(idx)
        rnn_output = strnn_model(td_upper, td_lower, ld_upper, ld_lower, location, rnn_output)

    td_upper, td_lower, ld_upper, ld_lower, location = step_inputs(-1)

    if step > 1:
        return strnn_model.validation(user, td_upper, td_lower, ld_upper, ld_lower, location, dst[-1], rnn_output), dst[-1]

    destination = Variable(torch.from_numpy(np.asarray([dst[-1]]))).type(ltype)
    J = strnn_model.loss(user, td_upper, td_lower, ld_upper, ld_lower, location, destination, rnn_output)

    J.backward()
    optimizer.step()

    return J.data.cpu().numpy()

###############################################################################################
strnn_model = STRNNCell(dim).cuda()
optimizer = torch.optim.SGD(parameters(), lr=learning_rate, momentum=momentum, weight_decay=reg_lambda)

# The per-user sequences do not change between epochs, so the batch lists
# are built once instead of once per epoch.
train_batches = list(zip(train_user, train_td, train_ld, train_loc, train_dst))
valid_batches = list(zip(valid_user, valid_td, valid_ld, valid_loc, valid_dst))

for i in range(num_epochs):
    # Training
    # NOTE(review): total_loss is accumulated but never reported - confirm
    # whether it should be printed.
    total_loss = 0.
    for batch_user, batch_td, batch_ld, batch_loc, batch_dst in tqdm.tqdm(train_batches, desc="train"):
        # Sequences with fewer than three steps are skipped.
        if len(batch_loc) < 3:
            continue
        total_loss += run(batch_user, batch_td, batch_ld, batch_loc, batch_dst, step=1)
    # Evaluation
    if (i + 1) % evaluate_every == 0:
        print("==================================================================================")
        print_score(valid_batches, step=2)

# Testing
print("Training End..")
print("==================================================================================")
print("Test: ")
test_batches = list(zip(test_user, test_td, test_ld, test_loc, test_dst))
print_score(test_batches, step=3)


## 1.2 Treat Preprocessing

> This function is used by `train_torch.py` to generate `x_user`, `x_td`, `x_ld`, `x_loc` and `x_dst`, where `x` is one of `train`, `valid` or `test`.

In [None]:
def treat_prepro(train, step):
    """Parse a preprocessed file into per-user sequences.

    The file alternates between header lines and data lines. A line with
    fewer than three tab-separated fields is a header whose first field is
    the user id. Any other line is a data line with four fields:
    comma-separated time differences, comma-separated distances,
    comma-separated location ids and one target location id. Blank lines are
    ignored. Data lines that precede the first header are attributed to
    user ``1``.

    ``train`` is the file path. ``step`` is accepted for backward
    compatibility and does not influence the result.

    Returns ``(users, td, ld, loc, dst)``: ``users`` lists the ids of users
    that have at least one data line; the other four lists hold, per user,
    a list of float arrays (``td``, ``ld``), a list of int arrays (``loc``)
    and a list of ints (``dst``).
    """
    with open(train, 'r') as train_f:
        lines = train_f.readlines()

    # Each group is (user, td, ld, loc, dst); a header line opens a new one.
    groups = [(1, [], [], [], [])]
    for line in lines:
        tokens = line.strip().split('\t')
        if tokens == ['']:
            continue
        if len(tokens) < 3:
            groups.append((int(tokens[0]), [], [], [], []))
            continue
        _, user_td, user_ld, user_loc, user_dst = groups[-1]
        user_td.append(np.array([float(t) for t in tokens[0].split(',')]))
        user_ld.append(np.array([float(t) for t in tokens[1].split(',')]))
        user_loc.append(np.array([int(t) for t in tokens[2].split(',')]))
        user_dst.append(int(tokens[3]))

    train_user, train_td, train_ld, train_loc, train_dst = [], [], [], [], []
    for user, user_td, user_ld, user_loc, user_dst in groups:
        # Users without any data line are dropped.
        if user_td:
            train_user.append(user)
            train_td.append(user_td)
            train_ld.append(user_ld)
            train_loc.append(user_loc)
            train_dst.append(user_dst)

    return train_user, train_td, train_ld, train_loc, train_dst

In [None]:
# Gather the records of every eligible user and split each user's sequence
# into train (first 70%), validation (next 10%) and test (the rest).
def _store_user_split(uid, times, lats, lons, locs):
    """Append one user's 70/10/20 split to the train/valid/test lists."""
    train_threshold = int(len(times) * 0.7)
    valid_threshold = int(len(times) * 0.8)
    targets = (
        (slice(None, train_threshold),
         train_user, (train_time, train_lat, train_lon, train_loc)),
        (slice(train_threshold, valid_threshold),
         valid_user, (valid_time, valid_lat, valid_lon, valid_loc)),
        (slice(valid_threshold, None),
         test_user, (test_time, test_lat, test_lon, test_loc)),
    )
    for part, users, buckets in targets:
        users.append(uid)
        for bucket, values in zip(buckets, (times, lats, lons, locs)):
            bucket.append(values[part])


# Dense id of the user whose records are being accumulated (None: nobody yet).
# Only dense ids are compared here, never a raw id with a dense one.
prev_user = None
for line in lines:
    tokens = line.strip().split('\t')
    user = user2id.get(int(tokens[0]))
    # Users below the record threshold have no dense id and are skipped.
    if user is None:
        continue

    # Date as a number of days since 1970-01-01.
    time = (datetime.strptime(tokens[1], "%Y-%m-%d") - datetime(1970, 1, 1)).days
    # Coordinates are parsed to floats so they can be used numerically.
    lat, lon, location = float(tokens[2]), float(tokens[3]), tokens[4]

    # Rename location ids to dense ids in order of first appearance.
    if location not in poi2id:
        poi2id[location] = len(poi2id)
    loc = poi2id[location]

    if user != prev_user:
        # A new user begins: store the finished user's data under that
        # user's own id, then start fresh buffers.
        if prev_user is not None:
            _store_user_split(prev_user, user_time, user_lat, user_lon, user_loc)
        prev_user = user
        user_time, user_lat, user_lon, user_loc = [], [], [], []

    # Prepending reverses the file order of the records within a user.
    user_time.insert(0, time)
    user_lat.insert(0, lat)
    user_lon.insert(0, lon)
    user_loc.insert(0, loc)

# The last eligible user is not followed by another one, so store it here.
# (The earlier check looked up a dense id in `user2id`, which is keyed by raw
# ids.)
if prev_user is not None:
    _store_user_split(prev_user, user_time, user_lat, user_lon, user_loc)

# Shallow copy of the train time lists: the outer list is new, the per-user
# lists inside are shared.
kk = train_time[:]