In [1]:
import pandas as pd
import numpy as np
import scipy.sparse

def itemize(data):
    """Map each value in ``data`` to a dense integer id.

    Ids are assigned in order of first appearance: the first distinct value
    seen gets 0, the next new value gets 1, and so on.

    Parameters
    ----------
    data : array-like
        Values to encode (e.g. an array of user or item identifiers).

    Returns
    -------
    data_indx : numpy.ndarray of int32, same shape as ``data``
        The id of every element of ``data``.
    n_data : int
        Number of distinct values in ``data``.
    """
    data = np.asarray(data)
    # `first_pos[k]` is where the k-th *sorted* unique value first occurs and
    # `inverse` maps every element to its position in that sorted unique list.
    _, first_pos, inverse = np.unique(data, return_index=True, return_inverse=True)
    n_data = first_pos.shape[0]
    # Translate "position in sorted order" into "rank by first appearance" in
    # one vectorised step instead of a per-element Python dict lookup.
    rank = np.empty(n_data, dtype=np.int32)
    rank[np.argsort(first_pos)] = np.arange(n_data, dtype=np.int32)
    data_indx = rank[inverse].reshape(data.shape)
    return data_indx, n_data

def load_data(path='review.csv'):
    """Load the review table and build a sparse user x item rating matrix.

    Parameters
    ----------
    path : str, optional
        CSV file to read. It must contain the columns ``user_id``,
        ``business_id`` and ``stars``; any other column is ignored.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``uint8`` matrix of shape ``(n_users, n_items)`` whose entry
        ``[u, i]`` is the star rating user ``u`` gave business ``i``.
        Row/column ids are assigned by ``itemize`` in order of first
        appearance in the file.
    """
    # Only these three columns are used; skipping the rest at read time
    # avoids loading them into memory just to drop them again.
    reviews = pd.read_csv(path, usecols=['user_id', 'business_id', 'stars'])
    # csr_matrix *sums* values that share the same (row, col), so a user who
    # rated the same business twice would otherwise end up with an invalid
    # rating. Keep one row per pair; keeping the first occurrence also leaves
    # the first-appearance order (and therefore the itemize ids) unchanged.
    reviews = reviews.drop_duplicates(subset=['user_id', 'business_id'], keep='first')
    rows = np.array(reviews['user_id'])
    cols = np.array(reviews['business_id'])
    stars = np.array(reviews['stars'], dtype=np.uint8)
    # itemize users and items
    row_indx, n_users = itemize(rows)
    col_indx, n_items = itemize(cols)
    return scipy.sparse.csr_matrix((stars, (row_indx, col_indx)), dtype=np.uint8, shape=(n_users, n_items))


# Sparse rating matrix built by load_data(): rows are users, columns are businesses.
R = load_data()

In [16]:
# R.nonzero() gives a (row_indices, col_indices) pair for the non-zero entries of R.
tr_list = R.nonzero()
No_entries = len(tr_list[0])  # number of non-zero (user, business) entries
print (No_entries)

5261667


In [17]:
R.shape  # (n_users, n_items): rows are users, columns are businesses

(1326101, 174567)

In [18]:
import random
import copy

In [19]:
# Fraction of all users / all items that is held out entirely to simulate
# cold start.
COLD_START_FRACTION = 0.025
# Seed the global RNG so this split, and every later `random` draw in the
# notebook, is reproducible under Restart & Run All.
SEED = 0
random.seed(SEED)

# Sample row (user) and column (item) indices of R without replacement.
cold_start_users = random.sample(range(R.shape[0]), int(R.shape[0] * COLD_START_FRACTION))
cold_start_items = random.sample(range(R.shape[1]), int(R.shape[1] * COLD_START_FRACTION))

In [20]:
# Convert the sampled index lists to sets: they are only used for `in`
# membership tests, once per non-zero entry of R, in the split loop.
cold_start_users = set(cold_start_users)
cold_start_items = set(cold_start_items)

In [21]:
# Work on a deep copy: held-out entries are zeroed in R_copy, while R keeps
# the original ratings that are looked up again when the CSV files are written.
R_copy = copy.deepcopy(R)

In [22]:
# Hold-out collections for the cold-start test sets. Despite the "triplet"
# name, the entries appended later are (row, col) index pairs; the rating is
# read from R when the files are written.
cold_start_user_triplet = []
cold_start_item_triplet = []

In [23]:
# Number of non-zero entries in R (same value as No_entries).
print (len(tr_list[0]))
total = len(tr_list[0])  # not referenced again in the visible cells

5261667


In [24]:
# How many users and items were selected as cold-start.
print (len(cold_start_users),len(cold_start_items))

33152 4364


In [25]:
# Start from empty hold-out lists inside this cell, so that re-running it
# does not append every cold-start pair a second time.
cold_start_user_triplet = []
cold_start_item_triplet = []

count = 0
perc = 0
for user_idx, item_idx in zip(tr_list[0], tr_list[1]):
    # Progress marker: prints 0, 1, 2, ... roughly every 500000 entries.
    if count % 500000 == 1:
        print(perc)
        perc += 1
    count += 1

    pair = (user_idx, item_idx)
    is_cold_user = user_idx in cold_start_users
    is_cold_item = item_idx in cold_start_items
    # A pair can belong to both hold-out sets when its user and its item are
    # both cold-start; it is recorded in each list.
    if is_cold_user:
        cold_start_user_triplet.append(pair)
    if is_cold_item:
        cold_start_item_triplet.append(pair)
    # Remove the held-out rating from the working copy with a single write.
    if is_cold_user or is_cold_item:
        R_copy[user_idx, item_idx] = 0

0
1
2
3
4
5
6
7
8
9
10


In [26]:
# Entries that are still non-zero after the cold-start hold-out.
train_warmTest_TMP = R_copy.nonzero()
print(len(train_warmTest_TMP[0]))
train_warmTest = list(zip(train_warmTest_TMP[0], train_warmTest_TMP[1]))

# Draw 200000 of those (row, col) pairs as the warm test set and blank them
# in the working copy.
warm_test = random.sample(train_warmTest, 200000)
for user_idx, item_idx in warm_test:
    R_copy[user_idx, item_idx] = 0

# Whatever is still non-zero in R_copy is the training set.
train = list(zip(*R_copy.nonzero()))

print(len(train))

5005527
4805527


In [27]:
# Report how many (row, col) pairs ended up in each split.
for split_name, split_pairs in (
    ('train', train),
    ('warm_test', warm_test),
    ('cold_item', cold_start_item_triplet),
    ('cold_user', cold_start_user_triplet),
):
    print(split_name, len(split_pairs))

train 4805527
warm_test 200000
cold_item 132384
cold_user 127064


In [37]:
# Write the cold-start-item hold-out set, one line per pair:
# row index, column index, the rating stored in R, and a random integer
# drawn from 1..400000.
with open('test_cold_item.csv', 'w') as out_file:
    for user_idx, item_idx in cold_start_item_triplet:
        rating = R[user_idx, item_idx]
        random_tag = random.randint(1, 400000)
        out_file.write('{},{},{},{}\n'.format(user_idx, item_idx, rating, random_tag))

In [38]:
# Write the cold-start-user hold-out set, one line per pair:
# row index, column index, the rating stored in R, and a random integer
# drawn from 1..400000.
with open('test_cold_user.csv', 'w') as out_file:
    for user_idx, item_idx in cold_start_user_triplet:
        rating = R[user_idx, item_idx]
        random_tag = random.randint(1, 400000)
        out_file.write('{},{},{},{}\n'.format(user_idx, item_idx, rating, random_tag))

In [39]:
# Write the warm test set, one line per pair:
# row index, column index, the rating stored in R, and a random integer
# drawn from 1..400000.
with open('test_warm.csv', 'w') as out_file:
    for user_idx, item_idx in warm_test:
        rating = R[user_idx, item_idx]
        random_tag = random.randint(1, 400000)
        out_file.write('{},{},{},{}\n'.format(user_idx, item_idx, rating, random_tag))

In [40]:
# Write the training set, one line per pair:
# row index, column index, the rating stored in R, and a random integer
# drawn from 1..400000.
with open('train.csv', 'w') as out_file:
    for user_idx, item_idx in train:
        rating = R[user_idx, item_idx]
        random_tag = random.randint(1, 400000)
        out_file.write('{},{},{},{}\n'.format(user_idx, item_idx, rating, random_tag))

In [42]:
# For each test split, write the column (item) index of every held-out pair,
# one per line, in the order the pairs are stored.
for ids_path, pairs in (
    ('test_cold_item_item_ids.csv', cold_start_item_triplet),
    ('test_cold_user_item_ids.csv', cold_start_user_triplet),
    ('test_warm_item_ids.csv', warm_test),
):
    with open(ids_path, 'w') as ids_file:
        ids_file.writelines('{}\n'.format(pair[1]) for pair in pairs)