In [1]:
import pandas as pd
# The interaction file is tab-separated, so parse it with a tab separator.
inter_path = './NineRec/Downstream_datasets/DY/5_core/dy-indexed-v4.inter'
data = pd.read_csv(inter_path, sep='\t')

# Peek at the first rows as a quick sanity check on the parse.
data.head()


Unnamed: 0,userID,itemID,timestamp,x_label
0,0,3,1604569824,0
1,0,5,1637984651,0
2,0,0,1609025879,0
3,0,10,1642899172,0
4,0,17,1600728484,0


In [3]:
# Keep only the interactions whose x_label is 0 or 1 (used as the training
# split in this notebook).
in_training_split = data['x_label'].isin([0, 1])
training_data = data[in_training_split]

# Count how often each itemID occurs in that subset, then expose the counts
# as a two-column frame: itemID, popularity.
item_counts = training_data['itemID'].value_counts()
item_popularity = item_counts.reset_index()
item_popularity.columns = ['itemID', 'popularity']

# Order the rows by itemID; the row labels produced by reset_index are kept.
item_popularity_sorted = item_popularity.sort_values('itemID')

# Show the first rows of the sorted table.
item_popularity_sorted.head()


Unnamed: 0,itemID,popularity
471,0,41
197,1,54
73,2,68
128,3,61
718,4,35


In [5]:
# Write one "<itemID> <popularity>" pair per line: space-separated,
# with neither the row labels nor a header row.
item_popularity_sorted.to_csv(
    './NineRec/Downstream_datasets/DY/5_core/item_popularity.txt',
    sep=' ',
    header=False,
    index=False,
)

In [7]:
def _user_item_lines(interactions):
    """Build one "<userID> <itemID> <itemID> ..." line per user.

    Returns a frame with three columns: userID, itemID (that user's item ids
    cast to int and joined with single spaces) and formatted (the userID,
    cast to int, followed by a space and the joined item ids).
    """
    def join_items(items):
        return ' '.join(str(int(item)) for item in items)

    per_user = interactions.groupby('userID')['itemID'].apply(join_items).reset_index()
    user_as_text = per_user['userID'].astype(int).astype(str)
    per_user['formatted'] = user_as_text + ' ' + per_user['itemID']
    return per_user


# Rows labelled 0 or 1 go to the training file, rows labelled 2 to the test file.
train_data = data[data['x_label'].isin([0, 1])]
test_data = data[data['x_label'] == 2]

# Training split: one line per user, written without header or row labels.
train_final_numbers = _user_item_lines(train_data)
train_final_numbers['formatted'].to_csv(
    './NineRec/Downstream_datasets/DY/5_core/train.txt',
    header=False,
    index=False,
)

# Test split: same layout as the training file.
test_final_numbers = _user_item_lines(test_data)
test_final_numbers['formatted'].to_csv(
    './NineRec/Downstream_datasets/DY/5_core/test.txt',
    header=False,
    index=False,
)


In [9]:
import numpy as np

# Read the array stored in text_feat-v1.npy.
text_feat_path = './NineRec/Downstream_datasets/DY/5_core/text_feat-v1.npy'
npy_data = np.load(text_feat_path)

# Echo the array as the cell output.
npy_data

array([[ 0.01839369, -0.01914624,  0.13961856, ..., -0.09179249,
         0.03171114,  0.02249038],
       [-0.07890213,  0.06225019,  0.08174762, ...,  0.02497742,
         0.04100604, -0.00232789],
       [ 0.02713829, -0.0923062 ,  0.00141066, ..., -0.02901872,
        -0.03519771, -0.05361785],
       ...,
       [ 0.02033167, -0.04825168,  0.08120707, ..., -0.0033165 ,
         0.08071791, -0.04572536],
       [-0.07413378, -0.02312653, -0.00743248, ..., -0.01061078,
         0.00136186,  0.1098616 ],
       [-0.02813684,  0.00770879,  0.10589706, ..., -0.05555856,
         0.03983437, -0.00621293]], dtype=float32)

In [11]:

# Row layout: an integer row index followed by one %f field per column
# of npy_data, all separated by single spaces.
n_rows = npy_data.shape[0]
n_dims = npy_data.shape[1]
format_str = '%d ' + ' '.join('%f' for _ in range(n_dims))

# Prepend the 0-based row number as an extra leading column.
row_ids = np.arange(n_rows)[:, np.newaxis]
indexed_data = np.concatenate((row_ids, npy_data), axis=1)

# Write one text line per row using the row-level format string.
np.savetxt(
    './NineRec/Downstream_datasets/DY/5_core/itemtitle2vec.txt',
    indexed_data,
    fmt=format_str,
)


In [13]:
# Load the array stored in image_feat.npy under its own name. Reusing
# npy_data / format_str / indexed_data here would overwrite the values the
# itemtitle2vec.txt export reads, so re-running that export after this cell
# would silently write these image features into the title file.
image_feat = np.load('./NineRec/Downstream_datasets/DY/5_core/image_feat.npy')

# Row layout: an integer row index followed by one %f field per column
# of image_feat, all separated by single spaces.
image_format_str = '%d ' + ' '.join(['%f'] * image_feat.shape[1])

# Prepend the 0-based row number as an extra leading column.
image_indexed = np.hstack((np.arange(image_feat.shape[0]).reshape(-1, 1), image_feat))

# Write one text line per row of the indexed array.
np.savetxt('./NineRec/Downstream_datasets/DY/5_core/item2imgfeat.txt', image_indexed, fmt=image_format_str)
