In [5]:
import pandas as pd
# Load the tab-separated interaction table into a DataFrame
data = pd.read_csv('../Data/sports/sports.inter', sep='\t')

# Preview the first rows to check that the columns were parsed as expected
data.head()


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,0,11981,2.0,1390694400,0
1,0,15852,5.0,1390694400,0
2,0,17787,3.0,1391990400,0
3,0,0,5.0,1390694400,0
4,0,3369,5.0,1405123200,0


In [6]:
# Keep only the interactions whose x_label is 0 or 1 (the rows used for training)
training_data = data.loc[data['x_label'].isin([0, 1])]

# Count how many training interactions each item has; the result has one row
# per item with columns 'itemID' and 'popularity', most frequent item first
item_popularity = (
    training_data['itemID']
    .value_counts()
    .rename_axis('itemID')
    .reset_index(name='popularity')
)

# Re-order the counts by ascending itemID
item_popularity_sorted = item_popularity.sort_values('itemID')

# Preview the first rows of the ordered table
item_popularity_sorted.head()


Unnamed: 0,itemID,popularity
11062,0,6
6784,1,10
17072,2,4
208,3,95
4659,4,14


In [7]:
# Export the table as space-separated text, one row per line,
# without the DataFrame index and without a header row
item_popularity_sorted.to_csv(
    '../Data/sports/item_popularity.txt',
    sep=' ',
    index=False,
    header=False,
)

In [8]:
def write_user_item_lists(interactions, path):
    """Write one text line per user: the user ID followed by that user's item IDs.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Rows to export; must contain 'userID' and 'itemID' columns.
    path : str
        Destination text file (overwritten if it already exists).

    Returns
    -------
    pandas.DataFrame
        One row per user with columns 'userID', 'itemID' (that user's item
        IDs cast to int and joined by single spaces) and 'formatted' (the
        exact line written to ``path``).
    """
    per_user = (
        interactions.groupby('userID')['itemID']
        .apply(lambda items: ' '.join(map(str, map(int, items))))
        .reset_index()
    )
    per_user['formatted'] = per_user['userID'].astype(int).astype(str) + ' ' + per_user['itemID']
    # Each line consists only of integer literals and spaces, so the CSV
    # writer emits it verbatim (nothing in it triggers quoting).
    per_user['formatted'].to_csv(path, header=False, index=False)
    return per_user


# Rows labelled 0 or 1 go to the training file; rows labelled 2 go to the test file
train_data = data[data['x_label'].isin([0, 1])]
test_data = data[data['x_label'] == 2]

# Same export for both splits; only the input rows and output path differ
train_final_numbers = write_user_item_lists(train_data, '../Data/sports/train.txt')
test_final_numbers = write_user_item_lists(test_data, '../Data/sports/test.txt')


In [9]:
import numpy as np

# Read the saved text-feature array from disk
npy_data = np.load('../Data/sports/text_feat.npy')

# Show the array dimensions as the cell output
np.shape(npy_data)

(18357, 384)

In [10]:

# Row layout for the text file: an integer index, then one '%f' per feature column
format_str = '%d ' + ' '.join('%f' for _ in range(npy_data.shape[1]))

# Put each row's position (0, 1, 2, ...) in front of its feature values
indexed_data = np.column_stack((np.arange(len(npy_data)), npy_data))

# Write the indexed matrix out, one formatted row per line
np.savetxt('../Data/sports/itemtitle2vec.txt', indexed_data, fmt=format_str)


In [11]:
# Load the saved image-feature array.
# NOTE: this rebinds `npy_data` (and, below, `format_str` and `indexed_data`),
# which the text-feature export also uses. After this cell has run, re-load
# text_feat.npy before re-running the itemtitle2vec.txt export, otherwise
# image features would be written to that file.
npy_data = np.load('../Data/sports/image_feat.npy')

# Row layout for the text file: an integer index, then one '%f' per feature column
format_str = '%d ' + ' '.join(['%f'] * npy_data.shape[1])

# Prepend each row's position (0, 1, 2, ...) as the first column
indexed_data = np.hstack((np.arange(npy_data.shape[0]).reshape(-1, 1), npy_data))

# Write the indexed matrix out, one formatted row per line
np.savetxt('../Data/sports/item2imgfeat.txt', indexed_data, fmt=format_str)
