In [1]:
import pandas as pd 

df = pd.read_csv(
    'data/sports/Sports_and_Outdoors.csv',
    header=None, 
    names=['item', 'user', 'rating', 'timestamp']
)

In [2]:
df

Unnamed: 0,item,user,rating,timestamp
0,0000031895,A23K73OVXJ04EG,5.0,1391212800
1,0000031895,A2681T699HV6H1,4.0,1384905600
2,0000031895,A374PA18DCGS5Y,1.0,1477008000
3,0000031895,A14PVW2N5YBWSA,5.0,1476748800
4,0000031895,A2KWBC44QI2567,1.0,1476662400
...,...,...,...,...
12980832,B01HJHHBHG,A33DFHRKGPDEF7,3.0,1502323200
12980833,B01HJHHBHG,AT5N4QPWM1GKL,4.0,1501372800
12980834,B01HJHHBHG,A3ONWSRNZFNC3U,5.0,1497139200
12980835,B01HJHHBHG,A1OJNTT9ZTT82A,5.0,1496966400


In [3]:
# Função para verificar se um valor é um inteiro
def is_int(val):
    try:
        int(val)
        return True
    except ValueError:
        return False

# Filtrar o DataFrame
df_filtered = df[df['item'].apply(is_int)]

# Redefinir o índice
df_filtered.reset_index(drop=True, inplace=True)

In [12]:
df_filtered

Unnamed: 0,item,user,rating,timestamp
0,0000031895,A23K73OVXJ04EG,5.0,1391212800
1,0000031895,A2681T699HV6H1,4.0,1384905600
2,0000031895,A374PA18DCGS5Y,1.0,1477008000
3,0000031895,A14PVW2N5YBWSA,5.0,1476748800
4,0000031895,A2KWBC44QI2567,1.0,1476662400
...,...,...,...,...
9055,9879000889,A10HEEVEWGHP6V,2.0,1508457600
9056,0974092304,A2N9SGIVS2MC8B,5.0,1436054400
9057,7301034644,A10U780E3MRD4I,5.0,1424044800
9058,6042354264,A3ONG3PH0ACD6U,4.0,1530576000


In [13]:
from sklearn.model_selection import train_test_split

# Dividir em treino e teste
train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=42)

# Dividir o conjunto de treino em treino e validação
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2


In [14]:
train_df

Unnamed: 0,item,user,rating,timestamp
2175,7245456313,A21968E2Y013LU,5.0,1491350400
7919,7245456313,A2SC45J3OZ7Q3Z,5.0,1311811200
4737,7245456313,ACBO76L6K8I9J,5.0,1434240000
6844,7245456313,A1Q87LJM01V03J,5.0,1381968000
4341,7245456313,ARY9PPQHOROAC,5.0,1442966400
...,...,...,...,...
4305,7245456313,ASVKYLOZ1O9ZG,4.0,1443916800
968,7245456275,A2NG0LCHBEAY63,4.0,1461196800
5724,7245456313,AX2ZX35D0Y3M4,5.0,1416787200
2270,7245456313,A1A5L7HMB7TPB,5.0,1489449600


In [15]:
test_df

Unnamed: 0,item,user,rating,timestamp
5634,7245456313,A137WYL3312W1W,5.0,1419292800
1726,7245456313,AXWPN0I6YGRQW,5.0,1508371200
4404,7245456313,A2SB1T4GO4P2FP,5.0,1441584000
5498,7245456313,A2QGB09310SLD8,5.0,1421712000
5133,7245456313,ACWATJN494ICW,5.0,1427155200
...,...,...,...,...
6048,7245456313,A1Y6HECHC2EAJ1,5.0,1406332800
2215,7245456313,ART1ONF2755PR,5.0,1490572800
1775,7245456313,A14X8WDJYPMBIF,5.0,1504828800
1166,7245456275,A357ZWR87NF25G,5.0,1429920000


In [16]:
val_df

Unnamed: 0,item,user,rating,timestamp
2240,7245456313,A7YG7B7L6PATG,1.0,1489968000
1801,7245456313,A1E4F6E2L4HABA,1.0,1503360000
5571,7245456313,A30XP6ZNCJAMOO,5.0,1420588800
1959,7245456313,A3PYSZAE8L8SQ8,5.0,1497484800
4268,7245456313,AP4WAIEHRH18M,5.0,1444953600
...,...,...,...,...
3588,7245456313,A3OC3Z55JOK9DQ,5.0,1461024000
2773,7245456313,AT6Y66YEVTODT,5.0,1483142400
3884,7245456313,A1V7EHON9QLOLL,5.0,1453852800
8898,9879000706,A3643LMQFZZHV1,4.0,1424131200


In [17]:
train_df.to_csv('./data/sports/train.txt', header=False, index=False, sep=' ')
test_df.to_csv('./data/sports/test.txt', header=False, index=False, sep=' ')
val_df.to_csv('./data/sports/val.txt', header=False, index=False, sep=' ')


In [17]:
def save_to_file(df, filename):
    grouped = df.groupby('user')['item'].apply(list)
    with open(filename, 'w') as f:
        for user, items in grouped.items():
            items_str = ' '.join(map(str, items))
            f.write(f"{user} {items_str}\n")

save_to_file(train_df, './data/sports/train.txt')
save_to_file(test_df, './data/sports/test.txt')
save_to_file(val_df, './data/sports/val.txt')

In [10]:
# Supondo que df seja seu DataFrame original
unique_items = df_filtered['item'].unique()
item_to_int = {item: i for i, item in enumerate(unique_items)}

# Aplica o mapeamento aos DataFrames
train_df['item'] = train_df['item'].map(item_to_int)
test_df['item'] = test_df['item'].map(item_to_int)
val_df['item'] = val_df['item'].map(item_to_int)

# Agora você pode salvar os DataFrames como antes


In [11]:
train_df

Unnamed: 0,item,user,rating,timestamp
2175,,A21968E2Y013LU,5.0,1491350400
7919,,A2SC45J3OZ7Q3Z,5.0,1311811200
4737,,ACBO76L6K8I9J,5.0,1434240000
6844,,A1Q87LJM01V03J,5.0,1381968000
4341,,ARY9PPQHOROAC,5.0,1442966400
...,...,...,...,...
4305,,ASVKYLOZ1O9ZG,4.0,1443916800
968,,A2NG0LCHBEAY63,4.0,1461196800
5724,,AX2ZX35D0Y3M4,5.0,1416787200
2270,,A1A5L7HMB7TPB,5.0,1489449600


In [9]:
def save_to_file(df, filename):
    grouped = df.groupby('user')['item'].apply(list)
    with open(filename, 'w') as f:
        for user, items in grouped.items():
            items_str = ' '.join(map(str, items))
            f.write(f"{user} {items_str}\n")

save_to_file(train_df, './data/sports/train.txt')
save_to_file(test_df, './data/sports/test.txt')
save_to_file(val_df, './data/sports/val.txt')

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Dividir em treino e teste
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Dividir o conjunto de treino em treino e validação
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2

# Suponha que df seja o seu DataFrame existente
# df = pd.read_csv('seu_arquivo.csv')




In [4]:

# Criar a matriz de interação usuário-item
train_interaction = train_df.pivot(index='user', columns='item', values='rating')

# Substituir NaN por 0, se necessário
train_interaction = train_interaction.fillna(0)

print(train_interaction)

  train_interaction = train_df.pivot(index='user', columns='item', values='rating')


In [None]:

# Criar a matriz de interação usuário-item
test_interaction = test_df.pivot(index='user', columns='item', values='rating')

# Substituir NaN por 0, se necessário
test_interaction = test_interaction.fillna(0)

print(test_interaction)

In [None]:

# Criar a matriz de interação usuário-item
val_interaction = val_df.pivot(index='user', columns='item', values='rating')

# Substituir NaN por 0, se necessário
val_interaction = val_interaction.fillna(0)

print(val_interaction)

In [None]:
save_to_file(train_df, './data/sports/train.txt')
save_to_file(test_df, './data/sports/test.txt')
save_to_file(val_df, './data/sports/val.txt')