In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load and check the dataC:\Users\matte\Tsinghua\BigD\Project\data_format1\data_format1\test_format1.csv

user_logs = pd.read_csv('data_format1/data_format1/user_log_format1.csv')

print(user_logs.head(5))

   user_id  item_id  cat_id  seller_id  brand_id  time_stamp  action_type
0   328862   323294     833       2882    2661.0         829            0
1   328862   844400    1271       2882    2661.0         829            0
2   328862   575153    1271       2882    2661.0         829            0
3   328862   996875    1271       2882    2661.0         829            0
4   328862  1086186    1271       1253    1049.0         829            0


TFIuser_logs encoding
Itfiuser_logs encodes the cat brand seller item of all user log, retains the topN according to the frequency of occurrence, and avoids too high a dimension
Itfiuser_logs encoding is performed on all cat brand items that are interacted with in the store, and the topN items are retained based on the frequency of occurrence to avoid excessive dimensionality.
Itfiuser_logs encoding is performed on the cat brand items that match users and stores in the sample, and the topN items are retained based on the frequency of occurrence to avoid excessive dimensionality.

In [3]:
#data preparation
user_logs['user_id'] = user_logs['user_id'].astype(np.int32)
user_logs['item_id'] = user_logs['item_id'].astype(np.int32)
user_logs['cat_id'] = user_logs['cat_id'].astype(np.int16)
user_logs['seller_id'] = user_logs['seller_id'].astype(np.int16)
user_logs.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
user_logs['brand_id'].fillna(0, inplace=True)
user_logs['brand_id'] = user_logs['brand_id'].astype(np.int16)
user_logs['time_stamp'] = (pd.to_datetime(user_logs['time_stamp'], format='%m%d') - pd.to_datetime(user_logs['time_stamp'].min(), format='%m%d')).dt.days
user_logs['time_stamp'] = user_logs['time_stamp'].astype(np.int16)
user_logs['action_type'] = user_logs['action_type'].astype(np.int8)

In [4]:
print(len(user_logs['brand_id'].unique()))
print(len(user_logs['cat_id'].unique()))
print(len(user_logs['item_id'].unique()))
print(len(user_logs['user_id'].unique()))
print(len(user_logs['merchant_id'].unique()))

8444
1658
1090390
424170
4995


# User

## User-cat

In [21]:
import pandas as pd
import numpy as np
from math import log

user_cat_df = user_logs.copy()
user_cat_df.drop(columns=['action_type', 'time_stamp','merchant_id', 'item_id', 'brand_id'], inplace=True)
# Step 1: Count occurrences of cat_id for each user_id (Numerator of TF)
user_cat_df['cat_count'] = user_cat_df.groupby(['user_id', 'cat_id'])['cat_id'].transform('count')
# Step 2: Calculate total interactions for each user_id (Denominator of TF)
user_cat_df['total_interactions'] = user_cat_df.groupby('user_id')['cat_id'].transform('count')
# Step 3: Calculate TF (Term Frequency)
user_cat_df['tf'] = user_cat_df['cat_count'] / user_cat_df['total_interactions']
user_cat_df = user_cat_df.drop_duplicates(subset=['user_id', 'cat_id'])

# Step 4: Calculate the document frequency (df) for each category (cat_id)
df_cat_df = user_cat_df.groupby('cat_id')['user_id'].nunique().reset_index(name='df')

# Step 5: Calculate IDF (Inverse Document Frequency)
N = user_cat_df['user_id'].nunique()  # Total number of unique users
df_cat_df['idf'] = df_cat_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

# Step 6: Merge IDF back into the original DataFrame
user_cat_df = user_cat_df.merge(df_cat_df[['cat_id', 'idf']], on='cat_id', how='left')

# Step 7: Calculate TF-IDF
user_cat_df['tfidf'] = user_cat_df['tf'] * user_cat_df['idf']

user_cat_df.drop(columns=['cat_count', 'total_interactions', 'tf','idf'], inplace=True)
print(user_cat_df.head(5))

# Set the number of top tfidf values you want per user_id
# Group by user_id and keep the top n TF-IDF values for each user
n= 5
grouped = (
    user_cat_df.groupby('user_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'cat_user.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)

   user_id  cat_id     tfidf
0   328862     833  0.031020
1   328862    1271  0.098857
2   328862    1467  0.030949
3   328862    1095  0.029926
4   328862     602  0.040811
        user_id                                        top_5_tfidf
0             1  [0.8335644548432335, 0.4706547355921492, 0.254...
1             2  [0.37582408192796374, 0.2515851185852035, 0.18...
2             3  [0.6968686505976962, 0.37253301987480675, 0.22...
3             4  [0.3793766233040169, 0.26735187470922145, 0.16...
4             5  [0.27725803294739876, 0.20750181766894002, 0.2...
...         ...                                                ...
424165   424166  [0.6535020467491037, 0.38608635898093446, 0.24...
424166   424167  [0.6398008541301398, 0.35542274500125837, 0.24...
424167   424168  [0.42830890690239937, 0.3201519809124793, 0.12...
424168   424169  [0.38216923438358263, 0.20247118114713147, 0.1...
424169   424170  [0.5933784144461842, 0.431412990270693, 0.2301...

[424170 rows x 2 colu

## User-brand

In [22]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use brand_id
user_brand_df = user_logs.copy()
user_brand_df.drop(columns=['action_type', 'time_stamp', 'merchant_id', 'item_id', 'cat_id'], inplace=True)

# Step 2: Count occurrences of brand_id for each user_id (Numerator of TF)
user_brand_df['brand_count'] = user_brand_df.groupby(['user_id', 'brand_id'])['brand_id'].transform('count')

# Step 3: Calculate total interactions for each user_id (Denominator of TF)
user_brand_df['total_interactions'] = user_brand_df.groupby('user_id')['brand_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_brand_df['tf'] = user_brand_df['brand_count'] / user_brand_df['total_interactions']
user_brand_df = user_brand_df.drop_duplicates(subset=['user_id', 'brand_id'])
print("tf > 1")
print(user_brand_df[user_brand_df['tf'] > 1])
print("="*70)
# Step 5: Calculate the document frequency (df) for each brand_id
df_brand_df = user_brand_df.groupby('brand_id')['user_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_brand_df['user_id'].nunique()  # Total number of unique users
df_brand_df['idf'] = df_brand_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_brand_df[df_brand_df['idf'] > 1])
# Step 7: Merge IDF back into the original DataFrame
user_brand_df = user_brand_df.merge(df_brand_df[['brand_id', 'idf']], on='brand_id', how='left')

# Step 8: Calculate TF-IDF
user_brand_df['tfidf'] = user_brand_df['tf'] * user_brand_df['idf']

# Clean up unnecessary columns
user_brand_df.drop(columns=['brand_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_brand_df.head(5))

# Set the number of top tfidf values you want per user_id
n = 10  # Adjust this value as needed

# Group by user_id and keep the top n TF-IDF values for each user
grouped = (
    user_brand_df.groupby('user_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'brand_user.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [user_id, brand_id, brand_count, total_interactions, tf]
Index: []
idf > 1
      brand_id     df        idf
0            0  40579   2.346859
1            1   1482   5.656067
2            2      3  11.571595
3            3      6  11.011979
4            4      1  12.264742
...        ...    ...        ...
8439      8473   8588   3.899652
8440      8474      7  10.878448
8441      8475     76   8.614084
8442      8476   5682   4.312655
8443      8477    165   7.845902

[8444 rows x 3 columns]
   user_id  brand_id     tfidf
0   328862      2661  0.346613
1   328862      1049  0.011485
2   328862      1647  0.063780
3   328862      4953  0.092827
4   328862      7622  0.118209
        user_id                                       top_10_tfidf
0             1  [2.1093608797482153, 0.8532460580929809, 0.612...
1             2  [1.2985146869992235, 0.9498085204338002, 0.300...
2             3  [0.82840763025819, 0.7725655555138775, 0.36971...
3             4  [

## User-Item

In [23]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use item_id
user_item_df = user_logs.copy()
user_item_df.drop(columns=['action_type', 'time_stamp', 'merchant_id', 'brand_id', 'cat_id'], inplace=True)

# Step 2: Count occurrences of item_id for each user_id (Numerator of TF)
user_item_df['item_count'] = user_item_df.groupby(['user_id', 'item_id'])['item_id'].transform('count')

# Step 3: Calculate total interactions for each user_id (Denominator of TF)
user_item_df['total_interactions'] = user_item_df.groupby('user_id')['item_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_item_df['tf'] = user_item_df['item_count'] / user_item_df['total_interactions']
user_item_df = user_item_df.drop_duplicates(subset=['user_id', 'item_id'])
print("tf > 1")
print(user_item_df[user_item_df['tf'] > 1])
print("="*70)

# Step 5: Calculate the document frequency (df) for each item_id
df_item_df = user_item_df.groupby('item_id')['user_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_item_df['user_id'].nunique()  # Total number of unique users
df_item_df['idf'] = df_item_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_item_df[df_item_df['idf'] > 1])

# Step 7: Merge IDF back into the original DataFrame
user_item_df = user_item_df.merge(df_item_df[['item_id', 'idf']], on='item_id', how='left')

# Step 8: Calculate TF-IDF
user_item_df['tfidf'] = user_item_df['tf'] * user_item_df['idf']

# Clean up unnecessary columns
user_item_df.drop(columns=['item_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_item_df.head(5))

# Set the number of top tfidf values you want per user_id
n = 20  # Adjust this value as needed

# Group by user_id and keep the top n TF-IDF values for each user
grouped = (
    user_item_df.groupby('user_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'item_user.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [user_id, item_id, item_count, total_interactions, tf]
Index: []
idf > 1
         item_id    df        idf
0              1     1  12.264742
1              2  1277   5.804838
2              3    58   8.880352
3              4    21   9.866847
4              5     3  11.571595
...          ...   ...        ...
1090385  1113162    50   9.026064
1090386  1113163    11  10.472983
1090387  1113164     1  12.264742
1090388  1113165     1  12.264742
1090389  1113166    35   9.374371

[1090390 rows x 3 columns]
   user_id  item_id     tfidf
0   328862   323294  0.025430
1   328862   844400  0.027617
2   328862   575153  0.102727
3   328862   996875  0.027085
4   328862  1086186  0.020590
        user_id                                       top_20_tfidf
0             1  [2.938813961148432, 0.8082934134510749, 0.6839...
1             2  [0.48537673766426714, 0.4782818780807995, 0.41...
2             3  [0.6125058033137332, 0.44935150919180367, 0.38...
3          

## User-Seller

In [28]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use merchant_id
user_merchant_df = user_logs.copy()
user_merchant_df.drop(columns=['action_type', 'time_stamp', 'brand_id', 'cat_id', 'item_id'], inplace=True)

# Step 2: Count occurrences of merchant_id for each user_id (Numerator of TF)
user_merchant_df['merchant_count'] = user_merchant_df.groupby(['user_id', 'merchant_id'])['merchant_id'].transform('count')

# Step 3: Calculate total interactions for each user_id (Denominator of TF)
user_merchant_df['total_interactions'] = user_merchant_df.groupby('user_id')['merchant_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_merchant_df['tf'] = user_merchant_df['merchant_count'] / user_merchant_df['total_interactions']
user_merchant_df = user_merchant_df.drop_duplicates(subset=['user_id', 'merchant_id'])
print("tf > 1")
print(user_merchant_df[user_merchant_df['tf'] > 1])
print("="*70)

# Step 5: Calculate the document frequency (df) for each merchant_id
df_merchant_df = user_merchant_df.groupby('merchant_id')['user_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_merchant_df['user_id'].nunique()  # Total number of unique users
df_merchant_df['idf'] = df_merchant_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_merchant_df[df_merchant_df['idf'] > 1])

# Step 7: Merge IDF back into the original DataFrame
user_merchant_df = user_merchant_df.merge(df_merchant_df[['merchant_id', 'idf']], on='merchant_id', how='left')

# Step 8: Calculate TF-IDF
user_merchant_df['tfidf'] = user_merchant_df['tf'] * user_merchant_df['idf']

# Clean up unnecessary columns
user_merchant_df.drop(columns=['merchant_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_merchant_df.head(5))

# Set the number of top tfidf values you want per user_id
n = 5  # Adjust this value as needed

# Group by user_id and keep the top n TF-IDF values for each user
grouped = (
    user_merchant_df.groupby('user_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'merchant_user.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [user_id, merchant_id, merchant_count, total_interactions, tf]
Index: []
idf > 1
      merchant_id     df       idf
0               1  30796  2.622717
1               2    936  6.115206
2               3   1136  5.921741
3               4   1481  5.656742
4               5   3652  4.754586
...           ...    ...       ...
4990         4991    227  7.528544
4991         4992   2570  5.105839
4992         4993   3986  4.667095
4993         4994   2736  5.043272
4994         4995   3017  4.945540

[4995 rows x 3 columns]
   user_id  merchant_id     tfidf
0   328862         2882  0.346839
1   328862         1253  0.034181
2   328862          883  0.063774
3   328862          420  0.085869
4   328862         4605  0.118227
        user_id                                        top_5_tfidf
0             1  [2.109216555057508, 0.8532460580929809, 0.6127...
1             2  [1.2983805393879055, 1.1421873734049512, 0.192...
2             3  [1.0632388105052024,

# Seller

## Seller-brand

In [29]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use brand_id and merchant_id as basis
merchant_brand_df = user_logs.copy()
merchant_brand_df.drop(columns=['action_type', 'time_stamp', 'user_id', 'item_id', 'cat_id'], inplace=True)

# Step 2: Count occurrences of brand_id for each merchant_id (Numerator of TF)
merchant_brand_df['brand_count'] = merchant_brand_df.groupby(['merchant_id', 'brand_id'])['brand_id'].transform('count')

# Step 3: Calculate total interactions for each merchant_id (Denominator of TF)
merchant_brand_df['total_interactions'] = merchant_brand_df.groupby('merchant_id')['brand_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
merchant_brand_df['tf'] = merchant_brand_df['brand_count'] / merchant_brand_df['total_interactions']
merchant_brand_df = merchant_brand_df.drop_duplicates(subset=['merchant_id', 'brand_id'])
print("tf > 1")
print(merchant_brand_df[merchant_brand_df['tf'] > 1])
print("="*70)

# Step 5: Calculate the document frequency (df) for each brand_id
df_brand_df = merchant_brand_df.groupby('brand_id')['merchant_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = merchant_brand_df['merchant_id'].nunique()  # Total number of unique merchants
df_brand_df['idf'] = df_brand_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_brand_df[df_brand_df['idf'] > 1])

# Step 7: Merge IDF back into the original DataFrame
merchant_brand_df = merchant_brand_df.merge(df_brand_df[['brand_id', 'idf']], on='brand_id', how='left')

# Step 8: Calculate TF-IDF
merchant_brand_df['tfidf'] = merchant_brand_df['tf'] * merchant_brand_df['idf']

# Clean up unnecessary columns
merchant_brand_df.drop(columns=['brand_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(merchant_brand_df.head(5))

# Set the number of top tfidf values you want per merchant_id
n = 10  # Adjust this value as needed

# Group by merchant_id and keep the top n TF-IDF values for each merchant
grouped = (
    merchant_brand_df.groupby('merchant_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'merchant_brand.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [merchant_id, brand_id, brand_count, total_interactions, tf]
Index: []
idf > 1
      brand_id  df       idf
1            1   1  7.823046
2            2   1  7.823046
3            3   1  7.823046
4            4   1  7.823046
5            5   1  7.823046
...        ...  ..       ...
8439      8473  12  5.951243
8440      8474   2  7.417580
8441      8475   2  7.417580
8442      8476  13  5.877135
8443      8477   2  7.417580

[8443 rows x 3 columns]
   merchant_id  brand_id     tfidf
0         2882      2661  7.412854
1         1253      1049  1.303665
2          883      1647  7.817731
3          420      4953  4.884490
4         4605      7622  6.717112
      merchant_id                                       top_10_tfidf
0               1  [7.253192437843725, 0.16313197455451023, 6.281...
1               2        [5.857305296983322, 0.00016188842475962936]
2               3         [7.811214817879197, 7.255906523725544e-05]
3               4  [7.77770912

## Seller-Item

In [30]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use merchant_id and item_id
merchant_item_df = user_logs.copy()
merchant_item_df.drop(columns=['action_type', 'time_stamp', 'user_id', 'brand_id', 'cat_id'], inplace=True)

# Step 2: Count occurrences of item_id for each merchant_id (Numerator of TF)
merchant_item_df['item_count'] = merchant_item_df.groupby(['merchant_id', 'item_id'])['item_id'].transform('count')

# Step 3: Calculate total interactions for each merchant_id (Denominator of TF)
merchant_item_df['total_interactions'] = merchant_item_df.groupby('merchant_id')['item_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
merchant_item_df['tf'] = merchant_item_df['item_count'] / merchant_item_df['total_interactions']
merchant_item_df = merchant_item_df.drop_duplicates(subset=['merchant_id', 'item_id'])
print("tf > 1")
print(merchant_item_df[merchant_item_df['tf'] > 1])
print("="*70)

# Step 5: Calculate the document frequency (df) for each item_id
df_item_df = merchant_item_df.groupby('item_id')['merchant_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = merchant_item_df['merchant_id'].nunique()  # Total number of unique merchants
df_item_df['idf'] = df_item_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_item_df[df_item_df['idf'] > 1])

# Step 7: Merge IDF back into the original DataFrame
merchant_item_df = merchant_item_df.merge(df_item_df[['item_id', 'idf']], on='item_id', how='left')

# Step 8: Calculate TF-IDF
merchant_item_df['tfidf'] = merchant_item_df['tf'] * merchant_item_df['idf']

# Clean up unnecessary columns
merchant_item_df.drop(columns=['item_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(merchant_item_df.head(5))

# Set the number of top tfidf values you want per merchant_id
n = 20  # Adjust this value as needed

# Group by merchant_id and keep the top n TF-IDF values for each merchant
grouped = (
    merchant_item_df.groupby('merchant_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'merchant_item.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [item_id, merchant_id, item_count, total_interactions, tf]
Index: []
idf > 1
         item_id  df       idf
0              1   1  7.823046
1              2   1  7.823046
2              3   1  7.823046
3              4   1  7.823046
4              5   1  7.823046
...          ...  ..       ...
1090385  1113162   1  7.823046
1090386  1113163   1  7.823046
1090387  1113164   1  7.823046
1090388  1113165   1  7.823046
1090389  1113166   1  7.823046

[1090390 rows x 3 columns]
   item_id  merchant_id     tfidf
0   323294         2882  0.011632
1   844400         2882  0.002492
2   575153         2882  0.009139
3   996875         2882  0.004985
4  1086186         1253  0.017328
      merchant_id                                       top_20_tfidf
0               1  [0.07393070962205957, 0.06952485454005852, 0.0...
1               2  [1.8279068801474403, 0.5477121698636734, 0.316...
2               3  [2.7181016348470206, 0.4850583983840167, 0.464...
3          

## Seller-Cat

In [31]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use merchant_id and cat_id
merchant_cat_df = user_logs.copy()
merchant_cat_df.drop(columns=['action_type', 'time_stamp', 'user_id', 'brand_id', 'item_id'], inplace=True)

# Step 2: Count occurrences of cat_id for each merchant_id (Numerator of TF)
merchant_cat_df['cat_count'] = merchant_cat_df.groupby(['merchant_id', 'cat_id'])['cat_id'].transform('count')

# Step 3: Calculate total interactions for each merchant_id (Denominator of TF)
merchant_cat_df['total_interactions'] = merchant_cat_df.groupby('merchant_id')['cat_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
merchant_cat_df['tf'] = merchant_cat_df['cat_count'] / merchant_cat_df['total_interactions']
merchant_cat_df = merchant_cat_df.drop_duplicates(subset=['merchant_id', 'cat_id'])
print("tf > 1")
print(merchant_cat_df[merchant_cat_df['tf'] > 1])
print("="*70)

# Step 5: Calculate the document frequency (df) for each cat_id
df_cat_df = merchant_cat_df.groupby('cat_id')['merchant_id'].nunique().reset_index(name='df')

# Step 6: Calculate IDF (Inverse Document Frequency)
N = merchant_cat_df['merchant_id'].nunique()  # Total number of unique merchants
df_cat_df['idf'] = df_cat_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

print("idf > 1")
print(df_cat_df[df_cat_df['idf'] > 1])

# Step 7: Merge IDF back into the original DataFrame
merchant_cat_df = merchant_cat_df.merge(df_cat_df[['cat_id', 'idf']], on='cat_id', how='left')

# Step 8: Calculate TF-IDF
merchant_cat_df['tfidf'] = merchant_cat_df['tf'] * merchant_cat_df['idf']

# Clean up unnecessary columns
merchant_cat_df.drop(columns=['cat_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(merchant_cat_df.head(5))

# Set the number of top tfidf values you want per merchant_id
n = 10  # Adjust this value as needed

# Group by merchant_id and keep the top n TF-IDF values for each merchant
grouped = (
    merchant_cat_df.groupby('merchant_id')['tfidf']
    .apply(lambda x: sorted(x, reverse=True)[:n])  # Sort and select top n
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'merchant_cat.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


tf > 1
Empty DataFrame
Columns: [cat_id, merchant_id, cat_count, total_interactions, tf]
Index: []
idf > 1
      cat_id   df       idf
0          1   19  5.520460
1          2  276  2.892175
2          3   10  6.118297
3          4   15  5.743604
4          5   66  4.311500
...      ...  ...       ...
1653    1667    1  7.823046
1654    1668   69  4.267697
1655    1669    3  7.129898
1656    1670   17  5.625821
1657    1671    9  6.213608

[1658 rows x 3 columns]
   cat_id  merchant_id     tfidf
0     833         2882  0.046024
1    1271         2882  0.535697
2    1271         1253  0.581464
3    1467         2882  0.890357
4    1095          883  0.561572
      merchant_id                                       top_10_tfidf
0               1  [0.5800430706645079, 0.5065708162286712, 0.318...
1               2  [0.952426598072902, 0.8042220890388974, 0.2525...
2               3  [2.1449311506830266, 0.23788666660482732, 0.04...
3               4  [2.6104258954595196, 0.5939524089374005

# User-Merchant Profile

In [5]:
# Filter
user_merchant_df = user_logs.copy()
user_merchant_df.drop(columns=['action_type', 'time_stamp'], inplace=True)
sampled_df = pd.read_csv('train_df_sampled_X.csv')
sampled_df['user_id'] = sampled_df['user_id'].astype(np.int32)
sampled_df['merchant_id'] = sampled_df['merchant_id'].astype(np.int16)
user_merchant_df = user_merchant_df.merge(sampled_df, on=['user_id', 'merchant_id'], how='inner')
print(len(user_merchant_df))
print(len(user_logs))

1134844
54925330


In [6]:
print(len(sampled_df))

104345


## User-Merchant Cat

In [8]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use user_id - merchant_id pairs as the basis for cat_id
user_merchant_cat_df = user_merchant_df.copy()
user_merchant_cat_df.drop(columns=['brand_id', 'item_id'], inplace=True)

# Step 2: Count occurrences of cat_id for each user_id - merchant_id pair (Numerator of TF)
user_merchant_cat_df['cat_count'] = user_merchant_cat_df.groupby(['user_id', 'merchant_id', 'cat_id'])['cat_id'].transform('count')

# Step 3: Calculate total interactions for each user_id - merchant_id pair (Denominator of TF)
user_merchant_cat_df['total_interactions'] = user_merchant_cat_df.groupby(['user_id', 'merchant_id'])['cat_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_merchant_cat_df['tf'] = user_merchant_cat_df['cat_count'] / user_merchant_cat_df['total_interactions']
user_merchant_cat_df = user_merchant_cat_df.drop_duplicates(subset=['user_id', 'merchant_id', 'cat_id'])

# Step 5: Calculate the document frequency (df) for each cat_id
# Count unique user_id - merchant_id pairs for each cat_id
df_cat_df = (
    user_merchant_cat_df.groupby('cat_id')[['user_id', 'merchant_id']]
    .nunique()
    .reset_index()
)
df_cat_df['df'] = df_cat_df[['user_id', 'merchant_id']].min(axis=1)

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_merchant_cat_df.groupby(['user_id', 'merchant_id']).ngroups  # Total unique user_id - merchant_id pairs
df_cat_df['idf'] = df_cat_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

# Step 7: Merge IDF back into the original DataFrame
user_merchant_cat_df = user_merchant_cat_df.merge(df_cat_df[['cat_id', 'idf']], on='cat_id', how='left')

# Step 8: Calculate TF-IDF
user_merchant_cat_df['tfidf'] = user_merchant_cat_df['tf'] * user_merchant_cat_df['idf']

# Clean up unnecessary columns
user_merchant_cat_df.drop(columns=['cat_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_merchant_cat_df.head(5))

# Set the number of top tfidf values you want per user_id - merchant_id pair
n = 5  # Adjust this value as needed

# Group by user_id - merchant_id and keep the top n TF-IDF values for each pair
grouped = (
    user_merchant_cat_df.groupby(['user_id', 'merchant_id'])['tfidf']
    .apply(lambda x: x.nlargest(n).tolist())  # Use nlargest for efficiency
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'user_merchant_cat.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


   user_id  cat_id  merchant_id     tfidf
0   356311    1129         3205  7.794258
1   153790     898         1346  8.000110
2    26516     602         2403  7.726817
3   422265    1074         1273  8.722245
4   255390     611         4585  5.376644
        user_id  merchant_id   
0             1         1019  \
1             6         1356   
2            14          361   
3            16         1435   
4            17         1115   
...         ...          ...   
104340   424155         1394   
104341   424157          798   
104342   424163         3826   
104343   424164          606   
104344   424167         1200   

                                              top_5_tfidf  
0                                     [7.022858502535675]  
1                                      [8.78286927344915]  
2                                     [6.819259547294435]  
3                                      [6.71127090923034]  
4       [3.6699608402064086, 2.9171112474590464, 0.338...  
...

## User-Merchant Brand

In [9]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use user_id - merchant_id pairs as the basis for brand_id
user_merchant_brand_df = user_merchant_df.copy()
user_merchant_brand_df.drop(columns=['cat_id', 'item_id'], inplace=True)

# Step 2: Count occurrences of brand_id for each user_id - merchant_id pair (Numerator of TF)
user_merchant_brand_df['brand_count'] = user_merchant_brand_df.groupby(['user_id', 'merchant_id', 'brand_id'])['brand_id'].transform('count')

# Step 3: Calculate total interactions for each user_id - merchant_id pair (Denominator of TF)
user_merchant_brand_df['total_interactions'] = user_merchant_brand_df.groupby(['user_id', 'merchant_id'])['brand_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_merchant_brand_df['tf'] = user_merchant_brand_df['brand_count'] / user_merchant_brand_df['total_interactions']
user_merchant_brand_df = user_merchant_brand_df.drop_duplicates(subset=['user_id', 'merchant_id', 'brand_id'])

# Step 5: Calculate the document frequency (df) for each brand_id
# Count unique user_id - merchant_id pairs for each brand_id
df_brand_df = (
    user_merchant_brand_df.groupby('brand_id')[['user_id', 'merchant_id']]
    .nunique()
    .reset_index()
)
df_brand_df['df'] = df_brand_df[['user_id', 'merchant_id']].min(axis=1)

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_merchant_brand_df.groupby(['user_id', 'merchant_id']).ngroups  # Total unique user_id - merchant_id pairs
df_brand_df['idf'] = df_brand_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

# Step 7: Merge IDF back into the original DataFrame
user_merchant_brand_df = user_merchant_brand_df.merge(df_brand_df[['brand_id', 'idf']], on='brand_id', how='left')

# Step 8: Calculate TF-IDF
user_merchant_brand_df['tfidf'] = user_merchant_brand_df['tf'] * user_merchant_brand_df['idf']

# Clean up unnecessary columns
user_merchant_brand_df.drop(columns=['brand_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_merchant_brand_df.head(5))

# Set the number of top tfidf values you want per user_id - merchant_id pair
n = 5  # Adjust this value as needed

# Group by user_id - merchant_id and keep the top n TF-IDF values for each pair
grouped = (
    user_merchant_brand_df.groupby(['user_id', 'merchant_id'])['tfidf']
    .apply(lambda x: x.nlargest(n).tolist())  # Use nlargest for efficiency
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'user_merchant_brand.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


   user_id  merchant_id  brand_id      tfidf
0   356311         3205      2270  10.862311
1   153790         1346      7995   9.157563
2    26516         2403      2373   4.344924
3    26516         2403      8417   6.517386
4   422265         1273      2753  10.862311
        user_id  merchant_id           top_5_tfidf
0             1         1019  [10.862310815128986]
1             6         1356   [10.16916363456904]
2            14          361  [10.862310815128986]
3            16         1435  [10.862310815128986]
4            17         1115  [10.862310815128986]
...         ...          ...                   ...
104340   424155         1394   [8.916400666073672]
104341   424157          798  [10.862310815128986]
104342   424163         3826  [10.456845707020822]
104343   424164          606  [10.862310815128986]
104344   424167         1200    [8.29736145766745]

[104345 rows x 3 columns]


## User-Merchant Item

In [10]:
import pandas as pd
import numpy as np
from math import log

# Step 1: Modify to use user_id - merchant_id pairs as the basis for item_id
user_merchant_item_df = user_merchant_df.copy()
user_merchant_item_df.drop(columns=['cat_id', 'brand_id'], inplace=True)

# Step 2: Count occurrences of item_id for each user_id - merchant_id pair (Numerator of TF)
user_merchant_item_df['item_count'] = user_merchant_item_df.groupby(['user_id', 'merchant_id', 'item_id'])['item_id'].transform('count')

# Step 3: Calculate total interactions for each user_id - merchant_id pair (Denominator of TF)
user_merchant_item_df['total_interactions'] = user_merchant_item_df.groupby(['user_id', 'merchant_id'])['item_id'].transform('count')

# Step 4: Calculate TF (Term Frequency)
user_merchant_item_df['tf'] = user_merchant_item_df['item_count'] / user_merchant_item_df['total_interactions']
user_merchant_item_df = user_merchant_item_df.drop_duplicates(subset=['user_id', 'merchant_id', 'item_id'])

# Step 5: Calculate the document frequency (df) for each item_id
# Count unique user_id - merchant_id pairs for each item_id
df_item_df = (
    user_merchant_item_df.groupby('item_id')[['user_id', 'merchant_id']]
    .nunique()
    .reset_index()
)
df_item_df['df'] = df_item_df[['user_id', 'merchant_id']].min(axis=1)

# Step 6: Calculate IDF (Inverse Document Frequency)
N = user_merchant_item_df.groupby(['user_id', 'merchant_id']).ngroups  # Total unique user_id - merchant_id pairs
df_item_df['idf'] = df_item_df['df'].apply(lambda x: log(N / (x + 1)))  # IDF formula

# Step 7: Merge IDF back into the original DataFrame
user_merchant_item_df = user_merchant_item_df.merge(df_item_df[['item_id', 'idf']], on='item_id', how='left')

# Step 8: Calculate TF-IDF
user_merchant_item_df['tfidf'] = user_merchant_item_df['tf'] * user_merchant_item_df['idf']

# Clean up unnecessary columns
user_merchant_item_df.drop(columns=['item_count', 'total_interactions', 'tf', 'idf'], inplace=True)

# Display the first few rows of the transformed DataFrame
print(user_merchant_item_df.head(5))

# Set the number of top tfidf values you want per user_id - merchant_id pair
n = 5  # Adjust this value as needed

# Group by user_id - merchant_id and keep the top n TF-IDF values for each pair
grouped = (
    user_merchant_item_df.groupby(['user_id', 'merchant_id'])['tfidf']
    .apply(lambda x: x.nlargest(n).tolist())  # Use nlargest for efficiency
    .reset_index(name=f'top_{n}_tfidf')  # Reset index and rename column
)

# Save the output if needed
grouped.to_csv(f'user_merchant_item.csv', index=False)  # Optional: save to a file

# Display the result
print(grouped)


   user_id  item_id  merchant_id     tfidf
0   356311   758374         3205  4.023078
1   356311   413046         3205  0.804616
2   356311   113205         3205  1.609231
3   356311   998103         3205  2.011539
4   356311   830436         3205  1.206923
        user_id  merchant_id   
0             1         1019  \
1             6         1356   
2            14          361   
3            16         1435   
4            17         1115   
...         ...          ...   
104340   424155         1394   
104341   424157          798   
104342   424163         3826   
104343   424164          606   
104344   424167         1200   

                                              top_5_tfidf  
0                                    [10.862310815128986]  
1                                    [10.862310815128986]  
2       [1.9749656027507247, 1.9749656027507247, 1.481...  
3       [3.9499312055014495, 2.9624484041260866, 0.987...  
4       [5.974270948320942, 3.2586932445386956, 0.5431...