In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [None]:
# Load the raw interaction log into `df`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer='/home/user/interaction_240113_final.csv')
df

In [32]:
# Number of interaction rows per user (hashed_ip).
user_interaction_counts = df['hashed_ip'].value_counts()
# Users that have at least 3 interactions.
selected_users = user_interaction_counts.loc[user_interaction_counts.ge(3)].index
# Keep only the rows that belong to those users.
df = df.loc[df['hashed_ip'].isin(selected_users)]

In [None]:
# Display the current contents of `df` (when cells run in order, this is the
# frame left by the minimum-interaction filter).
df

In [None]:
# Load the pickled product metadata into `product_info`.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('/home/user/product_info_df.pickle', mode='rb') as pickle_file:
    product_info = pickle.load(pickle_file)
product_info

In [None]:
def _normalize_title(title):
    """Remove quotes/commas, turn parentheses into spaces, lowercase, and collapse whitespace."""
    cleaned = title.replace("'", '').replace(',', '').replace('(', ' ').replace(')', ' ')
    # split() without arguments drops empty tokens, so runs of whitespace collapse to one space.
    return ' '.join(cleaned.lower().split())


# Work on a copy so that product_info keeps the raw titles.
product_data = product_info.copy()
product_data['title'] = product_data['title'].map(_normalize_title)
product_data

## piv를 product_id로 변환하기

In [None]:
# Build the piv -> id lookup table.
# Pairing the two columns directly avoids one label-based .loc lookup per row and
# does not depend on product_info having a 0..n-1 index. As with the row-by-row
# version, a repeated piv keeps the id of its last occurrence.
piv_id_dict = dict(zip(product_info['piv'], product_info['id']))

In [None]:
# Split the rows by whether 'products' holds a piv (contains '-') or not, so that
# only the piv rows have to be mapped (done for speed).
# The mask is computed once; na=False sends missing / non-string values to the
# "not a piv" side instead of leaving NaN in the mask, and regex=False matches
# the literal '-' character.
is_piv = df['products'].str.contains('-', regex=False, na=False)
product_id_df = df[~is_piv]
# .copy() makes piv_df an independent frame, so the assignment below does not
# write into a slice of df (avoids SettingWithCopyWarning).
piv_df = df[is_piv].copy()

# Convert each piv to its product id; a piv missing from piv_id_dict becomes NaN.
piv_df['products'] = piv_df['products'].map(piv_id_dict)

# Put both parts back together and restore the original row order.
data = pd.concat([product_id_df, piv_df], axis=0, ignore_index=False)
data = data.sort_index()
data

In [None]:
# Unique products each hashed_ip interacted with, as {hashed_ip: [product, ...]}.
user_items = {
    user_key: list(set(user_products))
    for user_key, user_products in data.groupby('hashed_ip')['products']
}
user_items

In [None]:
# Sanity check: show rows whose 'products' value still contains '-'.
# na=False counts missing / non-string values as "no match"; without it the mask
# holds NaN for those rows (e.g. pivs that were absent from the lookup) and
# boolean indexing raises.
data[data['products'].str.contains('-', na=False)]

## Creating view document

In [14]:
# Remove the 'local_time' column, then group the remaining rows per user.
data_drop_local_time = data.drop(columns=['local_time'])
grouped = data_drop_local_time.groupby(by='hashed_ip')

In [15]:
# For every user with more than one interaction, split the product sequence into
# "all but the last product" (view_document) and "the last product" (last_view).
view_document = []
last_view = []
for hashed_ip, session in grouped:
    if len(session) <= 1:
        continue
    viewed_products = session['products'].astype(str).to_list()
    view_document.append(viewed_products[:-1])
    last_view.append(viewed_products[-1:])

## Creating dictionary of products

In [None]:
# Lookup table {product id: normalized title}.
dict_products = product_data.set_index('id')['title'].to_dict()
dict_products

In [None]:
# Attach the product title to every interaction row; values of 'products' that
# are not keys of dict_products map to NaN.
# NOTE(review): the keys of dict_products must have the same type as the values
# in data['products'] for the lookup to hit — verify.
data['products_name'] = data.loc[:, 'products'].map(dict_products)
data

In [None]:
# Unique product titles per user, as {hashed_ip: [title, ...]}; each list is later
# used as one LDA document.
# - Rows without a title (NaN from the id -> title mapping) are dropped first, so
#   that only string tokens reach the gensim Dictionary.
# - sorted() gives each document a deterministic token order; the iteration order
#   of a set of strings can change between kernel sessions, which would change
#   the token ids and therefore the trained model.
user_items = (
    data.dropna(subset=['products_name'])
    .groupby('hashed_ip')['products_name']
    .apply(lambda names: sorted(set(names)))
    .to_dict()
)
user_items

In [None]:
# Number of entries in the id -> title lookup.
print(len(dict_products))

## TopicModeling

In [None]:
!pip install gensim
!pip install pyLDAvis

In [18]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [70]:
# One document per user: the token list stored for that user in user_items.
documents = [*user_items.values()]
# Mapping between tokens and integer ids.
dictionary = Dictionary(documents)
# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, documents))

##### optimize num_topics

In [110]:
import gensim
from gensim.models import CoherenceModel

In [None]:
# Train one LDA model per candidate topic count (2..20) and score each with c_v
# coherence; the count with the highest score is reported as optimal.
# Note: this loop rebinds `num_topics` and `lda_model` on every iteration.
topic_range = range(2, 21)
coherence_scores = []
for num_topics in topic_range:
    # A fixed random_state (the same seed the training cell uses) makes the
    # scores, and therefore the selected topic count, reproducible across runs.
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=10)
    coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)
    print(f"Num Topics: {num_topics}, Coherence Score: {coherence_score}")

optimal_num_topics = topic_range[np.argmax(coherence_scores)]
print(f"Optimal Number of Topics: {optimal_num_topics}")

## train

In [112]:
# Train the LDA model with 5 topics and a fixed seed.
num_topics = 5
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42,
)

##### Check details(optional)

In [None]:
# Inspect the topics: print each topic with its 5 top words.
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
# Topic distribution of every user.
# corpus was built from user_items.values(), so iterating user_items (its keys)
# in parallel pairs each user with their own document; this also avoids
# rebuilding the full key list on every iteration.
for user_id, topic_distribution in zip(user_items, lda_model[corpus]):
    print(f"User {user_id}'s topic distribution: {topic_distribution}")

In [None]:
# Topic distribution of every document in corpus (one document per user).
# Each entry is a list of (topic id, probability of belonging to that topic) pairs.
user_embeddings = list(map(lda_model.get_document_topics, corpus))
user_embeddings

In [None]:
# Print the topic distribution of every document in corpus (one document per user).
# Each printed entry: (topic id, probability of belonging to that topic) pairs.
for doc_topics in lda_model.get_document_topics(corpus):
    print(doc_topics)

In [None]:
# Example: position of one specific user among the keys of user_items
# (which is also the position of that user's document in corpus).
user_index = list(user_items).index('000d993b424a2e62dc24078df07d551a')
user_index

In [None]:
# Topic vector of a single user.
# Use the position looked up in the example cell (`user_index`) instead of a
# hard-coded row number, so the lookup stays valid when the set or order of
# users changes.
doc_id = user_index  # position of the user's document in corpus
doc_bow = corpus[doc_id]
# minimum_probability=0 so that low-probability topics are reported as well.
doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)

print(f"Document #{doc_id} Topics:")
for topic, prob in doc_topics:
    print(f"Topic {topic}: {prob}")

## test

In [120]:
# pid로 학습한 경우 사용
# def details_of_related_items(topic_index):
#     print(f"Topic #{topic_index}:")
#     print('-'*50)
#     for word_id, prob in lda_model.get_topic_terms(topic_index, topn=30):
#         print(f"{dictionary[word_id]} {dict_products[dictionary[word_id]]} (확률: {prob:.10f})")

def details_of_related_items(topic_index, topn=30):
    """Print the highest-probability terms of one topic.

    Reads the module-level ``lda_model`` and ``dictionary``.

    Args:
        topic_index: Index of the topic to describe.
        topn: How many terms to request from the model (default 30).

    Returns:
        None. A header, a separator line, and one line per term
        ("<term> (확률: <probability>)") are written to stdout.
    """
    print(f"Topic #{topic_index}:")
    print('-'*50)
    for word_id, prob in lda_model.get_topic_terms(topic_index, topn=topn):
        # dictionary maps the integer word id back to its token.
        print(f"{dictionary[word_id]} (확률: {prob:.10f})")

In [None]:
# Print the top terms of every topic of the trained model.
# The topic count is read from the model itself so the loop stays correct if
# the model is retrained with a different number of topics.
for i in range(lda_model.num_topics):
    print(f"--------------------{i}--------------------")
    details_of_related_items(i)

## Visualization

In [24]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# Prepare the pyLDAvis data for the trained model and render it inline.
vis_data = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis_data)