### 의존성 파일 설치

In [None]:
# !pip install google-cloud-bigquery
# !pip install gspread oauth2client
# !pip install db-dtypes
# !pip install pandas-gbq

In [None]:
from google.oauth2 import service_account
from google.cloud import bigquery

import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

from crwlnt.notiAPI.product import NotiServerRequest

import time
import pickle
import pandas_gbq

## 구글 클라우드 연동 및 BigQuery에서 데이터 불러오기

In [None]:
# Path to the service-account key (JSON) used to authenticate.
SERVICE_ACCOUNT_FILE = "./config/level3-416207-893f91c9529e.json"
project_id = "level3-416207"

# Build the credentials first, then a BigQuery client bound to the project.
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)
client = bigquery.Client(project=project_id, credentials=credentials)

# BigQuery tables are addressed as <project ID>.<dataset ID>.<table ID>,
# e.g. level3-416207.l3_30.l3_30
QUERY = '''
    SELECT hashed_ip, local_time, request_url_endpoint, uri_first
    FROM `level3-416207.log_129.revised_log_129`
    WHERE local_time >= TIMESTAMP '2024-01-29 00:00:00 UTC'
    '''

# Run the query, load the result into a DataFrame and order it by time.
df = client.query(QUERY).to_dataframe().sort_values('local_time')
df

## /spec/

In [None]:
# Keep only the rows whose first URI segment is 'spec'. The explicit .copy()
# makes df_spec an independent frame, so the column added below is not written
# to a slice of df (which triggers SettingWithCopyWarning).
df_spec = df[df['uri_first'] == 'spec'].copy()
# Drop the first 6 characters of the endpoint (presumably the '/spec/' prefix
# -- confirm against the log format) to leave the product identifier.
# Missing endpoints become NaN instead of raising.
df_spec['products'] = df_spec['request_url_endpoint'].str[6:]
df_spec

In [None]:
# Narrow the /spec/ frame down to the four columns shared by every
# interaction source.
df_new_spec = df_spec[['hashed_ip', 'local_time', 'products', 'uri_first']]
df_new_spec

In [None]:
# Print pandas' summary (dtypes, non-null counts) of the /spec/ interactions.
df_new_spec.info()

## /redirect/

In [None]:
# Keep only the rows whose first URI segment is 'redirect'. .copy() makes the
# result an independent frame so the column writes below do not target a slice
# of df.
df_redirect = df[df['uri_first'] == 'redirect'].copy()
# Drop the first 10 characters of the endpoint (presumably the '/redirect/'
# prefix -- confirm against the log format) to leave the query string.
df_redirect['products'] = df_redirect['request_url_endpoint'].str[10:]
# Only query strings that begin with the 'p' parameter are of interest.
df_redirect = df_redirect[df_redirect['products'].str.startswith('?p=', na=False)].copy()
# Pull the value of each of the first three '&'-separated key=value pairs in a
# single pass. A value ends at the next '=' or '&', which matches the previous
# segment.split('=')[1] behaviour.
_redirect_params = df_redirect['products'].str.extract(
    r'^[^&=]*=([^&=]*)[^&]*&[^&=]*=([^&=]*)[^&]*&[^&=]*=([^&=]*)'
)
# Join the three values with '-'. Query strings with fewer than three
# key=value pairs do not match and end up as NaN.
df_redirect['products'] = _redirect_params[0].str.cat(
    [_redirect_params[1], _redirect_params[2]], sep='-'
)
# Malformed rows used to abort the whole cell with IndexError; drop them
# instead, as the root-path ('/') cell already does.
df_redirect = df_redirect[df_redirect['products'].notna()]
df_redirect

In [None]:
# Narrow the /redirect/ frame down to the four columns shared by every
# interaction source.
df_new_redirect = df_redirect[['hashed_ip', 'local_time', 'products', 'uri_first']]
df_new_redirect

In [None]:
# Print pandas' summary (dtypes, non-null counts) of the /redirect/ interactions.
df_new_redirect.info()

## /

In [None]:
# Requests whose first URI segment is itself a query string starting with
# '?p='. .copy() makes df_re an independent frame so the new column is not
# written to a slice of df.
df_re = df[df['uri_first'].str.startswith('?p=', na=False)].copy()

# Pull the value of each of the first three '&'-separated key=value pairs in a
# single vectorised pass (replaces a per-row .loc loop wrapped in a bare
# except). A value ends at the next '=' or '&'.
_root_params = df_re['uri_first'].str.extract(
    r'^[^&=]*=([^&=]*)[^&]*&[^&=]*=([^&=]*)[^&]*&[^&=]*=([^&=]*)'
)
# Join the three values with '-'. Rows with fewer than three key=value pairs
# do not match and are left missing (NaN), so they can be filtered out later.
df_re['products'] = _root_params[0].str.cat(
    [_root_params[1], _root_params[2]], sep='-'
)

df_re

In [None]:
# Keep the rows where a product identifier was parsed, restricted to the
# shared columns. One .loc call plus .copy() gives an independent frame, so
# the column added below is not written to a slice of df_re.
df_new_re = df_re.loc[df_re['products'].notna(), ['hashed_ip', 'local_time', 'products']].copy()
# These root-path requests are labelled as 'redirect' interactions.
df_new_re['uri_first'] = 'redirect'
df_new_re

In [None]:
# Print pandas' summary (dtypes, non-null counts) of the root-path interactions.
df_new_re.info()

## Concat

In [None]:
# Stack the three interaction sources into one frame with a fresh 0..n-1 index.
interaction = pd.concat([df_new_re, df_new_redirect, df_new_spec], axis=0, ignore_index=True)

interaction['products'] = interaction['products'].astype(str)

# Keep only the text before the first '/'. Values without a '/' are unchanged.
# This is the vectorised equivalent of the former per-row .loc loop, which was
# very slow on large logs.
interaction['products'] = interaction['products'].str.split('/', n=1).str[0]

# Discard rows whose identifier ended up empty.
interaction = interaction[interaction['products'] != '']
interaction

## product_id/piv 형태만 남기고 검색되는 product의 interaction만 남기기

In [None]:
# Split the interactions by identifier form:
#   - exactly 64 characters        -> treated as a product_id
#   - contains a '-' separator     -> treated as a piv
# .copy() makes both results independent frames, because later cells overwrite
# their 'products' column and must not write to slices of `interaction`.
interaction_product_id = interaction[interaction['products'].astype(str).str.len() == 64].copy()
interaction_piv = interaction[interaction['products'].str.contains('-', regex=False)].copy()

# Distinct identifiers in first-seen order. Unlike list(set(...)), the order is
# deterministic, so the request batches are reproducible between runs.
product_id_list = interaction_product_id['products'].unique().tolist()
product_piv_list = interaction_piv['products'].unique().tolist()


print(len(product_id_list))
print(len(product_piv_list))
print(len(product_id_list) + len(product_piv_list))

In [None]:
# Cut the product_id / piv list into pieces of the requested size.
def chunk_array(array, chunk_size):
    """Split ``array`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        array: Any sliceable sequence with a length.
        chunk_size: Number of items per slice; the last slice may be shorter.

    Returns:
        A list of slices of ``array``, in the original order.
    """
    pieces = []
    for start in range(0, len(array), chunk_size):
        stop = start + chunk_size
        pieces.append(array[start:stop])
    return pieces


# Send the identifiers to the product API in batches of the requested size.
def send_array_in_chunks(array, chunk_size, type, delay=11):
    """Look up product metadata in batches and collect it into a DataFrame.

    Args:
        array: Sequence of product ids or pivs to look up.
        chunk_size: Maximum number of identifiers sent per request.
        type: ``'ids'`` sends each batch as ``ids=``; any other value sends it
            as ``pivs=``. The name shadows the builtin but is kept so existing
            callers keep working.
        delay: Seconds to sleep after every request. Defaults to 11, the value
            that was previously hard-coded.

    Returns:
        DataFrame with the columns ``id``, ``piv`` and ``title``, holding one
        row for every returned product that provides all three keys.
    """
    columns = ['id', 'piv', 'title']
    # Collect plain rows and build the frame once at the end. Appending with
    # DataFrame.loc[len(df)] re-allocates on every row and gets very slow.
    rows = []
    for chunk in chunk_array(array, chunk_size):
        if type == 'ids':
            res = NotiServerRequest.bulk_product_info(ids=chunk)
        else:
            res = NotiServerRequest.bulk_product_info(pivs=chunk)

        for product in res.json()['data']['products']:
            try:
                rows.append([product[key] for key in columns])
            except (KeyError, TypeError):
                # Best effort: skip entries that lack a field or are not
                # mappings, without hiding unrelated errors.
                continue

        # Running total of collected products, printed as a progress indicator.
        print(len(rows))
        # Sleep after the last batch too, so back-to-back calls of this
        # function stay spaced out.
        time.sleep(delay)
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Send the identifiers in batches of chunk_size; the final, shorter batch holds
# whatever is left over.
chunk_size = 5000

id_list_df = send_array_in_chunks(array=product_id_list, chunk_size=chunk_size, type='ids')
piv_list_df = send_array_in_chunks(array=product_piv_list, chunk_size=chunk_size, type='pivs')

In [None]:
# Remove the product_ids / pivs that the product API did not return.
#   product_*_list : every distinct product_id / piv that appears in the logs
#   *_list_df      : the products the API returned for those identifiers
# The set difference gives the identifiers that are in the logs but were not
# found by the API.
id_error_dict = dict.fromkeys(set(product_id_list) - set(id_list_df['id']))
piv_error_dict = dict.fromkeys(set(product_piv_list) - set(piv_list_df['piv']))

# Filter with a single isin() call per frame. The previous version rebuilt
# list(dict.keys()) for every row, which is quadratic, and overwrote a column
# on a slice. Rows with a missing identifier are dropped as before. .copy()
# keeps the results independent for the cells that modify them later.
interaction_product_id = interaction_product_id[
    interaction_product_id['products'].notna()
    & ~interaction_product_id['products'].isin(list(id_error_dict))
].copy()
interaction_piv = interaction_piv[
    interaction_piv['products'].notna()
    & ~interaction_piv['products'].isin(list(piv_error_dict))
].copy()

## 파생) item.csv 파일 저장 및 불러오기
- id, piv, title 포함

In [None]:
# Apostrophes and commas are removed; parentheses become spaces.
_title_table = str.maketrans({"'": None, ',': None, '(': ' ', ')': ' '})


def _clean_title(raw):
    """Strip punctuation, lower-case and collapse whitespace runs in a title."""
    return ' '.join(raw.translate(_title_table).lower().split())


# Combine the id-based and piv-based lookups, normalise the titles, then drop
# rows that are exact duplicates (keeping the first occurrence).
product_data = pd.concat([id_list_df, piv_list_df], axis=0, ignore_index=True)
product_data['title'] = product_data['title'].map(_clean_title)
product_data = product_data.drop_duplicates(keep='first').reset_index(drop=True)
product_data

In [None]:
# Write the product table (id, piv, title) to item.csv without the index column.
product_data.to_csv('./item.csv', index=False)

## interaction 파일 저장하기

In [None]:
# Do NOT run this cell if the piv values are still needed afterwards!
# Build the piv -> product_id lookup by zipping the two columns. For a repeated
# piv the last row wins, exactly as with the former per-row .loc loop, but
# without one scalar lookup pair per row.
piv_id_dict = dict(zip(product_data['piv'], product_data['id']))

# Replace every piv with its product_id. assign() returns a new frame, so
# nothing is written to a slice of an earlier DataFrame.
interaction_piv = interaction_piv.assign(products=interaction_piv['products'].map(piv_id_dict))

In [None]:
# Build the final interaction table: merge both sources, encode uri_first as
# 1 for 'spec' and 0 for anything else, then order the rows chronologically.
interaction_data = pd.concat([interaction_piv, interaction_product_id], axis=0, ignore_index=True)
interaction_data['uri_first'] = interaction_data['uri_first'].eq('spec').astype('int64')
interaction_data = interaction_data.sort_values('local_time').reset_index(drop=True)

In [None]:
# Write the interaction table to CSV without the index column.
interaction_data.to_csv('./crwlnt/data_csv/interaction/inter_240129.csv', index=False)

## idx

In [None]:
# Number the distinct users and items in first-seen order, then invert each
# mapping to get the reverse lookup.
idx_to_user = dict(enumerate(interaction_data['hashed_ip'].unique()))
user_to_idx = {ip: idx for idx, ip in idx_to_user.items()}
idx_to_item = dict(enumerate(product_data['id'].unique()))
item_to_idx = {pid: idx for idx, pid in idx_to_item.items()}

In [None]:
# Persist the four index mappings, one pickle file per mapping.
for _pickle_name, _mapping in (
    ('user_to_idx.pickle', user_to_idx),
    ('idx_to_user.pickle', idx_to_user),
    ('item_to_idx.pickle', item_to_idx),
    ('idx_to_item.pickle', idx_to_item),
):
    with open(_pickle_name, 'wb') as fw:
        pickle.dump(_mapping, fw)

## BigQuery에 업로드

In [None]:
import pandas_gbq

# Destination of the upload. If the destination does not exist yet, it is
# created.
upload_project_id = "level3-416207"
upload_dataset_id = 'l3_30'
upload_table_id = 'upload_test'

# Upload (currently disabled)
#pandas_gbq.to_gbq(df, destination_table=f'{upload_dataset_id}.{upload_table_id}', project_id=upload_project_id, if_exists='replace', credentials=credentials)

# if_exists parameter:
# 'fail': the upload fails if the table already exists. This is the default.
# 'replace': an existing table is overwritten.
# 'append': the data is appended to the existing table.