## (변성윤 마스터님) 구글 빅쿼리 명령어(쿼리) 참고자료: https://zzsza.github.io/bigquery/guide.html

### 의존성 파일 설치

In [None]:
# !pip install google-cloud-bigquery
# !pip install gspread oauth2client
# !pip install db-dtypes
# !pip install pandas-gbq

## 구글 클라우드 연동 및 BigQuery에서 데이터 불러오기

In [None]:
from google.oauth2 import service_account
from google.cloud import bigquery

# Service-account key (JSON) used to authenticate against Google Cloud,
# and the project the BigQuery client is bound to.
SERVICE_ACCOUNT_FILE = "./config/level3-416207-893f91c9529e.json"
project_id = "level3-416207"

# Build the credentials first, then the BigQuery client that uses them.
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE
)
client = bigquery.Client(project=project_id, credentials=credentials)

# BigQuery tables are addressed as <project ID>.<dataset ID>.<table ID>,
# e.g. level3-416207.l3_30.l3_30
QUERY = '''
    SELECT hashed_ip, local_time, request_url_endpoint, uri_first
    FROM `level3-416207.log_129.revised_log_129`
    WHERE local_time >= TIMESTAMP '2024-01-13 00:00:00 UTC'
    '''

# Submit the query and load the complete result set into a DataFrame.
query_job = client.query(QUERY)
df = query_job.to_dataframe()
df

## /products/

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Keep only the log rows whose first URI segment is 'products'.
is_products_request = df['uri_first'].eq('products')
df_products = df[is_products_request]

In [None]:
# Derive the product identifier by dropping the first 10 characters of the
# endpoint (10 == len('/products/'); assumes every endpoint in this subset
# starts with that prefix -- TODO confirm against the log data).
# `assign` returns a new frame, so the column is never written into a slice
# of `df` (which is what triggers pandas' SettingWithCopyWarning), and the
# vectorised `.str[10:]` passes missing endpoints through instead of raising.
df_products = df_products.assign(
    products=df_products['request_url_endpoint'].str[10:]
)

# Drop the requests whose remainder starts with '?path='.  `na=True` marks
# rows without an endpoint as matches so that they are dropped as well.
df_products = df_products[
    ~df_products['products'].str.startswith('?path=', na=True)
]

In [None]:
# Display the /products/ rows for a visual check.
df_products

In [None]:
# Reduce the /products/ rows to the three columns kept for every source:
# who (hashed_ip), when (local_time) and which product (products).
interaction_columns = ['hashed_ip', 'local_time', 'products']
df_new_products = df_products[interaction_columns]
df_new_products

In [None]:
# Print column dtypes and non-null counts of the /products/ interactions.
df_new_products.info()

## /spec/

In [None]:
# Keep only the log rows whose first URI segment is 'spec'.
is_spec_request = df['uri_first'].eq('spec')
df_spec = df[is_spec_request]

In [None]:
# Derive the product identifier by dropping the first 6 characters of the
# endpoint (6 == len('/spec/'); assumes every endpoint in this subset starts
# with that prefix -- TODO confirm against the log data).
# `assign` returns a new frame, so the column is never written into a slice
# of `df` (which is what triggers pandas' SettingWithCopyWarning), and the
# vectorised `.str[6:]` passes missing endpoints through instead of raising.
df_spec = df_spec.assign(products=df_spec['request_url_endpoint'].str[6:])

In [None]:
# Reduce the /spec/ rows to the three columns kept for every source:
# who (hashed_ip), when (local_time) and which product (products).
interaction_columns = ['hashed_ip', 'local_time', 'products']
df_new_spec = df_spec[interaction_columns]
df_new_spec

In [None]:
# Print column dtypes and non-null counts of the /spec/ interactions.
df_new_spec.info()

## /redirect/

In [None]:
# Keep only the log rows whose first URI segment is 'redirect'.
is_redirect_request = df['uri_first'].eq('redirect')
df_redirect = df[is_redirect_request]
df_redirect

In [None]:
def _product_key_from_query(query):
    """Join the values of the first three '&'-separated parameters with '-'.

    For '?p=1&a=2&b=3' this returns '1-2-3'.  Returns None when the input is
    not a string, has fewer than three parameters, or one of the first three
    parameters has no '=' in it.
    """
    if not isinstance(query, str):
        return None
    segments = query.split('&')
    if len(segments) < 3:
        return None
    try:
        return '-'.join(segment.split('=')[1] for segment in segments[:3])
    except IndexError:
        return None


# Drop the first 10 characters of the endpoint (10 == len('/redirect/');
# assumes every endpoint in this subset starts with that prefix -- TODO
# confirm).  `assign` returns a new frame each time, so nothing is written
# into a slice of another frame (avoids pandas' SettingWithCopyWarning).
df_redirect = df_redirect.assign(
    products=df_redirect['request_url_endpoint'].str[10:]
)

# Only redirects carrying a '?p=' query are kept; rows with a missing
# endpoint are treated as non-matching.
df_redirect = df_redirect[
    df_redirect['products'].str.startswith('?p=', na=False)
]

# Replace the raw query with the '<v1>-<v2>-<v3>' key.  Queries that cannot
# be parsed are dropped instead of aborting the whole cell with IndexError.
df_redirect = df_redirect.assign(
    products=df_redirect['products'].map(_product_key_from_query)
)
df_redirect = df_redirect.dropna(subset=['products'])
df_redirect

In [None]:
# Reduce the /redirect/ rows to the three columns kept for every source:
# who (hashed_ip), when (local_time) and which product (products).
interaction_columns = ['hashed_ip', 'local_time', 'products']
df_new_redirect = df_redirect[interaction_columns]
df_new_redirect

In [None]:
# Print column dtypes and non-null counts of the /redirect/ interactions.
df_new_redirect.info()

## /

In [None]:
# Keep the log rows whose first URI segment is itself a '?p=' query string;
# rows with a missing uri_first are treated as non-matching.
has_product_query = df['uri_first'].str.startswith('?p=', na=False)
df_re = df[has_product_query]

In [None]:
def _product_key_from_query(query):
    """Join the values of the first three '&'-separated parameters with '-'.

    For '?p=1&a=2&b=3' this returns '1-2-3'.  Returns None when the input is
    not a string, has fewer than three parameters, or one of the first three
    parameters has no '=' in it.
    """
    if not isinstance(query, str):
        return None
    segments = query.split('&')
    if len(segments) < 3:
        return None
    try:
        return '-'.join(segment.split('=')[1] for segment in segments[:3])
    except IndexError:
        return None


# Build the '<v1>-<v2>-<v3>' key for every row in one pass.  Rows whose query
# cannot be parsed get None, which keeps the best-effort behaviour but only
# tolerates the expected parsing failure instead of swallowing every
# exception.  `assign` returns a new frame, so nothing is written into a
# slice of `df` (avoids pandas' SettingWithCopyWarning).
df_re = df_re.assign(products=df_re['uri_first'].map(_product_key_from_query))

df_re

In [None]:
# Keep the rows for which a product key exists and reduce them to the three
# interaction columns in a single .loc selection.
has_product_key = df_re['products'].notna()
df_new_re = df_re.loc[has_product_key, ['hashed_ip', 'local_time', 'products']]
df_new_re

In [None]:
# Print column dtypes and non-null counts of the '?p=' interactions.
df_new_re.info()

## Concat

In [None]:
# Stack the four per-source interaction frames row-wise and renumber the
# index from 0 so that labels from the source frames do not collide.
interaction_frames = [df_new_products, df_new_re, df_new_redirect, df_new_spec]
product = pd.concat(interaction_frames, ignore_index=True)
product

In [None]:
# Normalise the product column to text and keep only the part before the
# first '/'.  Splitting a value that has no '/' yields the value itself, so
# one vectorised split covers both cases and replaces the per-row .loc
# read/write loop (and its progress bar), which is far slower on a large
# log extract.
product = product.assign(
    products=product['products'].astype(str).str.split('/', n=1).str[0]
)

# Rows left with an empty identifier carry no product information.
product = product[product['products'] != '']
product

## product_id, piv 형태만 남기기

In [None]:
# Two identifier shapes are kept:
#   * "piv": the identifier is exactly 64 characters long
#   * "pid": the identifier contains a '-'
product_key = product['products']
is_piv = product_key.astype(str).str.len() == 64
# '-' is matched literally; missing values count as "no match" so the mask
# stays strictly boolean.
is_pid = product_key.str.contains('-', regex=False, na=False)

product_piv = product[is_piv]
product_pid = product[is_pid]

# Select with one combined mask rather than concatenating the two subsets:
# a row that is 64 characters long AND contains '-' would otherwise be
# present in both subsets and end up duplicated in the result.
product = product[is_piv | is_pid].reset_index(drop=True)
product = product.sort_values('local_time')
product.shape

In [None]:
# Write the interaction table to CSV; the index is not written because it
# only reflects row position.
interaction_csv_path = './crwlnt/data_csv/interaction/interaction_240113.csv'
product.to_csv(interaction_csv_path, index=False)

## BigQuery에 업로드

In [None]:
import pandas_gbq

# 업로드할 데이터 경로 설정/ 없는 경로로 설정해주면 새로 생성해줌
# Destination of the upload.  Per the original author's note, pointing this
# at a path that does not exist yet makes the upload create it.
upload_project_id = "level3-416207" 
upload_dataset_id = 'l3_30'
upload_table_id = 'upload_test'

# Upload (left disabled; uncomment to actually write to BigQuery)
#pandas_gbq.to_gbq(df, destination_table=f'{upload_dataset_id}.{upload_table_id}', project_id=upload_project_id, if_exists='replace', credentials=credentials)

# '''
# The if_exists parameter
# 'fail': the upload fails when the table already exists. This is the default.
# 'replace': an already existing table is overwritten.
# 'append': the data is appended to an already existing table.
# ''' 