모델 학습을 위한 데이터셋을 생성

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; every
# dataset path used below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers &> /dev/null
from transformers import BertTokenizer
from transformers import AutoTokenizer

import csv
import gzip
import json
import pickle
import numpy as np

In [None]:
def preprocess_amazon_from_json(json_filename = 'AMAZON_FASHION_5.json', root = '/content/drive/MyDrive/CS470_team_2in1'):
  """Collect review text, rating and product id from a JSON-lines file.

  Reads ``<root>/dataset/<json_filename>`` line by line and parses every
  non-blank line as one JSON object. Records that lack ``reviewText``,
  ``overall`` or ``asin``, or whose ``overall`` cannot be converted to an
  int, are skipped.

  Args:
    json_filename: File name inside the ``dataset`` directory.
    root: Directory that contains the ``dataset`` directory.

  Returns:
    A tuple ``(review_texts, ratings, products)`` of three parallel lists:
    the ``reviewText`` values, ``overall`` converted with ``int()``, and the
    ``asin`` values.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    OSError: If the file cannot be opened.
  """
  review_texts = []
  ratings = []
  products = []

  path = root+'/'+'dataset'+'/'+json_filename
  # Explicit encoding so decoding does not depend on the platform default.
  with open(path, "r", encoding="utf-8") as json_file:
    for line in json_file:
      # A blank line (e.g. a trailing newline) is not a record.
      if not line.strip():
        continue

      info = json.loads(line)

      # Only swallow the errors an incomplete/ill-typed record can cause;
      # a bare except would also hide KeyboardInterrupt and real bugs.
      try:
        review_text = info["reviewText"]
        rating = int(info["overall"])
        product = info["asin"]
      except (KeyError, TypeError, ValueError, OverflowError):
        continue

      review_texts.append(review_text)
      ratings.append(rating)
      products.append(product)

  print(f'collected {len(review_texts)} datas')
  return review_texts, ratings, products

In [None]:
def preprocess_amazon_from_gzip(json_filename = 'Books_5.json.gz', data_num=None, start_idx=0, root = '/content/drive/MyDrive/CS470_team_2in1'):
  """Collect review text, rating and product id from a gzipped JSON-lines file.

  Streams ``<root>/dataset/<json_filename>`` line by line, so the archive is
  never fully decompressed into memory. Records that lack ``reviewText``,
  ``overall`` or ``asin``, or whose ``overall`` cannot be converted to an
  int, are skipped and do not count towards ``data_num``.

  Args:
    json_filename: File name inside the ``dataset`` directory.
    data_num: Stop once this many records have been collected. ``None``
      (or 0) means no limit.
    start_idx: 1-based number of the first line to read; lines before it are
      skipped without being parsed. ``0`` and ``1`` both start at the first
      line.
    root: Directory that contains the ``dataset`` directory.

  Returns:
    A tuple ``(review_texts, ratings, products)`` of three parallel lists.

  Raises:
    json.JSONDecodeError: If a non-blank line that is read is not valid JSON.
    OSError: If the file cannot be opened or decompressed.
  """
  review_texts = []
  ratings = []
  products = []

  path = root+'/'+'dataset'+'/'+json_filename
  with gzip.open(path, "rb") as f:
    # Lines are numbered from 1, matching the original counter semantics.
    for idx, line in enumerate(f, start=1):
      if start_idx > idx:
        continue
      # A blank line (e.g. a trailing newline) is not a record.
      if not line.strip():
        continue

      info = json.loads(line)

      # Only swallow the errors an incomplete/ill-typed record can cause;
      # a bare except would also hide KeyboardInterrupt and real bugs.
      try:
        review_text = info["reviewText"]
        rating = int(info["overall"])
        product = info["asin"]
      except (KeyError, TypeError, ValueError, OverflowError):
        continue

      review_texts.append(review_text)
      ratings.append(rating)
      products.append(product)

      if data_num and len(review_texts) == data_num:
        break
      if len(review_texts) % 100000 == 0:
        print(f"processed {len(review_texts)} datas...")

  print(f'collected {len(review_texts)} datas')
  return review_texts, ratings, products

In [None]:
def generate_tokenized_dataset(review_texts, ratings, products, tokenizer=None, max_length=512):
  """Tokenize reviews and bundle them with their ratings and product ids.

  Every review is encoded with ``tokenizer.encode_plus`` using special
  tokens, truncation and padding to ``max_length``, requesting PyTorch
  tensors (``return_tensors='pt'``).

  Args:
    review_texts: Sequence of review strings.
    ratings: Sequence of ratings, parallel to ``review_texts``.
    products: Sequence of product ids, parallel to ``review_texts``.
    tokenizer: Object exposing ``encode_plus``. When ``None`` the
      ``bert-base-uncased`` ``BertTokenizer`` is loaded.
    max_length: Length every encoded review is truncated/padded to.

  Returns:
    An object-dtype ``numpy`` array of shape ``(4, len(review_texts))`` whose
    rows are, in order: the ``input_ids`` entries, the ``attention_mask``
    entries, the ratings and the product ids. The first two rows hold
    whatever the tokenizer returned (presumably ``(1, max_length)`` tensors
    for the default tokenizer - confirm against the consumer).

  Raises:
    ValueError: If the three input sequences differ in length.
  """
  if not (len(review_texts) == len(ratings) == len(products)):
    raise ValueError(
        "review_texts, ratings and products must have the same length, got "
        f"{len(review_texts)}, {len(ratings)} and {len(products)}"
    )

  if tokenizer is None:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  input_ids = []
  attention_masks = []

  for sent in review_texts:
      encoded_dict = tokenizer.encode_plus(
                          sent,
                          add_special_tokens = True,
                          max_length = max_length,
                          truncation = True,
                          # Replaces the deprecated pad_to_max_length=True.
                          padding = 'max_length',
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )

      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])

      if len(input_ids)%100000==0: print(f"tonkenized {len(input_ids)} datas...")

  # The four columns mix tensors, ints and strings. Letting np.array infer a
  # shape from them relies on implicit ragged-array creation, which NumPy
  # 1.24+ rejects with ValueError, so the object array is built explicitly
  # and filled cell by cell (per-cell assignment stores each object as-is).
  columns = (input_ids, attention_masks, ratings, products)
  dataset = np.empty((len(columns), len(input_ids)), dtype=object)
  for row, column in enumerate(columns):
    for col, value in enumerate(column):
      dataset[row, col] = value

  print("dataset is gernerated")
  return dataset


In [None]:
# Build the Books dataset: take the first 500,000 usable reviews from the
# gzipped dump, tokenize them, and save the resulting array to Drive.
book_samples = preprocess_amazon_from_gzip('Books_5.json.gz', data_num=500000)
review_texts, ratings, products = book_samples
dataset = generate_tokenized_dataset(review_texts, ratings, products)
books_output_path = "/content/drive/MyDrive/CS470_team_2in1/dataset"+"/"+"amazon_book_only500000.npy"
np.save(books_output_path, dataset)

In [None]:
# Build the rationale dataset: read review text, rating and product id from
# the preprocessed CSV, tokenize the reviews, and save the array to Drive.

# Column order used to locate the values read below (presumably the CSV's
# header row - confirm against the file).
fields = ['index', 'sentence index', 'reviewText', 'overall', 'productID', 'positive rationale', 'negative rationale', 'positive rationale encoded', 'negative rationale encoded']
text_col = fields.index('reviewText')
rating_col = fields.index('overall')
product_col = fields.index('productID')

reviewText = []
overall = []
productId = []

# newline='' is what the csv module requires so that line breaks inside
# quoted fields (likely in free-form review text) are parsed correctly;
# the explicit encoding avoids depending on the platform default.
with open("/content/drive/MyDrive/CS470_team_2in1/dataset/"+'preprocessed_balanced_encoded.csv', 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader, None)  # skip the header row (no-op on an empty file)

    for row in csvreader:
        reviewText.append(row[text_col])
        overall.append(int(row[rating_col]))
        productId.append(row[product_col])

dataset = generate_tokenized_dataset(reviewText, overall, productId)
np.save("/content/drive/MyDrive/CS470_team_2in1/dataset"+"/"+"rationale_preprocessed.npy", dataset)