In [1]:
import numpy as np
import pandas as pd
import json
import gzip
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


# 'C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Software_5.json'
#C:\Users\emovi\Desktop\VibeCaster\VibeCaster\data\Software_5.json(1).gz

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
## Load the dataset into a DataFrame as described on the website: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/#subsets
def parse(path):
  """Yield one decoded JSON document per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document per line.

  Yields:
    The object returned by ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted or closed early; the handle used to stay open indefinitely.
  with gzip.open(path, 'rb') as handle:
    for raw_line in handle:
      # Whitespace-only lines (e.g. a trailing newline) carry no record.
      if not raw_line.strip():
        continue
      yield json.loads(raw_line)

def getDF(path):
  """Collect every record produced by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the file; those positions
  become the row index and the record fields become the columns.

  Args:
    path: File location handed unchanged to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')
# Smaller alternative dataset (ca. 77,000 entries), kept here for quick experiments:
# df = getDF('C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Industrial_and_Scientific_5.json.gz')
# Much larger dataset (ca. 500,000 entries) that the rest of the notebook works on.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere unless the file is placed at the same location.
df = getDF('C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Video_Games_5.json.gz')

In [3]:
 # summary statistics
# Dimensions of the loaded frame as (rows, columns).
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
# Descriptive statistics for the numeric columns.
print(df.describe())

(497577, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         497577 non-null  float64
 1   verified        497577 non-null  bool   
 2   reviewTime      497577 non-null  object 
 3   reviewerID      497577 non-null  object 
 4   asin            497577 non-null  object 
 5   reviewerName    497501 non-null  object 
 6   reviewText      497419 non-null  object 
 7   summary         497468 non-null  object 
 8   unixReviewTime  497577 non-null  int64  
 9   vote            107793 non-null  object 
 10  style           289237 non-null  object 
 11  image           3634 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 46.0+ MB
None
             overall  unixReviewTime
count  497577.000000    4.975770e+05
mean        4.220456    1.367848e+09
std         1.185424    1.224113e+08
min         

In [4]:
# Count the missing values in each column.
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
reviewerName          76
reviewText           158
summary              109
unixReviewTime         0
vote              389784
style             208340
image             493943
dtype: int64


In [5]:
# Delete entries that are missing either the review text or the rating;
# the frame is modified in place.
df.dropna(subset=['reviewText', 'overall'], inplace=True)

In [6]:
# Reduce the frame to the rating and the review text; every other column is
# removed in place.
columns_to_keep = ['overall', 'reviewText']

all_columns = list(df.columns)

# Everything that is not explicitly kept gets dropped.
columns_to_drop = list(filter(lambda name: name not in columns_to_keep, all_columns))

df.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Descriptive statistics of the numeric columns that remain in the frame.
print(df.describe())

             overall
count  497419.000000
mean        4.220297
std         1.185491
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000


In [8]:
# Derive the training target from the star rating: above 3 is 'positive',
# exactly 3 is 'neutral', anything else is 'negative'.
def _rating_to_sentiment(rating):
    """Translate a single star rating into its sentiment label."""
    if rating > 3:
        return 'positive'
    if rating == 3:
        return 'neutral'
    return 'negative'

df['sentiment'] = df['overall'].map(_rating_to_sentiment)
# Show how many reviews fall into each sentiment class
print(df['sentiment'].value_counts())


sentiment
positive    393267
negative     55012
neutral      49140
Name: count, dtype: int64


In [9]:
## The dataset is heavily skewed toward the positive class (see the sentiment counts printed above).
# NOTE(review): cross-validation by itself does not correct class imbalance;
# a stratified split and/or class weights may be what is needed here — confirm the intended approach.
# Only reviewText and sentiment are needed to start training, so drop the 'overall' column (in place).
df.drop(columns="overall", inplace=True)


In [10]:
# Using the BERT tokenizer to preprocess the text.
# The function `tokenize_reviews` encodes the text into token IDs, adds special tokens ([CLS], [SEP]), and pads the sequence to a maximum length of 512 tokens.
# It also generates an attention mask to differentiate real tokens from [PAD] tokens.
# The resulting 'data' column in the DataFrame will contain the token IDs and attention masks for each review.
 
from transformers import BertTokenizer
import torch


# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint;
# tokenize_reviews below reads this module-level object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_reviews(text):
    """Encode one review with the module-level BERT ``tokenizer``.

    The text is truncated and padded to exactly 512 tokens, special tokens are
    added, and the result is requested as PyTorch tensors together with an
    attention mask.

    Args:
        text: The review text to encode.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options.
    """
    encoding_options = {
        'add_special_tokens': True,
        'max_length': 512,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    return tokenizer.encode_plus(text, **encoding_options)

# Apply tokenization to the reviews: each row of 'data' holds the encoding
# returned by tokenize_reviews for that row's reviewText.
# NOTE(review): this keeps one max-length encoding per review in memory at once,
# which may be heavy for the full dataset — confirm it fits.
df['data'] = df['reviewText'].apply(tokenize_reviews)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Split the data into training and test sets (80% / 20%) so performance can be measured later.
from sklearn.model_selection import train_test_split

# random_state is fixed so the split is the same on every run.
# NOTE(review): the split is not stratified on 'sentiment' even though the
# classes are imbalanced — confirm whether stratify= should be passed.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [31]:
def pad_tensor(dict_item, max_len=512):
    """Right-pad ``input_ids`` and ``attention_mask`` with zeros to ``max_len`` columns.

    Earlier revisions read ``max_len`` from a global that this notebook never
    assigns, and built the padding with ``torch.zeros`` (float32), which made
    ``torch.cat`` promote the integer token ids to floats. The length is now
    an explicit argument and the padding inherits each tensor's dtype/device.

    Args:
        dict_item: Mapping with 2-D tensors under ``'input_ids'`` and
            ``'attention_mask'`` (rows x sequence length). Modified in place.
        max_len: Target number of columns. Defaults to 512, the ``max_length``
            that ``tokenize_reviews`` already pads and truncates to.

    Returns:
        The same ``dict_item`` object, with both tensors ``max_len`` columns wide.

    Raises:
        ValueError: If a tensor already has more than ``max_len`` columns.
    """
    for key in ('input_ids', 'attention_mask'):
        tensor = dict_item[key]
        missing = max_len - tensor.shape[1]
        if missing < 0:
            raise ValueError(
                f"'{key}' has {tensor.shape[1]} columns, which exceeds max_len={max_len}"
            )
        if missing:
            # new_zeros keeps dtype and device, so token ids stay integers.
            filler = tensor.new_zeros((tensor.shape[0], missing))
            dict_item[key] = torch.cat([tensor, filler], dim=1)
    return dict_item

# Pad every training encoding with pad_tensor.
train['data'] = train['data'].apply(pad_tensor)
# The test encodings are padded by the same function, so both splits end up
# with sequences of the same length.
# Kept for reference (disabled): deriving a length from the longest test sequence.
# max_len = max(item['input_ids'].shape[1] for item in test['data'])

# Apply padding to test data
test['data'] = test['data'].apply(pad_tensor)


In [32]:
# Creating PyTorch tensors for training and testing data.
# Concatenating input_ids and attention_masks for each dataset to form tensors.
# Also converting sentiment labels to integer form and forming corresponding tensors.
# This prepares the data in the format required for BERT model training and evaluation.

import torch

# Integer class id assigned to each sentiment label.
sentiment_to_id = {'positive': 2, 'neutral': 1, 'negative': 0}

# Training split: stack the per-review tensors along the first dimension.
train_encodings = train['data']
train_data = torch.cat([enc['input_ids'] for enc in train_encodings], dim=0)
train_attention_masks = torch.cat([enc['attention_mask'] for enc in train_encodings], dim=0)
train_labels = torch.tensor(train['sentiment'].map(sentiment_to_id).values)

# Test split: same layout as the training tensors.
test_encodings = test['data']
test_data = torch.cat([enc['input_ids'] for enc in test_encodings], dim=0)
test_attention_masks = torch.cat([enc['attention_mask'] for enc in test_encodings], dim=0)
test_labels = torch.tensor(test['sentiment'].map(sentiment_to_id).values)
