DATA PREPROCESSING

- loaded the dataset and performed random sampling
- preprocessed the dataset: cleaned the text, removed unnecessary columns, handled missing values, and created sentiment labels
- tokenized the cleaned text with the BERT tokenizer

In [None]:
!pip install transformers
!pip install torch

In [None]:
#imports
import pandas as pd
import random
import re

from transformers import BertTokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

reviews_path = '/content/drive/My Drive/yelp_academic_dataset_review.json'

#random sampling
desired_sample_size = 30000
approximate_dataset_size = 6000000  #estimate
sampling_fraction = desired_sample_size / approximate_dataset_size
random_seed = 42  #fixed seed so a fresh Restart & Run All draws the same sample

chunk_size = 100000

#sample every chunk independently and collect the pieces in a list;
#concatenating once at the end avoids re-copying a growing frame on each iteration
sampled_chunks = []
chunk_reader = pd.read_json(reviews_path, lines=True, chunksize=chunk_size)
for chunk_number, chunk in enumerate(chunk_reader):
    #offset the seed per chunk so that chunks do not all pick the same row positions
    sampled_chunks.append(
        chunk.sample(frac=sampling_fraction, random_state=random_seed + chunk_number)
    )

#pd.concat raises on an empty list, so fall back to an empty frame
sampled_df = pd.concat(sampled_chunks) if sampled_chunks else pd.DataFrame()

#the dataset size above is only an estimate, so the per-chunk pass may return
#fewer rows than requested; cap n to avoid the ValueError sample() raises
#when asked for more rows than exist
final_sample_size = min(desired_sample_size, len(sampled_df))
reviews_df = sampled_df.sample(n=final_sample_size, random_state=random_seed)


In [None]:
# Quick sanity check: (rows, columns) of the sampled reviews frame.
sampled_shape = reviews_df.shape
print(sampled_shape)

In [None]:
#choose relevant columns and drop missing values
#the trailing .copy() yields an independent frame, so the column assignments
#below do not raise SettingWithCopyWarning
reviews_df = reviews_df[['text', 'stars']].dropna().copy()

#lowercase the text, then strip unnecessary information with vectorized
#string methods (same patterns, applied in the same order):
#  1. HTML-like tags   2. URLs   3. everything except letters and whitespace
reviews_df['text'] = (
    reviews_df['text']
    .str.lower()
    .str.replace(r'<.*?>', '', regex=True)
    .str.replace(r'http\S+', '', regex=True)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)

#convert ratings to sentiment labels: more than 3 stars -> 1, otherwise 0
reviews_df['sentiment'] = (reviews_df['stars'] > 3).astype('int64')

reviews_df.head()

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 512

def tokenize_map(sentence):
    """Encode a single review text with the module-level BERT tokenizer.

    The text is truncated or padded to exactly ``max_length`` tokens and the
    special tokens are added.

    Args:
        sentence: The review text to encode.

    Returns:
        The tokenizer's encoding object; its 'input_ids' and 'attention_mask'
        entries are PyTorch tensors (requested via ``return_tensors='pt'``).
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus();
    # for a single string it produces the same encoding.
    return tokenizer(sentence, max_length=max_length, truncation=True,
                     padding='max_length', add_special_tokens=True,
                     return_tensors='pt')

# Apply the tokenizer to the text (one encoding object per review)
reviews_df['tokens'] = reviews_df['text'].apply(tokenize_map)


In [None]:
# Split the per-review encodings into the pieces the model consumes.
token_column = reviews_df['tokens']

# Target labels derived from the star ratings.
labels = reviews_df['sentiment']

# One entry per review, pulled out of each encoding by key.
input_ids = token_column.map(lambda encoding: encoding['input_ids'])
attention_masks = token_column.map(lambda encoding: encoding['attention_mask'])
