In [1]:
import nltk
import pandas as pd
import string
import tqdm

from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yuheng_Lu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load raw data

In [2]:
reviews_df = pd.read_json("../data/raw/goodreads_reviews_spoiler.json.gz", compression='infer', lines=True)

In [3]:
reviews_df.shape

(1378033, 7)

In [4]:
reviews_df = reviews_df.iloc[:400000] # the full dataset is too large, we take around 1/3 of the original dataset

## Preprocess raw data

In [5]:
for index, review in tqdm.tqdm(reviews_df.iterrows(), total=reviews_df.shape[0]):
    processed_sentences = []
    for sentence in review.review_sentences:
        processed_text = ''
        for word in sentence[1].split():
            clean_word = word.lower().translate(str.maketrans('', '', string.punctuation))
            clean_word = clean_word.replace('“', '').replace('”', '').replace('’', '').replace('…', '').replace('—', '')
            if clean_word not in stopwords.words('english'):
                stemmed_word = nltk.PorterStemmer().stem(clean_word)
                if stemmed_word:
                    processed_text += stemmed_word + ' '
        processed_sentences.append([sentence[0], processed_text.strip()])
    reviews_df.at[index, 'review_sentences'] = processed_sentences

100%|████████████████████████████████████████████████████| 400000/400000 [2:01:44<00:00, 54.76it/s]


## Data transformation

In [6]:
processed_sentences, spoiler_tags = [], []
for index, review in tqdm.tqdm(reviews_df.iterrows(), total=reviews_df.shape[0]):
    for i in range(len(review.review_sentences)):
        processed_sentences.append(review.review_sentences[i][1])
        spoiler_tags.append(review.review_sentences[i][0])
review_context_df = pd.DataFrame({
    "Processed_Sentence": processed_sentences,
    "Is_Spoiler": spoiler_tags
})
review_context_df.head()

100%|████████████████████████████████████████████████████| 400000/400000 [00:55<00:00, 7253.31it/s]


Unnamed: 0,Processed_Sentence,Is_Spoiler
0,special book,0
1,start slow first third middl third start get i...,0
2,love good scienc fiction push think thing go,0
3,2015 hugo winner translat origin chines made i...,0
4,instanc intermix chines revolutionari histori ...,0


## Save Processed Data

In [7]:
review_context_df.to_json("../data/processed/review_context_final.json.gz", orient="records", lines=True, compression="infer")

In [2]:
test = pd.read_json("../data/raw/goodreads_reviews_spoiler.json.gz", compression='infer', lines=True)

In [3]:
test.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id
0,8842281e1d1347389f2ab93d60773d4d,2017-08-30,"[[0, This is a special book.], [0, It started ...",5,True,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb
1,8842281e1d1347389f2ab93d60773d4d,2017-03-22,"[[0, Recommended by Don Katz.], [0, Avail for ...",3,False,16981,a5d2c3628987712d0e05c4f90798eb67
2,8842281e1d1347389f2ab93d60773d4d,2017-03-20,"[[0, A fun, fast paced science fiction thrille...",3,True,28684704,2ede853b14dc4583f96cf5d120af636f
3,8842281e1d1347389f2ab93d60773d4d,2016-11-09,"[[0, Recommended reading to understand what is...",0,False,27161156,ced5675e55cd9d38a524743f5c40996e
4,8842281e1d1347389f2ab93d60773d4d,2016-04-25,"[[0, I really enjoyed this book, and there is ...",4,True,25884323,332732725863131279a8e345b63ac33e
