In [2]:
%pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/63/78/ed291d95116695b8b5d7469a931d7c2e83d942df0853915ee504cee98bcf/regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Using cached regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (758 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.8.8
Note: you may need to restart the kernel to use updated packages.


In [3]:
import gzip
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import os
import pandas as pd
import pickle
import string
from google.cloud import storage
import io

First, we load the raw review data from a Google Cloud Storage (GCS) bucket.

In [4]:
# Bucket and object path of the gzip-compressed JSON-lines file to load.
bucket_name = 'bf-review-nlp'
blob_name = 'raw_data/Digital_Music.json.gz'

# Build a GCS client, then resolve the bucket and blob handles from it.
# `bucket` and `bucket_name` are reused by the upload cell further down.
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(blob_name)

# Download the compressed payload into memory and wrap it as a file-like object.
json_gz_bytes = blob.download_as_bytes()
json_gz_file = io.BytesIO(json_gz_bytes)

# Decompress while reading; lines=True parses one JSON record per line.
with gzip.GzipFile(fileobj=json_gz_file, mode='rb') as f:
    df = pd.read_json(f, lines=True)


In [6]:
# Drop unnecessary columns for this project.
df = df.drop(columns=["reviewTime", "reviewerID", "asin", "reviewerName", "summary", "unixReviewTime", "vote", "image"])

# Check NAs and column types.
df.info()

# Drop rows with no review text.
# The explicit .copy() makes `df` an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=["reviewText"]).copy()

# Check all scores are values 1-5. A bare `.unique()` in the middle of a cell is neither displayed
# nor verified, so the check is made explicit with an assertion.
unexpected_scores = set(df["overall"].unique()) - {1, 2, 3, 4, 5}
assert not unexpected_scores, f"Unexpected 'overall' values: {unexpected_scores}"
# We convert to categorical because the scores aren't continuous.
df["overall"] = df["overall"].astype('category')

# Check style values.
# We clean up the formatting with regex. This also conveniently converts "nan" strings into proper nans.
df["style"] = df["style"].astype(str)
# Discard the 'format' prefix and any non-alphabetic characters, keep any text following the first alphabetic character up to the '} suffix.
# expand=False returns a Series (rather than a one-column DataFrame) for the single capture group.
df["style"] = df["style"].str.extract(r"{'Format[^a-zA-Z]*([a-zA-Z].*?)'}", expand=False)
# Kept for inspection: counts per style, including rows whose style is missing.
styles = df["style"].value_counts(dropna=False)
# There's a few (small) categories that seem unlikely to be "digital music". Let's drop them.
drop_categories = ["Paperback", "Hardcover", "Kindle Edition", "USB Memory Stick", "Accessory", "Health and Beauty", "Calendar",
                   "Unknown Binding", "Spiral-bound", "Mass Market Paperback", "Kitchen", "Apparel", "Personal Computers",
                   "Office Product", "Grocery", "Unbound", "Audible Audiobook", "Perfect Paperback", "Misc. Supplies", "Home"]
# Rows with a missing style are kept (NaN is not in drop_categories). Copy again so that
# later cells can assign columns on `df` without a SettingWithCopyWarning.
df = df.loc[~df["style"].isin(drop_categories)].copy()

# Don't need to check verified status unique values since we've already seen it's bool.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584082 entries, 0 to 1584081
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   overall     1584082 non-null  int64 
 1   verified    1584082 non-null  bool  
 2   style       1310814 non-null  object
 3   reviewText  1582629 non-null  object
dtypes: bool(1), int64(1), object(2)
memory usage: 37.8+ MB


In [10]:
# Convert review text to lower case.
df["reviewText"] = df["reviewText"].str.lower()
# Remove punctuation.
# The translation table deletes every character in string.punctuation. It is built once here
# instead of being rebuilt for every row inside the lambda.
punctuation_table = str.maketrans('', '', string.punctuation)
df['reviewText'] = df['reviewText'].apply(lambda x: x.translate(punctuation_table))


# Remove stop words.
def remove_stopwords(text):
    """Return `text` with stop words removed.

    The text is split into tokens with `word_tokenize`; every token that is
    found in the notebook-level `stop_words` collection is discarded, and the
    remaining tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# NOTE(review): stopwords.words() reads the NLTK 'stopwords' corpus, which presumably has to be
# downloaded before this runs on a fresh kernel; confirm it is available.
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(remove_stopwords)

# Apply stemming to simplify text further.
# PorterStemmer.stem() stems a single word, so passing a whole review would treat the entire
# string as one token. Each review is split on whitespace (remove_stopwords joined its tokens
# with spaces), every token is stemmed, and the stems are re-joined.
stemmer = PorterStemmer()
df['stemmedText'] = df['reviewText'].apply(
    lambda x: ' '.join(stemmer.stem(word) for word in x.split())
)

In [11]:
# Object path for the processed DataFrame; it is written to the `bucket` resolved in the load cell.
blob_name = 'processed_data/processed_df.pickle'

# Serialise the whole DataFrame to bytes in memory.
pickled_df = pickle.dumps(df)

# Create the destination blob and upload the pickled bytes to it.
blob = bucket.blob(blob_name)
blob.upload_from_string(pickled_df)

print(f'Pickled DataFrame uploaded to {blob_name} in {bucket_name}')

Pickled DataFrame uploaded to processed_data/processed_df.pickle in bf-review-nlp
