# Data Cleaning

In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import string
import spacy
import itertools
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
import time
import scipy.sparse
from gensim import matutils,models
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import time
import numpy as np
import multiprocess as mp
import warnings
warnings.filterwarnings("ignore")

<a id="section-two"></a>
# Preprocessing and cleaning


## Dataset

**Description of columns in the file:**
* product_name - name of book + author
* title - title of book review
* body - text of the review
* rating - rating of the book review
* verified_purchase - did the reviewer buy the book or not?
* review_date - the date of the review

In [4]:
# Load the scraped Amazon book reviews; the path is relative to the notebook's working directory
raw_reviews = pd.read_csv('../data/scraped_amz_reviews.csv')

In [5]:
# Summarize column dtypes and non-null counts of the raw data
raw_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384233 entries, 0 to 384232
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   product_name       384233 non-null  object
 1   title              384213 non-null  object
 2   body               384126 non-null  object
 3   rating             384233 non-null  object
 4   verified_purchase  384233 non-null  bool  
 5   review_date        384233 non-null  object
dtypes: bool(1), object(5)
memory usage: 15.0+ MB


In [6]:
# Preview the first rows of the raw data
raw_reviews.head()

Unnamed: 0,product_name,title,body,rating,verified_purchase,review_date
0,Ace Spades Faridah Abike Iyimide,"An Engrossing Page Turner About Race, Class an...","Granted, I've never been a fan of Gossip Girl ...",4.0 out of 5 stars,True,"June 6, 2021"
1,Ace Spades Faridah Abike Iyimide,An Interesting Debut,"“Hello, Niveus High. It’s me. Who am I? That’s...",4.0 out of 5 stars,True,"June 14, 2021"
2,Ace Spades Faridah Abike Iyimide,Wow,I ordered this book for my teenage daughter an...,5.0 out of 5 stars,True,"September 12, 2022"
3,Ace Spades Faridah Abike Iyimide,Definitely a YA novel,"Great plot, childish characterizations (althou...",4.0 out of 5 stars,True,"August 10, 2022"
4,Ace Spades Faridah Abike Iyimide,A most timely book.,This a very engrossing story. I was intrigued...,5.0 out of 5 stars,True,"July 5, 2021"


In [7]:
# Work on an independent (deep) copy so that raw_reviews itself is never modified
clean_data = raw_reviews.copy(deep=True)

## Check for Duplicates and NaNs
**Let's check for duplicated Amazon.com reviews.**

In [8]:
# Discard rows that exactly duplicate an earlier row (all columns compared)
clean_data = clean_data.drop_duplicates()

#### About the data

- We have a total of 380,639 reviews after removing duplicates.
- We have 3,594 duplicated reviews.

## Concatenate the title and body (keep text column) and clean review date column

In [9]:
# Keep only the text and date columns; .copy() makes the column assignments
# below act on an independent frame rather than on a slice.
clean_data = clean_data[['title', 'body', 'review_date']].copy()

# Fill missing titles/bodies with '' before concatenating: NaN + str is NaN,
# which astype(str) would turn into the literal text "nan" and thereby discard
# whichever half of the review is actually present.
review_title = clean_data['title'].fillna('').astype(str)
review_body = clean_data['body'].fillna('').astype(str)
# strip() removes the dangling separator left when one of the halves is empty
clean_data['reviews'] = (review_title + " " + review_body).str.strip()

clean_data = clean_data.drop(['title', 'body'], axis=1)
# Dates arrive as text such as "June 6, 2021"; parse them into timestamps
clean_data['review_date'] = clean_data['review_date'].apply(pd.to_datetime)
clean_data.head()

Unnamed: 0,review_date,reviews
0,2021-06-06,"An Engrossing Page Turner About Race, Class an..."
1,2021-06-14,"An Interesting Debut “Hello, Niveus High. It’s..."
2,2022-09-12,Wow I ordered this book for my teenage daughte...
3,2022-08-10,"Definitely a YA novel Great plot, childish cha..."
4,2021-07-05,A most timely book. This a very engrossing sto...


In [10]:
# Save the reviews (parsed date + combined raw text) before any text cleaning is applied
clean_data.to_pickle('clean_data_original.pkl')

## Data Cleaning

- There are a few cleaning steps that we will perform on our reviews before moving forward:
  1. Lower-case the text
  2. Remove numbers
  3. Remove extra white-spaces(if any)
  4. Remove Punctuation
  5. Remove Stop-words
  6. Lemmatize all words (I prefer lemmatizing instead of stemming - in my final topics, I need coherent words, and not just random words)  
  
* For stop-words, we will use an iteratively refined list: we begin with an extra-long list of stop-words from ranks.nl (around 600 words) and keep adding domain-specific terms as we encounter them while building the initial topic models.

* We will also create and consider bi-grams and tri-grams in our model to get the best possible set of topics.

In [11]:
# 1. Lower case (every value is passed through str() first, then lower-cased)
clean_data['pre_process'] = clean_data['reviews'].map(str).str.lower()

In [12]:
# 2. Remove numbers (every run of digits is deleted)
clean_data['pre_process'] = clean_data['pre_process'].str.replace(r'\d+', '', regex=True)

In [13]:
# 3. Remove extra spaces: collapse runs of spaces, then trim both ends
clean_data['pre_process'] = (
    clean_data['pre_process']
    .str.replace(' +', ' ', regex=True)
    .str.strip()
)

In [14]:
# 4. Remove punctuation (the characters listed in string.punctuation)
punctuation_table = str.maketrans('', '', string.punctuation)
clean_data['pre_process'] = clean_data['pre_process'].str.translate(punctuation_table)

In [15]:
# Expand contractions
# Either apostrophe form may appear in a review: typographic (’) or ASCII (').
_APOSTROPHE = "['’]"

# Ordered (pattern, replacement) rules. Irregular forms come first because the
# generic "n't" rule would otherwise turn "won't" into "wo not". Suffix rules
# require a word character before the apostrophe so that quoted words such as
# 'so' or 'done' are not mistaken for contractions.
_CONTRACTION_RULES = [
    (re.compile(r"\bwon" + _APOSTROPHE + r"t\b", re.IGNORECASE), "will not"),
    (re.compile(r"\bshan" + _APOSTROPHE + r"t\b", re.IGNORECASE), "shall not"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"d\b", re.IGNORECASE), " would"),
    (re.compile(r"\bcan" + _APOSTROPHE + r"t\b", re.IGNORECASE), "can not"),
    (re.compile(r"n" + _APOSTROPHE + r"t\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"re\b", re.IGNORECASE), " are"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"s\b", re.IGNORECASE), " is"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ll\b", re.IGNORECASE), " will"),
    # Catches malformed negations that lost their "n", e.g. "would’t"
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"t\b", re.IGNORECASE), " not"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"ve\b", re.IGNORECASE), " have"),
    (re.compile(r"(?<=\w)" + _APOSTROPHE + r"m\b", re.IGNORECASE), " am"),
]


def contractions(s):
    """Expand common English contractions in ``s``.

    Matching is case-insensitive and accepts both the typographic (’) and the
    ASCII (') apostrophe. Replacements are always lower-case, so "Won’t"
    becomes "will not". Note that "’s" is always expanded to " is", which also
    rewrites possessives ("author’s" -> "author is").

    Parameters
    ----------
    s : str
        Text of a single review.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        s = pattern.sub(replacement, s)
    return s

# NOTE(review): this reads the raw 'reviews' column, not 'pre_process', so whatever
# 'pre_process' held before this line is replaced by the contraction-expanded raw text.
clean_data['pre_process'] = clean_data['reviews'].map(contractions)

In [17]:
## Remove any remaining non-alpha characters
def _strip_non_letters(review_text):
    """Tokenize the text, delete every character outside A-Z/a-z from each token, and rejoin with spaces."""
    letters_only = []
    for token in nltk.word_tokenize(review_text):
        # Tokens made up solely of non-letters become '' here and are still joined
        letters_only.append(re.sub('[^A-Za-z]+', '', token))
    return " ".join(letters_only)

clean_data['pre_process'] = clean_data['pre_process'].apply(_strip_non_letters)

## Lowercase, remove any html tags, remove straggler characters
def clean_reviews(text):
    '''
    Normalize the text of a single review.

    Steps, in order: lowercase, remove text in square brackets, remove links,
    remove HTML tags, remove punctuation, replace newlines with a space, and
    remove words containing numbers.

    Parameters
    ----------
    text : object
        The review; it is converted with str() first, so non-string values
        are accepted.

    Returns
    -------
    str
        The cleaned text. Removed pieces can leave consecutive spaces behind;
        whitespace is not collapsed here.
    '''
    # Raw-string patterns keep sequences such as \[ \S \w from being treated
    # as (invalid) string escapes by Python.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so the last word of one line is not glued
    # to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

# Run the review-level cleaner over every row of the working text column
clean_data['pre_process'] = clean_data['pre_process'].map(clean_reviews)

In [18]:
# Preview the cleaned text next to the original reviews
clean_data.head()

Unnamed: 0,review_date,reviews,pre_process
0,2021-06-06,"An Engrossing Page Turner About Race, Class an...",an engrossing page turner about race class and...
1,2021-06-14,"An Interesting Debut “Hello, Niveus High. It’s...",an interesting debut hello niveus high it is m...
2,2022-09-12,Wow I ordered this book for my teenage daughte...,wow i ordered this book for my teenage daughte...
3,2022-08-10,"Definitely a YA novel Great plot, childish cha...",definitely a ya novel great plot childish char...
4,2021-07-05,A most timely book. This a very engrossing sto...,a most timely book this a very engrossing stor...


In [19]:
# 6. Lemmatize reviews
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(review_text):
    """Lemmatize every token of the text and rejoin the tokens with single spaces."""
    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # NLTK documents the default as noun ('n'), so inflected verbs are
    # presumably left unchanged - confirm whether POS-aware lemmatization is wanted.
    return " ".join(map(lemmatizer.lemmatize, nltk.word_tokenize(review_text)))


clean_data['pre_process'] = clean_data['pre_process'].apply(_lemmatize_text)

In [20]:
# Preview the text after lemmatization
clean_data.head()

Unnamed: 0,review_date,reviews,pre_process
0,2021-06-06,"An Engrossing Page Turner About Race, Class an...",an engrossing page turner about race class and...
1,2021-06-14,"An Interesting Debut “Hello, Niveus High. It’s...",an interesting debut hello niveus high it is m...
2,2022-09-12,Wow I ordered this book for my teenage daughte...,wow i ordered this book for my teenage daughte...
3,2022-08-10,"Definitely a YA novel Great plot, childish cha...",definitely a ya novel great plot childish char...
4,2021-07-05,A most timely book. This a very engrossing sto...,a most timely book this a very engrossing stor...


# Split into two dataframes
1. Before the first book ban (the year prior: July 1, 2020 to June 30, 2021)
2. After the first book ban (July 1, 2021 onward, until the dataset ends on October 12, 2022)

In [21]:
# Reviews dated strictly before split_date go to the "before" frame, the rest to "after"
split_date = '2021-07-01'
review_dates = clean_data['review_date']
before_ban_df = clean_data[review_dates < split_date]
after_ban_df = clean_data[review_dates >= split_date]

In [22]:
# Lower bound of the "before" window: keep only reviews dated on or after split_date
split_date = '2020-07-01'
on_or_after_start = before_ban_df['review_date'] >= split_date
before_ban_df = before_ban_df[on_or_after_start]

In [23]:
# Number of reviews that fall inside the "before" window
len(before_ban_df)

37492

In [None]:
# Save reviews: one pickle per period, written to the current working directory
before_ban_df.to_pickle('before_ban_data.pkl')
after_ban_df.to_pickle('after_ban_data.pkl')