<a href="https://colab.research.google.com/github/banned-books/project_banned_books/blob/main/unsupervised_topic_modeling/Clean_Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount GDrive


In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
# force_remount=True asks Colab to mount again even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Import Libraries

In [None]:
import re
import string

import gensim
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scattertext as st
import seaborn as sns
from scipy import linalg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
# Download the NLTK resources this notebook relies on:
# 'punkt' for nltk.word_tokenize and 'stopwords' for nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')

%matplotlib inline
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Import Banned Book Data

In [None]:
# Load the banned-books metadata, then fold the 'Other' challenge origin
# into the 'Administrator' category.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/data/original_data/banned_books.csv'
)
df['origin_of_challenge'] = df['origin_of_challenge'].replace({'Other': 'Administrator'})

## Import Amazon.com Review Data

#### Description of columns in the file:

- product_name - name of book + author
- title - title of book review
- body - text of the review
- rating - rating of the book review
- verified_purchase - did the reviewer buy the book or not?
- review_date - the date of the review

In [None]:
# Load the scraped Amazon reviews (columns described above) and preview the first rows.
raw_reviews = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/original_data/scraped_amz_reviews.csv')
raw_reviews.head(3)

In [None]:
# Summarize column dtypes and non-null counts of the raw reviews.
raw_reviews.info()

## Pre-Process Banned Books Metadata


In [None]:
def topnwords(documents, mode, n):
    """
    Return, for every document, the n highest-scoring vocabulary words.

    :param documents: iterable of strings, one entry per document
    :param mode: "count" to score words by raw term count, or "tf-idf" to
        score them by tf-idf weight
    :param n: number of words to keep per document
    :return: list with one inner list per document, ordered from the
        highest-scoring word down. Scores are taken over the whole vocabulary,
        so a document with fewer than n distinct words is padded with
        zero-score words.
    :raises ValueError: if mode is neither "count" nor "tf-idf"
    """
    if mode == "count":
        vectorizer = CountVectorizer(lowercase=True, stop_words="english")
    elif mode == "tf-idf":
        vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
    else:
        # Previously an unknown mode surfaced later as an UnboundLocalError.
        raise ValueError(
            f"mode must be 'count' or 'tf-idf', got {mode!r}"
        )

    word_occ = vectorizer.fit_transform(documents)

    # Dense documents-by-vocabulary score table (one column per word).
    text_features = pd.DataFrame(
        word_occ.toarray(), columns=vectorizer.get_feature_names_out()
    )

    # Get the top n words per document
    topwords = text_features.apply(
        lambda scores: pd.Series(scores.nlargest(n).index), axis=1
    )

    return topwords.values.tolist()

def filter_common_words(words):
    """
    Drop marketing boilerplate tokens from a token list.

    :param words: iterable of tokens
    :return: new list with the remaining tokens in their original order;
        matching is exact (case-sensitive)
    """
    excluded = ("desmond", "new", "york", "times", "bestselling")

    kept = []
    for token in words:
        if token in excluded:
            continue
        kept.append(token)
    return kept

def preprocess_banned_books(df):
    """
    Text preprocessing for the banned-books metadata.

    Reads the columns 'title', 'goodreads_tags', 'goodreads_description' and
    'ban_date'. The input frame is not modified; the work is done on a copy.

    Columns added to the returned frame:
      - 'text': lower-cased concatenation of title, tags and description
      - 'Tokens': alphabetic, non-stop-word tokens with boilerplate words removed
      - 'Joined_Tokens': the tokens joined back into one space-separated string
      - 'top_10_words': the 10 highest tf-idf words of each book
      - 'parse': Scattertext parse of 'Joined_Tokens'

    :param df: banned-books dataframe
    :return: new dataframe sorted by 'ban_date' with a fresh 0..n-1 index
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Missing fields become empty strings. Without this a single NaN made the
    # whole concatenation NaN, which was then tokenized as the word "nan".
    text_parts = (
        df[['title', 'goodreads_tags', 'goodreads_description']]
        .fillna('')
        .astype(str)
    )

    # put all characters in lower case
    df['text'] = (
        text_parts['title']
        + ' ' + text_parts['goodreads_tags']
        + ' ' + text_parts['goodreads_description']
    ).str.lower()

    # tokenization
    df["Tokens"] = df["text"].apply(nltk.word_tokenize)

    # remove stop words and non-alphabetic tokens (set gives O(1) membership tests)
    sw = set(nltk.corpus.stopwords.words("english"))
    df["Tokens"] = df["Tokens"].apply(
        lambda tokens: [w for w in tokens if w not in sw and w.isalpha()]
    )

    df['Tokens'] = df['Tokens'].apply(filter_common_words)
    df['Joined_Tokens'] = df['Tokens'].apply(" ".join)

    # The index is reset so it lines up with the positional index of the
    # top-words frame concatenated below.
    df = df.sort_values(by=['ban_date']).reset_index(drop=True)

    # create top 10 words per book using tf-idf score
    top_10_words = topnwords(df['Joined_Tokens'], "tf-idf", n=10)
    top_10_words = pd.DataFrame({'top_10_words': top_10_words})
    df = pd.concat([df, top_10_words], axis=1)

    # create a scattertext object for Scattertext visualization
    df['parse'] = df.Joined_Tokens.apply(st.whitespace_nlp_with_sentences)

    return df

In [None]:
# Build the processed banned-books frame (tokens, top tf-idf words, Scattertext parse).
clean_df = preprocess_banned_books(df)

In [None]:
# Preview the first processed book.
clean_df.head(1)

## Pre-Process Amazon.com Reviews Data

#### About the data
- We have a total of 380,639 reviews after removing duplicates below.
- We have 3,594 duplicated reviews.

In [None]:
def clean_reviews(reviews=None):
  """Pre-process the Amazon reviews.

  :param reviews: dataframe with the columns 'title', 'body', 'rating',
      'verified_purchase' and 'review_date'. Defaults to the notebook-level
      `raw_reviews` frame, which keeps the original no-argument call working.
  :return: new dataframe (the input is left untouched) where
      - exact duplicate rows are removed
      - 'title' and 'body' are replaced by one 'reviews' text column
      - 'pre_process' holds the lower-cased text without digits and punctuation
      - 'rating' is the leading digit of the rating string as an integer;
        rows without a usable rating are dropped
      - 'verified_purchase' is mapped from True/False to 'Yes'/'No'
      - 'review_date' and 'date' are datetimes
  """
  source = raw_reviews if reviews is None else reviews

  # Drop the duplicated reviews (drop_duplicates returns a new frame)
  clean_data = source.drop_duplicates().copy()

  # Concatenate the title and body. Missing parts count as empty text;
  # previously a NaN on either side turned the whole review into "nan".
  title = clean_data['title'].fillna('').astype(str)
  body = clean_data['body'].fillna('').astype(str)
  clean_data['reviews'] = (title + " " + body).str.strip()
  clean_data = clean_data.drop(['title', 'body'], axis=1)

  # Parse each date on its own so mixed date formats are tolerated
  clean_data['review_date'] = clean_data['review_date'].apply(pd.to_datetime)

  # Lower case, remove numbers, collapse runs of spaces, trim, then remove
  # punctuation (same order as before, so a space next to stripped
  # punctuation can remain)
  pre_process = clean_data['reviews'].str.lower()
  pre_process = pre_process.str.replace(r'\d+', '', regex=True)
  pre_process = pre_process.str.replace(' +', ' ', regex=True).str.strip()
  pre_process = pre_process.str.translate(str.maketrans('', '', string.punctuation))
  clean_data['pre_process'] = pre_process

  # Grab the leading character of the rating string; anything that is not a
  # digit (including a missing rating) becomes NaN
  clean_data['rating'] = pd.to_numeric(
      clean_data['rating'].astype(str).str[:1], errors='coerce'
  )

  # Drop reviews without ratings, then cast rating as integer
  clean_data = clean_data[clean_data['rating'].notna()].copy()
  clean_data['rating'] = clean_data['rating'].astype(int)

  # Convert verified purchase to yes or no
  clean_data['verified_purchase'] = clean_data['verified_purchase'].map({True: 'Yes', False: 'No'})

  # Convert date to datetime
  clean_data['date'] = pd.to_datetime(clean_data['review_date'])

  return clean_data

# Ordered (pattern, replacement) rules used by contractions().
# - ['’] accepts both the ASCII and the typographic apostrophe.
# - The irregular forms (won't, can't) must run before the generic n't rule.
# - (?<=\w) and \b keep the rules on real word endings, so a quoted word such
#   as 'data' or an elision such as ’tis is left alone.
# - The generic 't rule also covers misspellings like "would’t" / "could’t".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"\bwon['’]t\b", "will not"),
        (r"(?<=\w)['’]d\b", " would"),
        (r"\bcan['’]t\b", "can not"),
        (r"n['’]t\b", " not"),
        (r"(?<=\w)['’]re\b", " are"),
        (r"(?<=\w)['’]s\b", " is"),
        (r"(?<=\w)['’]ll\b", " will"),
        (r"(?<=\w)['’]t\b", " not"),
        (r"(?<=\w)['’]ve\b", " have"),
        (r"(?<=\w)['’]m\b", " am"),
    )
)

def contractions(s):
  """Replace contractions.

  Expands English contractions written with either ' or ’ (for example
  "can't" -> "can not", "they’re" -> "they are"). Matching ignores case;
  the replacement text is always lower case. Note that every 's is expanded
  to " is", possessives included.

  :param s: text to expand
  :return: text with the contractions expanded
  """
  for pattern, replacement in _CONTRACTION_RULES:
    s = pattern.sub(replacement, s)

  return s

def final_clean_reviews(text):
    '''
    Make text lowercase, remove text in square brackets, remove links, remove
    HTML-style tags, remove punctuation, remove newlines and remove words
    containing numbers.

    The input is converted with str() first, so non-string values are accepted.
    Removed pieces are deleted without inserting a space, so words separated
    only by a newline are joined together.

    :param text: value to clean
    :return: cleaned string
    '''
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [None]:
# Clean data
clean_data = clean_reviews()

# Replace contractions.
# NOTE(review): this starts again from the raw 'reviews' text (apostrophes still
# present) and overwrites the 'pre_process' column produced by clean_reviews().
clean_data['pre_process'] = clean_data['reviews'].apply(contractions)

# Remove any remaining non-alpha characters from each token. Tokens that end up
# empty (pure punctuation or digits) are dropped so they do not leave double
# spaces in the joined text.
clean_data['pre_process'] = clean_data['pre_process'].apply(
    lambda text: " ".join(
        letters_only
        for letters_only in (
            re.sub('[^A-Za-z]+', '', token) for token in nltk.word_tokenize(text)
        )
        if letters_only
    )
)

# Remove any remaining brackets, links, punctuation, words containing numbers
clean_data['pre_process'] = clean_data['pre_process'].apply(final_clean_reviews)

In [None]:
# Preview the cleaned reviews.
clean_data.head()

In [None]:
# Count the null values in each column of the cleaned reviews
clean_data.isnull().sum()

### Split Amazon Reviews into two temporal dataframes
- Before the first book ban (one year prior - July 1, 2020 to June 30, 2021)
- After the first book ban (July 1, 2021 through the end of the dataset on October 12, 2022)

In [None]:
# Date of the first book ban: reviews strictly before it go to the "before"
# frame, reviews on or after it go to the "after" frame.
split_date = '2021-07-01'
before_ban_amz_df = clean_data[clean_data['review_date'] < split_date]
after_ban_amz_df = clean_data[clean_data['review_date'] >= split_date]

In [None]:
# Lower bound of the "before" window: keep only reviews dated on or after it.
split_date = '2020-07-01'
before_ban_amz_df = before_ban_amz_df.loc[
    lambda frame: frame['review_date'] >= split_date
]

In [None]:
# Preview the reviews in the pre-ban window.
before_ban_amz_df.head()

In [None]:
# Preview the reviews dated on or after the first ban.
after_ban_amz_df.head()

Unnamed: 0,product_name,rating,verified_purchase,review_date,reviews,pre_process,date
2,Ace Spades Faridah Abike Iyimide,5,Yes,2022-09-12,Wow I ordered this book for my teenage daughte...,wow i ordered this book for my teenage daughte...,2022-09-12
3,Ace Spades Faridah Abike Iyimide,4,Yes,2022-08-10,"Definitely a YA novel Great plot, childish cha...",definitely a ya novel great plot childish cha...,2022-08-10
4,Ace Spades Faridah Abike Iyimide,5,Yes,2021-07-05,A most timely book. This a very engrossing sto...,a most timely book this a very engrossing sto...,2021-07-05
5,Ace Spades Faridah Abike Iyimide,4,Yes,2021-07-03,Taut thriller Two students at a ritzy preparat...,taut thriller two students at a ritzy preparat...,2021-07-03
6,Ace Spades Faridah Abike Iyimide,4,Yes,2021-07-20,A dark thriller that delves into racism Devon ...,a dark thriller that delves into racism devon ...,2021-07-20


## Pickle/Save Cleaned Dataframes

In [None]:
# Save cleaned banned book data as a pickle on the mounted Drive
clean_df.to_pickle('/content/drive/MyDrive/Colab Notebooks/data/cleaned_trained_data/cleaned_topic_modeling.pkl')

In [None]:
# Save reviews: the pre-ban window, the post-ban window and the full cleaned set
for frame, pickle_path in (
    (before_ban_amz_df, '/content/drive/MyDrive/Colab Notebooks/data/cleaned_trained_data/before_ban_amazon_review_data.pkl'),
    (after_ban_amz_df, '/content/drive/MyDrive/Colab Notebooks/data/cleaned_trained_data/after_ban_amazon_review_data.pkl'),
    (clean_data, '/content/drive/MyDrive/Colab Notebooks/data/cleaned_trained_data/all_amazon_review_data.pkl'),
):
    frame.to_pickle(pickle_path)