In [4]:
import pandas as pd
import regex as re
import nltk
from string import digits, punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from gensim.parsing.preprocessing import STOPWORDS
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

Text preprocessing consists of the following steps:  
Lowering all letters  
Remove punctuation  
Remove numbers  
(Remove spoiler tags)  
(Remove urls)  
Tokenization  
Remove stopwords  
Stemming (currently commented out in the code)  
Lemmatization  

In [2]:
# Load the custom stopword list from stopwords.txt (one word per line).
with open("stopwords.txt", "r") as f:
    # strip() rather than line[:-1]: slicing removes the last character of the
    # final word when the file does not end with a newline, and leaves a stray
    # "\r" on files with Windows line endings. Blank lines are skipped so the
    # empty string never ends up in the set.
    STOPWORDS = {line.strip() for line in f if line.strip()}

STOPWORDS

{'a',
 'am',
 'an',
 'and',
 'are',
 'as',
 'at',
 'b',
 'be',
 'because',
 'been',
 'being',
 'by',
 'c',
 'd',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'e',
 'f',
 'for',
 'from',
 'g',
 'h',
 'had',
 'has',
 'have',
 'having',
 'he',
 'her',
 'hers',
 'herself',
 'hes',
 'him',
 'himself',
 'his',
 'i',
 'if',
 'im',
 'is',
 'it',
 'its',
 'itself',
 'j',
 'k',
 'l',
 'll',
 'm',
 'ma',
 'me',
 'mine',
 'my',
 'myself',
 'n',
 'o',
 'of',
 'or',
 'our',
 'ours',
 'ourselves',
 'p',
 'q',
 'r',
 're',
 's',
 'she',
 'shes',
 't',
 'that',
 'thatll',
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'these',
 'they',
 'theyre',
 'this',
 'thisll',
 'those',
 'to',
 'u',
 'v',
 've',
 'w',
 'was',
 'we',
 'were',
 'what',
 'where',
 'which',
 'who',
 'whom',
 'why',
 'x',
 'y',
 'you',
 'youd',
 'youll',
 'your',
 'youre',
 'yours',
 'yourself',
 'yourselves',
 'youve',
 'z'}

In [5]:
def preprocess(text):
    """Turn a raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, remove the spoiler tag, remove URLs, replace
    digits and punctuation with spaces, tokenize, lemmatize every token as a
    verb, and finally discard tokens present in the module-level STOPWORDS.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    cleaned = text.lower()

    # The tag is matched after lowercasing, so its casing in the input is irrelevant.
    cleaned = cleaned.replace("** spoiler alert **", " ")

    # Links are deleted outright rather than replaced with a space.
    cleaned = re.sub(r"https?://\S+|www\.\S+", r"", cleaned)

    # One translation table maps every ASCII digit and punctuation mark to a space.
    unwanted = digits + punctuation
    cleaned = cleaned.translate(str.maketrans(unwanted, " " * len(unwanted)))
    # Catch whatever non-word, non-space characters the ASCII table did not cover.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)

    # Stemming is intentionally not applied; only lemmatization is used.
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(cleaned))

    # Stopwords are filtered after lemmatization, i.e. on the lemma itself.
    return [lemma for lemma in lemmas if lemma not in STOPWORDS]

# Quick sanity check: a spoiler tag followed by two words joined by a double hyphen.
ex1 = "** spoiler alert ** collective--with"
preprocess(ex1)

['collective', 'with']

In [4]:
def preprocess2(text):
    """Alternative tokenizer-first preprocessing of a raw review string.

    Unlike ``preprocess``, the text is tokenized before any character
    filtering: tokens containing anything other than letters are dropped
    whole (instead of having digits/punctuation blanked out), and stopwords
    are removed before lemmatization rather than after.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        Lowercased, verb-lemmatized tokens with stopwords removed.
    """
    # remove spoiler tag
    # Matched case-insensitively so the tag is removed in any casing, as in
    # ``preprocess`` (which lowercases first). The text itself is not
    # lowercased here, so the tokenizer still sees the original casing.
    spoil = "** spoiler alert **"
    text = re.sub(re.escape(spoil), " ", text, flags=re.IGNORECASE)

    # remove urls
    url = re.compile(r"https?://\S+|www\.\S+")
    text = url.sub(r"", text)

    # tokenization
    tokens = word_tokenize(text)

    # remove digits and punctuations: keep purely alphabetic tokens only
    tokens = [word for word in tokens if word.isalpha()]

    # lower all letters
    tokens = [word.lower() for word in tokens]

    # remove stopwords (custom STOPWORDS set, not NLTK's English list)
    tokens = [word for word in tokens if word not in STOPWORDS]

    # lemmatization (stemming is intentionally not applied)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens]

    return tokens


In [23]:
df = pd.read_csv("reviews_filtered_2.csv")
# Blank out missing reviews before the cast: astype(str) alone converts NaN
# into the literal string "nan", which would then survive preprocessing as a
# real token.
df['review_text'] = df['review_text'].fillna("").astype(str)
df.head()

Unnamed: 0,review_id,rating,review_text
0,705b4be0c87fc1a69fbbd6a12d4973a7,1,Ugh I hated the last book in this series- what...
1,9c8dfa25aa1c02eaa1784558401ada6c,1,"This started out strong, but it went downhill ..."
2,674c63c16f14e97d0d5b92237d061e04,1,"A decently written YA book, but I can't even c..."
3,e59c12c5107de7baeffa922d37f9b862,1,This book simultaneously bored me to death and...
4,6805d23d0e5030a6bb9b6666623bedea,1,"Ugh...I tried, I honestly tried. I'm a huge fa..."


In [24]:
# Tokenize every review; 'text' holds the resulting list of tokens per row.
df['text'] = df['review_text'].apply(preprocess)

In [25]:
# Spot-check random rows; no random_state is passed, so the selection changes on every run.
df.sample(15)

Unnamed: 0,review_id,rating,review_text,text
9080,cb922b75da1d98ffcd9bbe9ffe96a80e,2,White Crow is a modern gothic novel. It follow...,"[white, crow, modern, gothic, novel, follow, t..."
13254,b2617161a2f82e75f30ae3ff34a81794,4,"wow, spooky and creepy. It's the perfect late ...","[wow, spooky, creepy, perfect, late, night, re..."
12138,b63ce921bd2aa95134a7add2ebe84ef8,4,reread this book as it was one of my childhood...,"[reread, book, one, childhood, favorites, now,..."
8664,b419d68332a885f1d1c3ba35430ef0e5,2,It's been a long time since I read this. I'd a...,"[long, time, since, read, actually, forget, ab..."
1596,e7f9e65c823cb634f38e15c2e3d3e549,1,DNF at 100 pages. \n I wanted to open this rev...,"[dnf, page, want, open, review, say, book, mak..."
11142,7ad1921ecbf3f7c85458be9c3c52c4ee,4,Nov 2010 I thought this book was a little tame...,"[nov, think, book, little, tamer, maybe, becom..."
7379,eecff61a2b5bac70d8dd969d6c148b7f,2,Unfortunately this book was boring. Very littl...,"[unfortunately, book, bore, very, little, happ..."
18887,2c1d81cd2c31fff1eb561868ec6a814a,5,"This book is soooo good. Great writer, great c...","[book, soooo, good, great, writer, great, char..."
15495,e0f10b2dabca78ebf5435eea10dccab1,5,I want to sleep...my eyes hurt....burn actuall...,"[want, sleep, eye, hurt, burn, actually, eye, ..."
19206,8d2ccfd7b30b63e08c96848d714ca526,5,Tear jerking. Slightly cliche. But definitely ...,"[tear, jerk, slightly, cliche, but, definitely..."


In [26]:
# Binary sentiment label: ratings 1-2 -> 0, ratings 4-5 -> 1.
sentiment_map = {"1": 0, "2": 0, "4": 1, "5": 1}

# Look ratings up through __getitem__ so that a rating without an entry
# raises KeyError instead of silently becoming NaN.
rating_as_text = df["rating"].astype(str)
df["sentiment"] = rating_as_text.apply(sentiment_map.__getitem__)

In [27]:
# Only the identifiers, tokens and label are kept from here on.
df = df.drop(columns=["rating", "review_text"])

In [28]:
# Class balance check: number of rows per sentiment label.
df["sentiment"].value_counts()

0    10000
1    10000
Name: sentiment, dtype: int64

In [29]:
# Write the labelled frame to disk; index=False leaves the DataFrame index out of the file.
# NOTE(review): the 'text' column holds Python lists, so it is presumably written
# as each list's string form — confirm how downstream code parses it back.
df.to_csv("cleaned_reviews_2.csv", index=False)

In [6]:
# Second dataset: run the same preprocessing on all_data/ppt_filtered.csv.

df = pd.read_csv("all_data/ppt_filtered.csv")
# Blank out missing reviews before the cast: astype(str) alone converts NaN
# into the literal string "nan", which would then show up as a token.
df["review"] = df["review"].fillna("").astype(str)
df["token"] = df["review"].apply(preprocess)
df.sample(15)

Unnamed: 0,review,rating,sentiment,token
5222,I found this book quite juvenile and boring. T...,2,0,"[find, book, quite, juvenile, bore, story, onl..."
5530,Dead beautiful had a really promising beginnin...,2,0,"[dead, beautiful, really, promise, begin, star..."
19768,Amazing. I found myself falling in love with t...,5,1,"[amaze, find, fall, in, love, with, book, agai..."
6600,"I didn't have high hopes for this book, but en...",2,0,"[didn, high, hop, book, but, end, up, like, so..."
19506,I cannot imagine trying to put my thoughts and...,5,1,"[can, not, imagine, try, put, thoughts, feel, ..."
15810,A totally awesome thrilling book that kept me ...,5,1,"[totally, awesome, thrill, book, keep, want, k..."
7598,Another Cleary book I just couldn't like. It w...,2,0,"[another, cleary, book, just, couldn, like, wr..."
9253,2.5 stars,2,0,[star]
16703,Um that was AMAZING! \n But I wish David didn'...,5,1,"[um, amaze, but, wish, david, didn, decide, fo..."
5641,This was just as bad as the first. \n I can un...,2,0,"[just, bad, first, can, understand, adam, woul..."


In [8]:
# Place each raw review next to its tokens; columns not listed here are dropped.
column_order = ["review", "token", "rating", "sentiment"]
df = df.loc[:, column_order]
df.sample(15)

Unnamed: 0,review,token,rating,sentiment
2601,This book is extremely harmful for people with...,"[book, extremely, harmful, people, with, eat, ...",1,0
1055,I didn't expect I won't like this book. I was ...,"[didn, expect, win, like, book, catch, up, blu...",1,0
15870,A beautiful story! I will post my review on re...,"[beautiful, story, will, post, review, on, rel...",5,1
4205,If I could give this book a negative 10 I woul...,"[could, give, book, negative, would, come, int...",1,0
3323,"This has to stop, seriously. When Neiderman fi...","[stop, seriously, when, neiderman, first, ghos...",1,0
2776,this was a 420 page book that could have been ...,"[page, book, could, write, in, about, immortal...",1,0
4123,Fucking annoying heroine. So stuck up and douc...,"[fuck, annoy, heroine, so, stick, up, douchy, ...",1,0
11044,I loved this book. There is so much adventure ...,"[love, book, there, so, much, adventure, conce...",4,1
9829,This book is also reviewed on my blog Books: A...,"[book, also, review, on, blog, book, true, sto...",2,0
11035,Review to come. \n Content: Clean,"[review, come, content, clean]",4,1


In [10]:
# Write the processed frame to disk; index=False leaves the DataFrame index out of the file.
# NOTE(review): the 'token' column holds Python lists, so it is presumably written
# as each list's string form — confirm how downstream code parses it back.
df.to_csv("all_data/ppt_processed.csv", index=False)