In [13]:
import pandas as pd
import regex as re
import nltk
from string import digits, punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from gensim.parsing.preprocessing import STOPWORDS
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

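Text preprocessing consists of the following steps:  
Lowercasing all letters  
Removing punctuation  
Removing numbers  
(Removing spoiler tags)  
(Removing URLs)  
Tokenization  
Removing stop words  
Stemming (currently disabled in the code below)  
Lemmatization (note: `preprocess` lemmatizes before removing stop words)  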

In [35]:
# Custom stop-word list, read from stopwords.txt (one word per line).
STOPWORDS = set()

with open("stopwords.txt", "r") as f:
    for line in f:
        # strip() instead of line[:-1]: slicing would cut the last letter off
        # the final word when the file does not end with a newline.
        word = line.strip()
        # Skip blank lines so the empty string never enters the set.
        if word:
            STOPWORDS.add(word)

STOPWORDS

{'',
 'a',
 'am',
 'an',
 'and',
 'are',
 'as',
 'at',
 'b',
 'be',
 'because',
 'been',
 'being',
 'between',
 'but',
 'by',
 'c',
 'd',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'e',
 'f',
 'for',
 'from',
 'g',
 'h',
 'had',
 'has',
 'have',
 'having',
 'he',
 'her',
 'hers',
 'herself',
 'hes',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'i',
 'if',
 'im',
 'in',
 'is',
 'it',
 'its',
 'itself',
 'j',
 'k',
 'l',
 'll',
 'm',
 'ma',
 'me',
 'mine',
 'my',
 'myself',
 'n',
 'o',
 'of',
 'on',
 'or',
 'our',
 'ours',
 'ourselves',
 'p',
 'q',
 'r',
 're',
 's',
 'she',
 'shes',
 'so',
 't',
 'that',
 'thatll',
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'these',
 'they',
 'theyre',
 'this',
 'thisll',
 'those',
 'to',
 'u',
 'v',
 've',
 'w',
 'was',
 'we',
 'were',
 'what',
 'where',
 'which',
 'who',
 'whom',
 'whose',
 'why',
 'x',
 'y',
 'you',
 'youd',
 'youll',
 'your',
 'youre',
 'yours',
 'yourself',
 'yourselves',
 'youve'}

In [15]:
def preprocess(text):
    """Turn a raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace the literal "** spoiler alert **" tag with a space;
      3. delete URLs (http://, https:// or www. prefixes);
      4. replace ASCII digits and punctuation with spaces;
      5. replace any remaining non-word, non-whitespace character with a space;
      6. tokenize with word_tokenize;
      7. lemmatize every token as a verb (pos="v");
      8. drop tokens that are in the module-level STOPWORDS set.

    Porter stemming is not applied.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order.
    """
    cleaned = text.lower().replace("** spoiler alert **", " ")
    cleaned = re.sub(r"https?://\S+|www\.\S+", "", cleaned)

    # A single translation table blanks out every ASCII digit and punctuation mark.
    to_blank = digits + punctuation
    cleaned = cleaned.translate(str.maketrans(to_blank, " " * len(to_blank)))
    # Second pass for symbols the table above does not list.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)

    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(cleaned))

    # Stop words are filtered after lemmatization, so the lemma is what gets checked.
    return [lemma for lemma in lemmas if lemma not in STOPWORDS]

In [36]:
# Sanity check on an already-cleaned review: tokenize, lemmatize as verbs,
# then drop stop words (the same final three stages preprocess() applies).
text = '  this review can also be found at   i picked up uglies after wondering why it was so popular  my answer mostly lies in the book s eerie similarities to the twilight zone s infamous episode   number    looks just like you    it takes place in a dystopian society where at age     everyone undergoes a complex cosmetic procedure where you are turned into a   pretty   '

lemmatizer = WordNetLemmatizer()
tokens = [
    lemma
    for lemma in (lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(text))
    if lemma not in STOPWORDS
]

tokens

['review',
 'can',
 'also',
 'find',
 'pick',
 'up',
 'uglies',
 'after',
 'wonder',
 'popular',
 'answer',
 'mostly',
 'lie',
 'book',
 'eerie',
 'similarities',
 'twilight',
 'zone',
 'infamous',
 'episode',
 'number',
 'look',
 'just',
 'like',
 'take',
 'place',
 'dystopian',
 'society',
 'age',
 'everyone',
 'undergo',
 'complex',
 'cosmetic',
 'procedure',
 'turn',
 'into',
 'pretty']

In [4]:
def preprocess2(text):
    """Alternative pipeline that tokenizes first and cleans the tokens afterwards.

    Steps, in order:
      1. replace the literal "** spoiler alert **" tag with a space (this runs
         before lowercasing, so the match is case-sensitive);
      2. delete URLs (http://, https:// or www. prefixes);
      3. tokenize with word_tokenize;
      4. keep only tokens for which str.isalpha() is true, which discards
         numbers and punctuation;
      5. lowercase the remaining tokens;
      6. drop tokens that are in the module-level STOPWORDS set;
      7. lemmatize every surviving token as a verb (pos="v").

    Porter stemming is not applied.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    without_tag = text.replace("** spoiler alert **", " ")
    stripped = re.sub(r"https?://\S+|www\.\S+", "", without_tag)

    # Alphabetic tokens only, lowercased lazily as they are consumed below.
    words = (token.lower() for token in word_tokenize(stripped) if token.isalpha())

    lemmatizer = WordNetLemmatizer()
    # Here the stop-word check happens before lemmatization.
    return [
        lemmatizer.lemmatize(word, pos="v")
        for word in words
        if word not in STOPWORDS
    ]


In [24]:
# Load the filtered review set.
df = pd.read_csv("all_data/reviews_filtered_2.csv")
# Missing reviews are read as NaN. Blank them before casting: astype(str) alone
# would turn them into the literal word "nan", which then survives tokenization.
df['review_text'] = df['review_text'].fillna("").astype(str)
df.head()

Unnamed: 0,review_id,rating,review_text
0,705b4be0c87fc1a69fbbd6a12d4973a7,1,Ugh I hated the last book in this series- what...
1,9c8dfa25aa1c02eaa1784558401ada6c,1,"This started out strong, but it went downhill ..."
2,674c63c16f14e97d0d5b92237d061e04,1,"A decently written YA book, but I can't even c..."
3,e59c12c5107de7baeffa922d37f9b862,1,This book simultaneously bored me to death and...
4,6805d23d0e5030a6bb9b6666623bedea,1,"Ugh...I tried, I honestly tried. I'm a huge fa..."


In [25]:
# Tokenize every review; each cell of 'text' holds the list returned by preprocess().
df['text'] = df['review_text'].apply(preprocess)

In [26]:
# Spot-check random rows (no seed is set, so the rows shown differ per run).
df.sample(n=15)

Unnamed: 0,review_id,rating,review_text,text
1774,3cb27aa527c5fef7cf62672cecedddb1,1,** spoiler alert ** \n This book was so beyond...,"[book, beyond, predictable, fill, with, many, ..."
9963,5bddf71571e1f154a4d997897c540eaa,2,I was disappointed in this last book of the tr...,"[disappoint, last, book, trilogy, keep, want, ..."
980,b0015309577dbf4c4a75b0a4452ce110,1,Not much to say...a fairly interesting idea bu...,"[not, much, say, fairly, interest, idea, nonde..."
3549,14bc61c18f90ddbfb22b2343a68aaa83,1,I'd like to thank Ms. Liz Gruder for providing...,"[like, thank, ms, liz, gruder, provide, with, ..."
9405,3755a2cc215e155fdf77a0a98782e723,2,*CONTAINS SPOILERS* \n I would probably rate t...,"[contain, spoilers, would, probably, rate, boo..."
15345,308c437d1258c56f2fb9983c09c2ed5b,5,OMG!,[omg]
10289,37f990a6fedf10f3371c78d312c597bc,4,"I feel bad giving this book 4 stars, because t...","[feel, bad, give, book, star, author, plagiari..."
1543,b82d916fc51e5b9fde8c197059e9b26c,1,This is a 1.5 review. \n Sigh. \n That's the s...,"[review, sigh, sound, disappointment, when, bo..."
17333,6f379c264308be1274c4c5b875a9aea7,5,It was a bit of a slow start... but the end ma...,"[bite, slow, start, end, make, tear, little, o..."
298,099e26649c44a647e1c96e076274dea2,1,"Dude, this guy is the scuzziest stalker boy ye...","[dude, guy, scuzziest, stalker, boy, yet, mani..."


In [27]:
# Binary sentiment label keyed by the rating rendered as a string:
# 1-2 stars -> 0, 4-5 stars -> 1. There is no entry for "3", so any other
# rating value raises KeyError during the lookup below.
sentiment_map = {
    "1": 0,
    "2": 0,
    "4": 1,
    "5": 1,
}

df["sentiment"] = df["rating"].astype(str).apply(sentiment_map.__getitem__)

In [29]:
# Give the raw-text column its shorter name.
df = df.rename({"review_text": "review"}, axis="columns")
df.head()

Unnamed: 0,review_id,rating,review,text,sentiment
0,705b4be0c87fc1a69fbbd6a12d4973a7,1,Ugh I hated the last book in this series- what...,"[ugh, hat, last, book, series, poor, way, end,...",0
1,9c8dfa25aa1c02eaa1784558401ada6c,1,"This started out strong, but it went downhill ...","[start, out, strong, go, downhill, fairly, qui...",0
2,674c63c16f14e97d0d5b92237d061e04,1,"A decently written YA book, but I can't even c...","[decently, write, ya, book, can, even, conside...",0
3,e59c12c5107de7baeffa922d37f9b862,1,This book simultaneously bored me to death and...,"[book, simultaneously, bore, death, annoy, hel...",0
4,6805d23d0e5030a6bb9b6666623bedea,1,"Ugh...I tried, I honestly tried. I'm a huge fa...","[ugh, try, honestly, try, huge, fan, scott, we...",0


In [31]:
# Keep only the four columns listed, in this order.
df = df.loc[:, ["review", "text", "rating", "sentiment"]]
df.head()

Unnamed: 0,review,text,rating,sentiment
0,Ugh I hated the last book in this series- what...,"[ugh, hat, last, book, series, poor, way, end,...",1,0
1,"This started out strong, but it went downhill ...","[start, out, strong, go, downhill, fairly, qui...",1,0
2,"A decently written YA book, but I can't even c...","[decently, write, ya, book, can, even, conside...",1,0
3,This book simultaneously bored me to death and...,"[book, simultaneously, bore, death, annoy, hel...",1,0
4,"Ugh...I tried, I honestly tried. I'm a huge fa...","[ugh, try, honestly, try, huge, fan, scott, we...",1,0


In [32]:
# Class balance of the binary sentiment label.
df.loc[:, "sentiment"].value_counts()

0    10000
1    10000
Name: sentiment, dtype: int64

In [34]:
# Persist the cleaned frame without the index column.
# NOTE(review): 'text' holds Python lists, which will be written as text;
# confirm that whatever reads this file parses them back.
df.to_csv(path_or_buf="cleaned_data/cleaned_reviews_2.csv", index=False)

In [6]:
# Second dataset: load it and run it through the same tokenization.

df = pd.read_csv("all_data/ppt_filtered.csv")
# Blank out missing reviews before casting; astype(str) alone would turn NaN
# into the literal word "nan", which preprocess() would keep as a token.
df["review"] = df["review"].fillna("").astype(str)
df["token"] = df["review"].apply(preprocess)
df.sample(15)

Unnamed: 0,review,rating,sentiment,token
5222,I found this book quite juvenile and boring. T...,2,0,"[find, book, quite, juvenile, bore, story, onl..."
5530,Dead beautiful had a really promising beginnin...,2,0,"[dead, beautiful, really, promise, begin, star..."
19768,Amazing. I found myself falling in love with t...,5,1,"[amaze, find, fall, in, love, with, book, agai..."
6600,"I didn't have high hopes for this book, but en...",2,0,"[didn, high, hop, book, but, end, up, like, so..."
19506,I cannot imagine trying to put my thoughts and...,5,1,"[can, not, imagine, try, put, thoughts, feel, ..."
15810,A totally awesome thrilling book that kept me ...,5,1,"[totally, awesome, thrill, book, keep, want, k..."
7598,Another Cleary book I just couldn't like. It w...,2,0,"[another, cleary, book, just, couldn, like, wr..."
9253,2.5 stars,2,0,[star]
16703,Um that was AMAZING! \n But I wish David didn'...,5,1,"[um, amaze, but, wish, david, didn, decide, fo..."
5641,This was just as bad as the first. \n I can un...,2,0,"[just, bad, first, can, understand, adam, woul..."


In [8]:
# Keep only the four columns listed, in this order, then spot-check random rows.
df = df.loc[:, ["review", "token", "rating", "sentiment"]]
df.sample(n=15)

Unnamed: 0,review,token,rating,sentiment
2601,This book is extremely harmful for people with...,"[book, extremely, harmful, people, with, eat, ...",1,0
1055,I didn't expect I won't like this book. I was ...,"[didn, expect, win, like, book, catch, up, blu...",1,0
15870,A beautiful story! I will post my review on re...,"[beautiful, story, will, post, review, on, rel...",5,1
4205,If I could give this book a negative 10 I woul...,"[could, give, book, negative, would, come, int...",1,0
3323,"This has to stop, seriously. When Neiderman fi...","[stop, seriously, when, neiderman, first, ghos...",1,0
2776,this was a 420 page book that could have been ...,"[page, book, could, write, in, about, immortal...",1,0
4123,Fucking annoying heroine. So stuck up and douc...,"[fuck, annoy, heroine, so, stick, up, douchy, ...",1,0
11044,I loved this book. There is so much adventure ...,"[love, book, there, so, much, adventure, conce...",4,1
9829,This book is also reviewed on my blog Books: A...,"[book, also, review, on, blog, book, true, sto...",2,0
11035,Review to come. \n Content: Clean,"[review, come, content, clean]",4,1


In [10]:
# Persist the processed frame without the index column.
# NOTE(review): 'token' holds Python lists, which will be written as text;
# confirm that whatever reads this file parses them back.
df.to_csv(path_or_buf="all_data/ppt_processed.csv", index=False)