In [1]:
import pandas as pd

In [4]:
# Load the raw reviews. index_col=False tells pandas not to use the first
# column as the index, so it is kept as an ordinary column.
data = pd.read_csv(
    filepath_or_buffer="review-data.csv",
    index_col=False,
)
data

Unnamed: 0,reviewText,overall,rating
0,I enjoy vintage books and movies so I enjoyed ...,5,1
1,This book is a reissue of an old one; the auth...,4,1
2,This was a fairly interesting read. It had ol...,4,1
3,I'd never read any of the Amy Brewster mysteri...,5,1
4,"If you like period pieces - clothing, lingo, y...",4,1
...,...,...,...
982592,Yasss hunny! This is a great read. That Dre is...,5,1
982593,I ENJOYED THIS BOOK FROM BEGINNING TO END NOW ...,5,1
982594,Great book! Cherika was a fool. She let that m...,5,1
982595,When I say this was an excellent book please b...,5,1


In [6]:
# Preprocessing the data
# Lower-case every review so later token comparisons are case-insensitive.
data['reviewText'] = data['reviewText'].str.lower()
data.head(n=2)

Unnamed: 0,reviewText,overall,rating
0,i enjoy vintage books and movies so i enjoyed ...,5,1
1,this book is a reissue of an old one; the auth...,4,1


In [None]:
# removing special characters
import re

# Replace every character that is not an ASCII letter or digit with a space.
# The upper-case range must be written `A-Z`: the range `A-z` also spans the
# six ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so with
# `A-z` those characters were silently left in the text.
data['reviewText'] = data['reviewText'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]', " ", x))
data['reviewText']


0         i enjoy vintage books and movies so i enjoyed ...
1         this book is a reissue of an old one  the auth...
2         this was a fairly interesting read   it had ol...
3         i d never read any of the amy brewster mysteri...
4         if you like period pieces   clothing  lingo  y...
                                ...                        
982592    yasss hunny  this is a great read  that dre is...
982593    i enjoyed this book from beginning to end now ...
982594    great book  cherika was a fool  she let that m...
982595    when i say this was an excellent book please b...
982596    this book was everything  i just hope alexus w...
Name: reviewText, Length: 982597, dtype: object

In [9]:
import nltk
from nltk.corpus import stopwords

In [18]:
# Removing stopwords
# A set gives constant-time membership checks for the per-token filter below.
stop_words = set(stopwords.words('english'))

# Split each review on whitespace, drop the stop words, and glue the
# remaining tokens back together with single spaces.
data['reviewText'] = data['reviewText'].str.split().map(
    lambda tokens: " ".join(token for token in tokens if token not in stop_words)
)
data['reviewText']

0         enjoy vintage books movies enjoyed reading boo...
1         book reissue old one author born 1910 era say ...
2         fairly interesting read old style terminology ...
3         never read amy brewster mysteries one really h...
4         like period pieces clothing lingo enjoy myster...
                                ...                        
982592    yasss hunny great read dre mess cherika refuse...
982593    enjoyed book beginning end far lex hoe sneaky ...
982594    great book cherika fool let man get away much ...
982595    say excellent book please believe definitely p...
982596    book everything hope alexus wise move lawd tho...
Name: reviewText, Length: 982597, dtype: object

In [21]:
# Show the rows whose review text still contains "www" (case-insensitive;
# missing values count as no match).
data.loc[data['reviewText'].str.contains('www', case=False, na=False)]

Unnamed: 0,reviewText,overall,rating
556,even though giving blog three stars going keep...,3,1
618,reading reviews purchasing first series shocke...,3,1
3146,http www amazon com gp product b001u0q17k ref ...,5,1
5699,jake mcbride self made millionaire womanizer e...,5,1
6046,http www amazon com gp product b002hrf7wk ref ...,5,1
...,...,...,...
981652,jjjjjjoooooooooeeeeeeeeeee yep backstory going...,5,1
981801,despite would class girly book 5 read enjoyed ...,5,1
982323,come back second book dirty dark deadly series...,5,1
982460,find www morningbooksandcoffee comc p smith to...,5,1


In [24]:
# Remove urls
# NOTE(review): this runs after the special-character cell has already turned
# '.', '/' and ':' into spaces, so the `www\.` and `\S+\.\S+` alternatives have
# no dots left to match and only tokens starting with "http" are removed.
# Consider stripping URLs before punctuation is removed.
data['reviewText'] = data['reviewText'].map(
    lambda text: re.sub(r'http\S+|www\.\S+|\S+\.\S+', '', text)
)

In [29]:
# Removing html tags
from bs4 import BeautifulSoup

# Parse each review as HTML and keep only its text content.
# NOTE(review): '<' and '>' were already replaced by spaces in the
# special-character cell, so there is probably no markup left for the parser
# to strip at this point -- consider running this step on the raw text.
data['reviewText'] = data['reviewText'].map(
    lambda markup: BeautifulSoup(markup, "html.parser").get_text()
)

In [31]:
# Collapse runs of whitespace into single spaces and trim both ends.
data['reviewText'] = data['reviewText'].map(lambda text: " ".join(text.split()))

In [32]:
# Display the review text column after whitespace normalisation.
data['reviewText']

0         enjoy vintage books movies enjoyed reading boo...
1         book reissue old one author born 1910 era say ...
2         fairly interesting read old style terminology ...
3         never read amy brewster mysteries one really h...
4         like period pieces clothing lingo enjoy myster...
                                ...                        
982592    yasss hunny great read dre mess cherika refuse...
982593    enjoyed book beginning end far lex hoe sneaky ...
982594    great book cherika fool let man get away much ...
982595    say excellent book please believe definitely p...
982596    book everything hope alexus wise move lawd tho...
Name: reviewText, Length: 982597, dtype: object

In [33]:
# lemmatization
# Create one shared WordNetLemmatizer instance that the following cells reuse.
# NOTE(review): presumably the NLTK WordNet corpus is already downloaded in
# this environment -- no download call is visible here; confirm.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [34]:
# Lemmatize every whitespace-separated token of each review (using the
# lemmatizer's default part of speech) and re-join with single spaces.
data['reviewText'] = data['reviewText'].map(
    lambda text: " ".join(map(lemmatizer.lemmatize, text.split()))
)

In [36]:
# Pass each complete review string to the lemmatizer as if it were one word.
# NOTE(review): the per-token lemmatization happens in the previous cell; this
# call hands the whole multi-word string to lemmatize(), which looks redundant
# -- confirm whether this cell is still needed.
data['reviewText'] = data['reviewText'].map(lemmatizer.lemmatize)

In [37]:
# Display the review text column after lemmatization.
data.loc[:, 'reviewText']

0         enjoy vintage book movie enjoyed reading book ...
1         book reissue old one author born 1910 era say ...
2         fairly interesting read old style terminology ...
3         never read amy brewster mystery one really hooked
4         like period piece clothing lingo enjoy mystery...
                                ...                        
982592    yasss hunny great read dre mess cherika refuse...
982593    enjoyed book beginning end far lex hoe sneaky ...
982594    great book cherika fool let man get away much ...
982595    say excellent book please believe definitely p...
982596    book everything hope alexus wise move lawd tho...
Name: reviewText, Length: 982597, dtype: object

In [38]:
# train test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data['reviewText'],
    data['rating'],
    test_size=0.30,
    random_state=42,
)

In [40]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
data.to_csv(
    path_or_buf="cleaned-text-after-preprocessing.csv",
    index=False,
)