In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CSV file while skipping bad lines
df = pd.read_csv('/content/kindle_reviews.csv', engine='python', on_bad_lines='skip')

In [3]:
#Information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982618 entries, 0 to 982617
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      982618 non-null  int64 
 1   asin            982618 non-null  object
 2   helpful         982618 non-null  object
 3   overall         982618 non-null  int64 
 4   reviewText      982596 non-null  object
 5   reviewTime      982618 non-null  object
 6   reviewerID      982618 non-null  object
 7   reviewerName    978796 non-null  object
 8   summary         982499 non-null  object
 9   unixReviewTime  982618 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 75.0+ MB


In [4]:
#Select Specific Columns
if 'reviewText' in df.columns and 'overall' in df.columns:
    df = df[['reviewText', 'overall']]
else:
    print("Columns 'reviewText' and/or 'overall' do not exist in the DataFrame")

In [5]:
#Rows and columns of the dataset
df.shape

(982618, 2)

In [6]:
#Number of null values in the dataset
df.isna().sum()

Unnamed: 0,0
reviewText,22
overall,0


In [7]:
#Unique values
df['overall'].unique()

array([5, 4, 3, 2, 1])

In [8]:
df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
5,575263
4,254013
3,96194
2,34130
1,23018


In [9]:
#Pre processing and cleaning
#positive review is 1 and negative review is 0
df['overall']= df['overall'].apply(lambda x:1 if x<3 else 1)
df.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,1
1,This book is a reissue of an old one; the auth...,1
2,This was a fairly interesting read. It had ol...,1
3,I'd never read any of the Amy Brewster mysteri...,1
4,"If you like period pieces - clothing, lingo, y...",1


In [10]:
df['overall'].unique()

array([1])

In [11]:
df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
1,982618


In [12]:
#Lower all the cases
df['reviewText'] = df['reviewText'].str.lower()

In [13]:
df.head()

Unnamed: 0,reviewText,overall
0,i enjoy vintage books and movies so i enjoyed ...,1
1,this book is a reissue of an old one; the auth...,1
2,this was a fairly interesting read. it had ol...,1
3,i'd never read any of the amy brewster mysteri...,1
4,"if you like period pieces - clothing, lingo, y...",1


In [15]:
# Clean the data by removing special characters
import re
from bs4 import BeautifulSoup

df['reviewText'] = df['reviewText'].astype(str).apply(lambda x:re.sub('[^a-z A-Z 0-9]+','',x))


In [None]:
#removing the stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
df['reviewText']  = df['reviewText'].apply(lambda x:" ".join([word for word in x.split() if word not in stopwords.words('english')]))

In [None]:
#Remove the Url()
df['reviewText'] = df['reviewText'].apply(lambda x:re.sub(r'(http|https|ftp|ssh)://([\w_]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&/~+#-])?', '', str(x))) # Added opening parenthesis before http
#Remove the html tags
import bs4 # changed to bs4
df['reviewText'] = df['reviewText'].apply(lambda x: bs4.BeautifulSoup(x, 'lxml').get_text()) # changed to bs4
#Remove any additional spaces
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

In [None]:
df.head()

Unnamed: 0,reviewText,overall
0,ienjoyvintagebooksandmoviessoienjoyedreadingth...,1
1,thisbookisareissueofanoldonetheauthorwasbornin...,1
2,thiswasafairlyinterestingreadithadoldstyleterm...,1
3,idneverreadanyoftheamybrewstermysteriesuntilth...,1
4,ifyoulikeperiodpiecesclothinglingoyouwillenjoy...,1
