# Read source data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("Review.csv")
data

Unnamed: 0,Review
0,I like this books very much!!! It is VERY INTE...
1,Do not like this book. so boring 2. Too length...


# Remove punctuation and convert the documents to lowercase using the string library

In [3]:
    #for string manipulation
import string

In [4]:
    #constant in the string module listing the punctuation characters to remove
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
    #defining the function to remove punctuations in the documents
    
def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    A character is dropped when it appears in ``string.punctuation``; all
    other characters (letters, digits, whitespace) are kept in their
    original order.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only the non-punctuation characters.
    """
    # Assemble the result with one join rather than repeated ``+=``, which
    # can degrade to quadratic time on long documents.
    return "".join(ch for ch in text if ch not in string.punctuation)

In [9]:
    #applying the remove_punctuation function to the 'Review' column and storing the result in a new column 'clean_punctuation'
data['clean_punctuation'] = data['Review'].apply(remove_punctuation)
data

Unnamed: 0,Review,clean_punctuation
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy


In [10]:
    #convert all text to lower case
data['clean_lower'] = data['clean_punctuation'].str.lower()
data

Unnamed: 0,Review,clean_punctuation,clean_lower
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy


# Remove numbers using re.sub() from the regular expression library

In [11]:
import re

In [13]:
    #create a function to remove digits and hyphens
def remove_numbers(text):
    """Return ``text`` with every digit and hyphen removed.

    Surrounding characters are left untouched, so removing a standalone
    number leaves the whitespace on both sides of it in place.

    Args:
        text: The string to clean.

    Returns:
        A new string with all digit and ``-`` characters deleted.
    """
    # Raw string literal: in a normal literal "\d" is an invalid string
    # escape and triggers a warning on modern Python versions.
    return re.sub(r"[\d-]", "", text)

In [15]:
    #applying the remove_numbers function to the 'clean_lower' column and storing the result in a new column 'clean_number'
data['clean_number'] = data['clean_lower'].apply(remove_numbers)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy


# Break down the words into tokens using nltk library

In [16]:
import nltk #NLP library
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
from nltk.tokenize import word_tokenize

In [18]:
data['token_data'] = data['clean_number'].apply(word_tokenize)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver..."
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l..."


# Remove stopwords

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [20]:
    #get the English stopwords from the library
stopwords = nltk.corpus.stopwords.words('english')

In [21]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stopwords``.

    Args:
        text: An iterable of tokens.

    Returns:
        A list of the tokens, in order, that are absent from the
        module-level ``stopwords`` collection.
    """
    return [token for token in text if token not in stopwords]

In [23]:
data['clean_xstopwords'] = data['token_data'].apply(remove_stopwords)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]"


# Perform word stemming using Porter Stemmer

In [24]:
from nltk.stem.porter import PorterStemmer

In [25]:
porter_stemmer = PorterStemmer()

In [26]:
def stemming(text):
    """Stem every word in ``text`` with the module-level ``porter_stemmer``.

    Args:
        text: An iterable of words.

    Returns:
        A list holding the result of ``porter_stemmer.stem`` for each word,
        in the original order.
    """
    return [porter_stemmer.stem(token) for token in text]

In [27]:
data['clean_stemmed'] = data['clean_xstopwords'].apply(stemming)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_stemmed
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]","[like, book, bore, lengthi]"


# Perform word lemmatization using WordNetLemmatizer()

In [28]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...


True

In [29]:
from nltk.stem import WordNetLemmatizer

In [30]:
wordnet_lemmatizer = WordNetLemmatizer()

In [31]:
def lemmatizer(text):
    """Lemmatize every word in ``text`` with ``wordnet_lemmatizer``.

    Args:
        text: An iterable of words.

    Returns:
        A list holding the result of ``wordnet_lemmatizer.lemmatize`` for
        each word, in the original order.
    """
    lemmatize = wordnet_lemmatizer.lemmatize
    return list(map(lemmatize, text))

In [32]:
data['clean_lemmatized1'] = data['clean_xstopwords'].apply(lemmatizer)

In [33]:
data['clean_lemmatized2'] = data['clean_stemmed'].apply(lemmatizer)
data

Unnamed: 0,Review,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_stemmed,clean_lemmatized1,clean_lemmatized2
0,I like this books very much!!! It is VERY INTE...,I like this books very much It is VERY INTERES...,i like this books very much it is very interes...,i like this books very much it is very interes...,"[i, like, this, books, very, much, it, is, ver...","[like, books, much, interesting]","[like, book, much, interest]","[like, book, much, interesting]","[like, book, much, interest]"
1,Do not like this book. so boring 2. Too length...,Do not like this book so boring 2 Too lengthy,do not like this book so boring 2 too lengthy,do not like this book so boring too lengthy,"[do, not, like, this, book, so, boring, too, l...","[like, book, boring, lengthy]","[like, book, bore, lengthi]","[like, book, boring, lengthy]","[like, book, bore, lengthi]"
