## Text Preprocessing

Importing Dataset and libraries

In [64]:
# Install or upgrade the third-party packages and the spaCy English model
# (en_core_web_md) that the rest of this notebook imports and loads.
# NOTE(review): inflect, nltk, bs4 and pandas are imported below but are not
# installed here -- presumably they are preinstalled in the runtime; confirm.
!pip install unidecode
!pip install word2number
!pip install contractions
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_md

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.3.2)
Requirement already up-to-date: spacy-lookups-data in /usr/local/lib/python3.6/dist-packages (0.3.2)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [65]:
#Importing libraries
import pandas as pd # data analysis library 
from bs4 import BeautifulSoup # for removing html tags
import spacy # NLP library
import re # regular expressions library
import unidecode # converting accented text to ASCII characters
import inflect # converting numbers to words
from word2number import w2n # converting words to numbers
import contractions # handling contractions
from nltk.stem import PorterStemmer #stemming
# Stemmer instance used by text_preprocessing() when stemming is enabled
stemmer = PorterStemmer()

# Name of the spaCy model to load; "en_core_web_sm" can be used as well
SPACY_MODEL = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL)

# Loading dataset
# Dataset taken from https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones
# Location of the CSV file; change this single constant to run the notebook
# against a copy stored elsewhere.
DATASET_PATH = '/content/drive/My Drive/Colab Notebooks/Amazon_Unlocked_Mobile.csv'
dataset = pd.read_csv(DATASET_PATH)

In [66]:
# Preview the first rows to check that the CSV was read as expected
dataset.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [67]:
# (number of rows, number of columns) of the loaded dataset
dataset.shape

(413840, 6)

Text Preprocessing

In [68]:
# Count the null values in each column of the loaded dataset
dataset.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [69]:
# Keep only the Rating and Reviews columns and drop every row in which either
# of them is null. dropna() returns a new frame, so the selection is never
# mutated in place (no inplace=True on a frame derived from another frame).
dataset = dataset[['Rating', 'Reviews']].dropna()

In [70]:
# Confirm that no null values remain in the two columns that were kept
dataset.isna().sum()

Rating     0
Reviews    0
dtype: int64

In [71]:
# Labelling the reviews as positive, negative or neutral
def label_review(rating):
    """Convert a numeric rating into a sentiment label.

    inputs:
        rating(int): Rating given by the reviewer
    returns:
        'Positive' when the rating is 4 or higher
        'Negative' when the rating is 2 or lower
        'Neutral' otherwise (i.e. a rating of 3)
    Description:
        The function converts the rating to one of the sentiments
        positive, negative and neutral.
    """
    if rating >= 4:
        return 'Positive'
    # Anything below 4 is either clearly negative (<= 2) or falls in between.
    return 'Negative' if rating <= 2 else 'Neutral'

In [72]:
# Run label_review() on every rating and store the resulting sentiment
# labels in a new 'Label' column of the dataset
review_labels = dataset['Rating'].apply(label_review)
dataset['Label'] = review_labels

In [73]:
# Show the first ten rows to check the generated labels against the ratings
dataset.head(10)

Unnamed: 0,Rating,Reviews,Label
0,5,I feel so LUCKY to have found this used (phone...,Positive
1,4,"nice phone, nice up grade from my pantach revu...",Positive
2,5,Very pleased,Positive
3,4,It works good but it goes slow sometimes but i...,Positive
4,4,Great phone to replace my lost phone. The only...,Positive
5,1,I already had a phone with problems... I know ...,Negative
6,2,The charging port was loose. I got that solder...,Negative
7,2,"Phone looks good but wouldn't stay charged, ha...",Negative
8,5,I originally was using the Samsung S2 Galaxy f...,Positive
9,3,It's battery life is great. It's very responsi...,Neutral


In [74]:
def strip_html_tags(text):
    """Return the text content of `text` with HTML tags removed.

    The input is parsed with BeautifulSoup's "html.parser" and the text
    pieces are joined with a single space as separator.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [75]:
def remove_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # split() without arguments ignores leading/trailing whitespace and
    # treats any run of whitespace as a single separator.
    words = text.split()
    return " ".join(words)

In [76]:
def remove_accented_chars(text):
    """Return `text` passed through unidecode.unidecode, e.g. for café.

    Used to replace accented characters with ASCII characters.
    """
    return unidecode.unidecode(text)


In [77]:
def replace_contractions(text):
    """Return `text` after running it through contractions.fix."""
    expanded = contractions.fix(text)
    return expanded

In [78]:
def lowercase_text(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [79]:
def number_words_to_numeric(token):
    """Convert a word representing a number to its numeric value.

    inputs:
        token: either an object exposing the word as `.text` (such as a
               spaCy token) or the word itself as a plain string. Both are
               accepted so that a caller holding only the text does not
               fail with an AttributeError.
    returns:
        The value returned by w2n.word_to_num for the word.
    raises:
        Whatever w2n.word_to_num raises for text it cannot convert.
    """
    # Fall back to the argument itself when it has no `.text` attribute.
    word = getattr(token, "text", token)
    return w2n.word_to_num(word)

In [80]:
def replace_numbers(token):
    """Return the number in `token.text` written out in words.

    A new inflect engine is created on every call and its
    number_to_words() result is returned unchanged.
    """
    engine = inflect.engine()
    words = engine.number_to_words(token.text)
    return words

In [81]:
def remove_between_brackets(text):
    """Remove square brackets and anything between them.

    Every span that starts at a '[' and runs up to the next ']' is deleted.
    A '[' without a following ']' is left untouched.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes,
    # so a non-raw literal triggers an invalid-escape-sequence warning.
    return re.sub(r'\[[^]]*\]', '', text)

In [82]:
# Keep the negations "no" and "not": clear their stop-word flag in the
# spaCy vocabulary so they are no longer treated as stop words
deselect_stop_words = ['no', 'not']
for negation in deselect_stop_words:
    lexeme = nlp.vocab[negation]
    lexeme.is_stop = False

In [83]:
# function to apply all the preprocessing steps on the given text
def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, convert_word = False, extra_whitespace=True,
                       lemmatization=True, remove_words_with_length_less_than_2=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_text_between_brackets = True, remove_num=True, special_chars=True,
                       stop_words=True, stemming = False):
    """Preprocess `text` and return the list of tokens that were kept.

    Every step is switched on by default except stemming and converting
    numerics to words. Each flag is evaluated by truthiness.

    String-level steps, applied in this order before tokenisation:
        remove_html: strip HTML tags
        extra_whitespace: collapse extra whitespace
        accented_chars: replace accented characters
        contractions: expand contractions (this parameter shadows the
            `contractions` module inside this function only)
        lowercase: convert the text to lower case
        remove_text_between_brackets: remove [...] spans

    Token-level removal (a token matching any enabled rule is dropped):
        stop_words: stop words, unless the token is tagged NUM
        punctuations: tokens tagged PUNCT
        special_chars: tokens tagged SYM
        remove_num: tokens tagged NUM or whose text is numeric
        remove_words_with_length_less_than_2: tokens of length <= 2

    Token-level transformation (only the first applicable one is used):
        convert_word: digit-only tokens are written out in words
        convert_num: tokens tagged NUM are converted to their numeric value
            (stored as a string); tokens that cannot be converted are kept
            as they are
        stemming: Porter stem of the token
        lemmatization: lemma of the token, unless it is "-PRON-"

    Note that with remove_num enabled, numeric tokens are dropped before
    convert_word / convert_num can apply to them.

    returns:
        list of the kept (and possibly transformed) tokens as strings
    """
    if remove_html:  # remove html tags
        text = strip_html_tags(text)
    if extra_whitespace:  # remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars:  # remove accented characters
        text = remove_accented_chars(text)
    if contractions:  # expand contractions
        text = replace_contractions(text)
    if lowercase:  # convert all characters to lowercase
        text = lowercase_text(text)
    if remove_text_between_brackets:  # remove square brackets and anything in between them
        text = remove_between_brackets(text)

    doc = nlp(text)  # tokenise text

    clean_text = []

    for token in doc:
        # --- removal rules: skip the token as soon as one of them matches ---
        # remove stop words (numbers are handled by remove_num instead)
        if stop_words and token.is_stop and token.pos_ != 'NUM':
            continue
        # remove punctuations
        if punctuations and token.pos_ == 'PUNCT':
            continue
        # remove special characters
        if special_chars and token.pos_ == 'SYM':
            continue
        # remove numbers
        if remove_num and (token.pos_ == 'NUM' or token.text.isnumeric()):
            continue
        # remove words having length <= 2
        if remove_words_with_length_less_than_2 and len(token) <= 2:
            continue

        # --- transformation: the first applicable step wins ---
        edit = token.text
        # convert numeric to words
        if convert_word and token.text.isdigit():
            edit = replace_numbers(token)
        # convert number words to numeric numbers
        elif convert_num and token.pos_ == 'NUM':
            try:
                # str() keeps the returned token list homogeneous
                edit = str(number_words_to_numeric(token))
            except ValueError:
                # not a number the converter understands: keep the original text
                edit = token.text
        # convert tokens to base form by stemming
        elif stemming:
            edit = stemmer.stem(token.text)
        # convert tokens to base form by lemmatization
        elif lemmatization and token.lemma_ != "-PRON-":
            edit = token.lemma_

        # append tokens edited and not removed to list
        if edit != "":
            clean_text.append(edit)
    return clean_text

In [84]:
# Run the preprocessing on the review labelled 0 and print the original
# text followed by the resulting token list
first_review = dataset['Reviews'][0]
sample_text = text_preprocessing(first_review)
print(first_review)
print(sample_text)

I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!
['feel', 'lucky', 'find', 'phone', 'not', 'hard', 'phone', 'line', 'upgrade', 'sell', 'son', 'like', 'old', 'finally', 'fall', 'apart', 'year', 'not', 'want', 'upgrade', 'thank', 'seller', 'appreciate', 'honesty', 'say', 'phone.i', 'recommend', 'seller', 'highly']


In [85]:
# Preprocess only the first 100 reviews of the dataset and display the
# resulting series of token lists
first_hundred_reviews = dataset['Reviews'].head(100)
preprocessed_reviews = first_hundred_reviews.apply(text_preprocessing)
preprocessed_reviews

0     [feel, lucky, find, phone, not, hard, phone, l...
1     [nice, phone, nice, grade, pantach, revue, cle...
2                                             [pleased]
3             [work, good, go, slow, good, phone, love]
4     [great, phone, replace, lost, phone, thing, vo...
                            ...                        
95                                                   []
96                         [not, connect, gsm, network]
97    [cell, phone, exceed, expectation, user, frien...
98    [pro, work, fine, easy, use, not, heavy.con, a...
99    [go, lot, review, different, phone, buy, find,...
Name: Reviews, Length: 100, dtype: object