## Text Preprocessing

Importing Dataset and libraries

In [1]:
# downloading and updating libraries and models
!pip install unidecode
!pip install word2number
!pip install contractions
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en
!python -m spacy download en_core_web_md

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 28.8MB/s eta 0:00:01[K     |██▊                             | 20kB 3.0MB/s eta 0:00:01[K     |████▏                           | 30kB 3.7MB/s eta 0:00:01[K     |█████▌                          | 40kB 4.1MB/s eta 0:00:01[K     |██████▉                         | 51kB 3.6MB/s eta 0:00:01[K     |████████▎                       | 61kB 4.0MB/s eta 0:00:01[K     |█████████▋                      | 71kB 4.4MB/s eta 0:00:01[K     |███████████                     | 81kB 4.8MB/s eta 0:00:01[K     |████████████▍                   | 92kB 4.9MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 4.9MB/s eta 0:00:01[K     |███████████████▏                | 112kB 4.9MB/s eta 0:00:01[K     |████████████████▌               | 122kB 4.9MB/

In [2]:
#Importing libraries
import pandas as pd # data analysis library 
from bs4 import BeautifulSoup # for removing html tags
import spacy # NLP library
import re # regular expressions library
import unidecode # converting accented text to ASCII characters
import inflect # converting numbers to words
from word2number import w2n # converting words to numbers
import contractions # handling contractions
from nltk.stem import PorterStemmer #stemming
# Shared Porter stemmer instance, used by text_preprocessing when stemming=True
stemmer=PorterStemmer()
# load spacy model, can be "en_core_web_sm" as well
# (this pipeline object is used for tokenisation, POS tags, lemmas and stop-word flags)
nlp = spacy.load("en_core_web_sm")

#Loading dataset
#Dataset taken from https://www.kaggle.com/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones
# NOTE(review): the path points into a mounted Google Drive; no mount step is
# visible in this notebook, so it presumably has to be done beforehand - confirm.
dataset= pd.read_csv('/content/drive/My Drive/Colab Notebooks/Amazon_Unlocked_Mobile.csv')

In [3]:
# Preview the first rows of the raw dataset
dataset.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
# (number of rows, number of columns) of the raw dataset
dataset.shape

(413840, 6)

Text Preprocessing

In [5]:
# Count the missing values in each column of the raw dataset
dataset.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [6]:
# Keep only the Rating and Reviews columns and drop rows where either is missing.
# The selection and dropna() are chained into one new frame instead of calling
# dropna(inplace=True) on a frame sliced from the original, which can raise
# SettingWithCopyWarning.
# The original row labels are kept (the index is NOT reset), so after this
# step positional access must use .iloc rather than dataset['Reviews'][i].
dataset = dataset[['Rating', 'Reviews']].dropna()

In [7]:
# Confirm that no missing values remain in the reduced dataset
dataset.isna().sum()

Rating     0
Reviews    0
dtype: int64

In [8]:
# Labelling the reviews as positive, negative or neutral
def label_review(rating):
    """Convert a numeric rating into a sentiment label.

    inputs:
        rating(int): Rating given by the reviewer
    returns:
        'Positive' when the rating is 4 or higher
        'Negative' when the rating is 2 or lower
        'Neutral' for anything else (i.e. a rating of 3)
    """
    if rating >= 4:
        sentiment = 'Positive'
    elif rating <= 2:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    return sentiment

In [9]:
# Add a 'Label' column holding the sentiment that label_review() derives
# from each row's 'Rating' value
dataset['Label'] = dataset['Rating'].apply(label_review)

In [10]:
# Preview the first ten rows, now including the Label column
dataset.head(10)

Unnamed: 0,Rating,Reviews,Label
0,5,I feel so LUCKY to have found this used (phone...,Positive
1,4,"nice phone, nice up grade from my pantach revu...",Positive
2,5,Very pleased,Positive
3,4,It works good but it goes slow sometimes but i...,Positive
4,4,Great phone to replace my lost phone. The only...,Positive
5,1,I already had a phone with problems... I know ...,Negative
6,2,The charging port was loose. I got that solder...,Negative
7,2,"Phone looks good but wouldn't stay charged, ha...",Negative
8,5,I originally was using the Samsung S2 Galaxy f...,Positive
9,3,It's battery life is great. It's very responsi...,Neutral


In [11]:
def strip_html_tags(text):
    """Return the visible text of `text` with all HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [12]:
def remove_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # str.split() with no arguments already discards leading/trailing
    # whitespace and splits on runs of any whitespace characters.
    words = text.split()
    return " ".join(words)

In [13]:
def remove_accented_chars(text):
    """Transliterate accented characters to their ASCII form, e.g. café -> cafe"""
    return unidecode.unidecode(text)


In [14]:
def replace_contractions(text):
    """Expand the contractions found in `text` using contractions.fix"""
    expanded_text = contractions.fix(text)
    return expanded_text

In [15]:
def lowercase_text(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [16]:
def number_words_to_numeric(token):
    """Convert a number word (e.g. "five") to its numeric value.

    inputs:
        token: either an object exposing a `.text` attribute (such as a spaCy
            token) or a plain string holding the number word.
    returns:
        The number produced by w2n.word_to_num for the token's text.
    raises:
        Whatever w2n.word_to_num raises when the text cannot be parsed as a
        number.
    """
    # text_preprocessing calls this helper with `token.text` (a str); a str has
    # no `.text` attribute, so fall back to the argument itself in that case.
    word = getattr(token, "text", token)
    return w2n.word_to_num(word)

In [17]:
def replace_numbers(token):
    """Spell out the number held in `token.text` as words, using inflect."""
    engine = inflect.engine()
    number_as_words = engine.number_to_words(token.text)
    return number_as_words

In [18]:
def remove_between_brackets(text):
    """Remove every square-bracketed span, brackets included, from `text`.

    Each match starts at a '[' and ends at the first following ']'; the
    matched span is replaced by the empty string, so surrounding spaces are
    left untouched.
    """
    # Raw string: '\[' is not a valid Python string escape, so the non-raw
    # literal triggered an invalid-escape warning on newer Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

In [19]:
# Removing "not" and "no" from list of stopwords
# so these negations survive when text_preprocessing is run with stop_words=True.
# The flag is cleared only on the lower-case vocabulary entries listed here.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False  

In [20]:
# function to apply all the preprocessing steps on the given text
def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, convert_word=False, extra_whitespace=True,
                       lemmatization=True, remove_words_with_length_less_than_2=False,
                       lowercase=True, punctuations=True,
                       remove_html=True, remove_text_between_brackets=False,
                       remove_num=True, special_chars=True,
                       stop_words=False, stemming=False):
    """Clean a raw text and return it as a list of string tokens.

    String-level steps run first, in this order: HTML stripping, whitespace
    normalisation, accent transliteration, contraction expansion,
    lower-casing and removal of [bracketed] spans. The result is tokenised
    with the module-level spaCy pipeline `nlp`, after which each token is
    either dropped or rewritten.

    inputs:
        text(str): raw text to clean
        accented_chars(bool): transliterate accented characters to ASCII
        contractions(bool): expand contractions (this parameter shadows the
            `contractions` module inside the function; the expansion itself
            is done by replace_contractions)
        convert_num(bool): turn NUM tokens into their numeric form; only has
            an effect when remove_num is False, because removed tokens are
            never rewritten
        convert_word(bool): spell out all-digit tokens as words
        extra_whitespace(bool): collapse runs of whitespace
        lemmatization(bool): replace tokens by their lemma
        remove_words_with_length_less_than_2(bool): drop tokens whose length
            is 2 or less (note: despite the name, length 2 is dropped too)
        lowercase(bool): lower-case the text
        punctuations(bool): drop tokens tagged PUNCT
        remove_html(bool): strip HTML tags
        remove_text_between_brackets(bool): drop [ ... ] spans
        remove_num(bool): drop tokens tagged NUM or consisting of numeric
            characters
        special_chars(bool): drop tokens tagged SYM
        stop_words(bool): drop stop words (NUM tokens are exempt)
        stemming(bool): replace tokens by their Porter stem
    returns:
        list[str]: the tokens that were kept, after rewriting.

    Each kept token gets at most one rewrite, with this precedence:
    convert_word, convert_num, stemming, lemmatization.
    """
    if remove_html:  # remove html tags
        text = strip_html_tags(text)
    if extra_whitespace:  # remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars:  # transliterate accented characters
        text = remove_accented_chars(text)
    if contractions:  # expand contractions
        text = replace_contractions(text)
    if lowercase:  # convert all characters to lowercase
        text = text.lower()
    if remove_text_between_brackets:  # remove [ ... ] spans
        text = remove_between_brackets(text)

    clean_text = []

    for token in nlp(text):  # tokenise text
        # ---- decide whether the token is dropped ----
        # remove stop words (numbers are handled by remove_num instead)
        if stop_words and token.is_stop and token.pos_ != 'NUM':
            continue
        # remove punctuations
        if punctuations and token.pos_ == 'PUNCT':
            continue
        # remove special characters
        if special_chars and token.pos_ == 'SYM':
            continue
        # remove numbers
        if remove_num and (token.pos_ == 'NUM' or token.text.isnumeric()):
            continue
        # remove words having length <= 2
        if remove_words_with_length_less_than_2 and len(token) <= 2:
            continue

        # ---- rewrite the kept token (first matching rule wins) ----
        edit = token.text
        if convert_word and token.text.isdigit():
            # convert numerics to words
            edit = replace_numbers(token)
        elif convert_num and token.pos_ == 'NUM':
            # convert number words to numerics; kept as a string so the
            # returned list stays homogeneous
            try:
                edit = str(number_words_to_numeric(token))
            except ValueError:
                # not parseable as a number word: keep the text unchanged
                edit = token.text
        elif stemming:
            # convert tokens to base form by stemming
            edit = stemmer.stem(token.text)
        elif lemmatization and token.lemma_ != "-PRON-":
            # convert tokens to base form by lemmatization; "-PRON-" is
            # presumably the pronoun placeholder lemma of older spaCy models
            edit = token.lemma_

        # append tokens edited and not removed to list
        if edit != "":
            clean_text.append(edit)
    return clean_text

In [21]:
# Applying preprocessing (with default options) on a hand-written sample text
# that contains accented characters, punctuation and symbols, then printing
# the original text followed by the resulting token list
sample_text='The main use of diacritical marks in the Latin script is to change the sound-values of the letters to which they are added. Examples are the diaereses in the borrowed French words naïve and Noël, which show that the vowel with the diaeresis mark is pronounced separately from the preceding vowel; the acute and grave accents, which can indicate that a final vowel is to be pronounced, as in saké and poetic breathèd; and the cedilla under the "c" in the borrowed French word façade, which shows it is pronounced /s/ rather than /k/.'
preprocessed_text= text_preprocessing(sample_text)
print(sample_text)
print(preprocessed_text)

The main use of diacritical marks in the Latin script is to change the sound-values of the letters to which they are added. Examples are the diaereses in the borrowed French words naïve and Noël, which show that the vowel with the diaeresis mark is pronounced separately from the preceding vowel; the acute and grave accents, which can indicate that a final vowel is to be pronounced, as in saké and poetic breathèd; and the cedilla under the "c" in the borrowed French word façade, which shows it is pronounced /s/ rather than /k/.
['the', 'main', 'use', 'of', 'diacritical', 'mark', 'in', 'the', 'latin', 'script', 'be', 'to', 'change', 'the', 'sound', 'value', 'of', 'the', 'letter', 'to', 'which', 'they', 'be', 'add', 'example', 'be', 'the', 'diaeresis', 'in', 'the', 'borrow', 'french', 'word', 'naive', 'and', 'noel', 'which', 'show', 'that', 'the', 'vowel', 'with', 'the', 'diaeresis', 'mark', 'be', 'pronounce', 'separately', 'from', 'the', 'precede', 'vowel', 'the', 'acute', 'and', 'grave'

In [22]:
# Applying preprocessing on first 100 entries of dataset.
# .iloc selects by position: rows with missing values were dropped earlier, so
# the integer labels 0..99 are not guaranteed to exist and label-based
# dataset['Reviews'][i] could raise KeyError.
preprocessed_reviews = [
    text_preprocessing(review) for review in dataset['Reviews'].iloc[:100]
]
# Show only the first two token lists instead of dumping all 100
preprocessed_reviews[:2]

[['i',
  'feel',
  'so',
  'lucky',
  'to',
  'have',
  'find',
  'this',
  'use',
  'phone',
  'to',
  'us',
  '&',
  'not',
  'use',
  'hard',
  'at',
  'all',
  'phone',
  'on',
  'line',
  'from',
  'someone',
  'who',
  'upgrade',
  'and',
  'sell',
  'this',
  'one',
  'my',
  'son',
  'like',
  'his',
  'old',
  'one',
  'that',
  'finally',
  'fall',
  'apart',
  'after',
  'year',
  'and',
  'do',
  'not',
  'want',
  'an',
  'upgrade',
  'thank',
  'you',
  'seller',
  'we',
  'really',
  'appreciate',
  'it',
  '&',
  'your',
  'honesty',
  're',
  'say',
  'use',
  'phone.i',
  'recommend',
  'this',
  'seller',
  'very',
  'highly',
  '&',
  'would',
  'but',
  'from',
  'them',
  'again'],
 ['nice',
  'phone',
  'nice',
  'up',
  'grade',
  'from',
  'my',
  'pantach',
  'revue',
  'very',
  'clean',
  'set',
  'up',
  'and',
  'easy',
  'set',
  'up',
  'never',
  'have',
  'an',
  'android',
  'phone',
  'but',
  'they',
  'be',
  'fantastic',
  'to',
  'say',
  'the',


In [23]:
# Check the type of the first token of the first preprocessed review
type(preprocessed_reviews[0][0])

str

## Word2vec taking first 500 preprocessed reviews as corpus

In [24]:
# Creating list for corpus: one list of tokens per review.
# Each review goes through text_preprocessing with its default options
# (lowercase, remove numbers, transliterate accented characters, remove
# special characters, lemmatization).
# .iloc selects the first 500 rows by position; label-based
# dataset['Reviews'][i] could raise KeyError because rows with missing values
# were dropped earlier without resetting the index.
corpus = [text_preprocessing(review) for review in dataset['Reviews'].iloc[:500]]
# Show only the first two token lists instead of dumping the whole corpus
corpus[:2]

[['i',
  'feel',
  'so',
  'lucky',
  'to',
  'have',
  'find',
  'this',
  'use',
  'phone',
  'to',
  'us',
  '&',
  'not',
  'use',
  'hard',
  'at',
  'all',
  'phone',
  'on',
  'line',
  'from',
  'someone',
  'who',
  'upgrade',
  'and',
  'sell',
  'this',
  'one',
  'my',
  'son',
  'like',
  'his',
  'old',
  'one',
  'that',
  'finally',
  'fall',
  'apart',
  'after',
  'year',
  'and',
  'do',
  'not',
  'want',
  'an',
  'upgrade',
  'thank',
  'you',
  'seller',
  'we',
  'really',
  'appreciate',
  'it',
  '&',
  'your',
  'honesty',
  're',
  'say',
  'use',
  'phone.i',
  'recommend',
  'this',
  'seller',
  'very',
  'highly',
  '&',
  'would',
  'but',
  'from',
  'them',
  'again'],
 ['nice',
  'phone',
  'nice',
  'up',
  'grade',
  'from',
  'my',
  'pantach',
  'revue',
  'very',
  'clean',
  'set',
  'up',
  'and',
  'easy',
  'set',
  'up',
  'never',
  'have',
  'an',
  'android',
  'phone',
  'but',
  'they',
  'be',
  'fantastic',
  'to',
  'say',
  'the',


In [27]:
#importing necessary pytorch libraries
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [28]:
# creating vocabulary: every distinct token of the corpus, in order of first
# appearance. dict.fromkeys de-duplicates while preserving insertion order and
# avoids the quadratic cost of testing membership in a growing list.
vocabulary = list(
    dict.fromkeys(token_text for sentence in corpus for token_text in sentence)
)
# creating dictionaries to get words for index and index for words in our vocabulary
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)

In [31]:
# context window size = 2 (tokens looked at on each side of the center word)
window_size = 2
# (center word index, context word index) pairs collected over the corpus
idx_pairs = []
for sentence in corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # every position in turn acts as the center word
    for center_word_pos, center_word_idx in enumerate(indices):
        # clamp the window so it never leaves the sentence
        first_pos = max(center_word_pos - window_size, 0)
        last_pos = min(center_word_pos + window_size, sentence_len - 1)
        for context_word_pos in range(first_pos, last_pos + 1):
            # the center word is not its own context
            if context_word_pos == center_word_pos:
                continue
            idx_pairs.append((center_word_idx, indices[context_word_pos]))
# converting into numpy array so the training loop can iterate over its rows
idx_pairs = np.array(idx_pairs)

In [32]:
# Only the last expression of a cell is displayed, so this cell shows
# vocabulary_size; the bare idx_pairs line above it produces no output.
idx_pairs
vocabulary_size

2070

In [33]:
# input layer: center word is encoded as a one hot vector of length vocabulary_size
def get_input_layer(word_idx):
    """Return the one-hot float vector for the word at index `word_idx`.

    The vector has `vocabulary_size` entries (read from the module-level
    variable), all 0.0 except a 1.0 at position `word_idx`.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

In [35]:
# Trains a two-matrix skip-gram style model with plain per-pair SGD:
# a one-hot center word is projected by W1, scored against every vocabulary
# word by W2, and the log-softmax score of the observed context word is maximised.
# NOTE(review): no random seed is set, so the initial weights (and the printed
# losses) differ between runs.
# NOTE(review): torch.autograd.Variable is presumably a deprecated wrapper in
# current PyTorch - confirm against the installed version.
embedding_dims = 10
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True) # input -> hidden weights, shape (embedding_dims, vocabulary_size)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True) # hidden -> output weights, shape (vocabulary_size, embedding_dims)
num_epochs = 100
learning_rate = 0.001
# training our network
for epo in range(num_epochs):
    # running sum of the per-pair losses of this epoch
    loss_val = 0
    for data, target in idx_pairs:
        # input: one-hot vector of the center word
        x = Variable(get_input_layer(data)).float()
        # expected output: index of the context word, as a 1-element long tensor
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x) # hidden layer: linear projection only, no activation function
        z2 = torch.matmul(W2, z1) # output layer: one raw score per vocabulary word, no activation function
    
        log_softmax = F.log_softmax(z2, dim=0) # log softmax applied on top of output layer
        # nll_loss computes negative-log-likelihood on logsoftmax.
        # y_true is the context word - we want its log-probability to be as high as possible
        # because the pair (x, y_true) comes from the training data, i.e. it is a real (center, context) pair.
        # view(1,-1) adds the batch dimension that nll_loss expects.
        loss = F.nll_loss(log_softmax.view(1,-1), y_true) 
        loss_val += loss.data
        # perform backpropagation
        loss.backward()
        # SGD optimization: update the raw weight data in place
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data
        # reset the gradients, which backward() would otherwise accumulate across pairs
        W1.grad.data.zero_()
        W2.grad.data.zero_()
    # printing the mean loss per pair on every 10th epoch
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 10.527326583862305
Loss at epo 10: 7.090907096862793
Loss at epo 20: 6.441279411315918
Loss at epo 30: 6.154877662658691
Loss at epo 40: 5.993714809417725
Loss at epo 50: 5.889133930206299
Loss at epo 60: 5.813870906829834
Loss at epo 70: 5.7561211585998535
Loss at epo 80: 5.709997653961182
Loss at epo 90: 5.672102928161621


In [37]:
# Vector learned for the word "try": the row of the output matrix W2 at the
# word's vocabulary index. (In the training loop the one-hot center word
# selects a column of W1, while rows of W2 score candidate context words.)
W2[word2idx['try']]

tensor([ 0.2888,  1.4359,  0.8104,  0.2016,  0.9002,  0.6483, -1.1011, -1.4841,
         0.1558, -0.2580], grad_fn=<SelectBackward>)

In [42]:
def cosine_similarity(v,u):
    """Cosine similarity of two 1-D tensors: their dot product divided by
    the product of their norms."""
    dot_product = torch.dot(v, u)
    norm_product = v.norm() * u.norm()
    return dot_product / norm_product

'then'

In [None]:
# finding the most similar word for word try
query_word = "try"
query_idx = word2idx[query_word]
# detach(): similarities are only inspected here, no gradients are needed
query_vector = W2[query_idx].detach()
# cosine similarity between the query word and every vocabulary word, as plain
# floats, in vocabulary order
similar_words = [
    cosine_similarity(query_vector, W2[word2idx[word]].detach()).item()
    for word in vocabulary
]
# Pick the best-scoring word other than the query itself. The query word is
# excluded explicitly instead of assuming it always sorts into first place.
best_idx = max(
    (idx for idx in range(len(vocabulary)) if idx != query_idx),
    key=similar_words.__getitem__,
)
second_most = similar_words[best_idx]
vocabulary[best_idx]