# Question 3 - Data Preprocessing
## Libraries for question 3

In [45]:
# Data Wrangling
import numpy as np
import pandas as pd

# NLP
import nltk
import re
#import text_normalizer as tn
import contractions
import unicodedata
from nltk.corpus import wordnet
from textblob import Word

# Math
from scipy import stats

# Data Visualization
import seaborn as sns
from matplotlib import pyplot as plt

<b> Other hints </b></n>
Most of the methods and answers are in the class lab 03
3iv is to get us to come up with conclusion manually without any models or sentiments. What kind of models or application can we try or apply to improve the process.

For question 1/when importing, we should also specify the dtype for the dataframe to ensure that the data are imported correctly.

In [34]:
# Import data with specified dtypes
# Col data types
type_dict = {'asins': 'string',
            'name': 'string',
            'brand': 'string',
            'categories': 'string',
            'reviews.doRecommend': bool,
            'reviews.numHelpful': 'int64',
             'reviews.rating': 'int64',
            'reviews.text': 'string',
            'reviews.title': 'string'}

# import dataframe with specified datatypes
df = pd.read_csv('./data/q1.csv', dtype = type_dict)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34128 entries, 0 to 34127
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   asins                34128 non-null  string
 1   name                 27440 non-null  string
 2   brand                34128 non-null  string
 3   categories           34128 non-null  string
 4   reviews.doRecommend  34128 non-null  bool  
 5   reviews.numHelpful   34128 non-null  int64 
 6   reviews.rating       34128 non-null  int64 
 7   reviews.text         34128 non-null  string
 8   reviews.title        34128 non-null  string
dtypes: bool(1), int64(2), string(6)
memory usage: 2.1 MB


In [58]:
# 3a. Cleaning review
sample_text = f"těxt: You're happy  nOwwww.\nYes 54088\n\nSóměTimes   Happier THan  Usal!"
print(sample_text)

těxt: You're happy  nOwwww.
Yes 54088

SóměTimes   Happier THan  Usal!


In [59]:
# Helper function to revert accented characters to normal text
def remove_accented_chars(sent):    
    sent = unicodedata.normalize('NFKD', sent).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return sent

sample_fixed = remove_accented_chars(sample_text)
print(sample_fixed)

text: You're happy  nOwwww.
Yes 54088

SomeTimes   Happier THan  Usal!


In [60]:
# Helper function to fix contractions
def fix_contractions(sent):
    sent = contractions.fix(sent)
    return sent

sample_fixed = fix_contractions(sample_text)
print(sample_fixed)

těxt: You are happy  nOwwww.
Yes 54088

SóměTimes   Happier THan  Usal!


In [61]:
# Helper function to convert lowercase, remove additional whitespaces and additional newlines
def lower_cleaning(sent):
    sent = sent.lower()
    return sent

sample_fixed = lower_cleaning(sample_text)
print(sample_fixed)

těxt: you're happy  nowwww.
yes 54088

sómětimes   happier than  usal!


In [62]:
# Helper function to remove additional whitespaces
def whitespaces_cleaning(sent):
    sent = re.sub(" +", " ", sent)
    return sent

sample_fixed = whitespaces_cleaning(sample_text)
print(sample_fixed)

těxt: You're happy nOwwww.
Yes 54088

SóměTimes Happier THan Usal!


In [63]:
# Helper function to remove additional newlines
def newlines_cleaning(sent):
    sent = re.sub("\n+", "\n", sent)
    return sent

sample_fixed = newlines_cleaning(sample_text)
print(sample_fixed)

těxt: You're happy  nOwwww.
Yes 54088
SóměTimes   Happier THan  Usal!


In [64]:
def remove_special_characters(sent, remove_digits = False):
    
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    sent = re.sub(pattern, '', sent)
    return sent

sample_text2 = remove_accented_chars(sample_text)
print(remove_special_characters(sample_text2, True))
print(remove_special_characters(sample_text2, False))

text Youre happy  nOwwww
Yes 

SomeTimes   Happier THan  Usal
text Youre happy  nOwwww
Yes 54088

SomeTimes   Happier THan  Usal


In [65]:
def remove_repeated_characters(tokens):
    
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    
    def replace(old_word):
        
        if wordnet.synsets(old_word):
            
            return old_word
            
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    
    correct_tokens = [replace(word) for word in tokens]
    
    return correct_tokens

default_wt = nltk.word_tokenize
sample_tokens = default_wt(sample_text2)
print(remove_repeated_characters(sample_tokens))

['text', ':', 'You', "'re", 'happy', 'nOw', '.', 'Yes', '5408', 'SomeTimes', 'Happier', 'THan', 'Usal', '!']


In [66]:
def correct_spellings(tokens):
    
    correct_tokens = [Word(word).correct() for word in tokens]
    return correct_tokens

default_wt = nltk.word_tokenize
sample_text3 = fix_contractions(sample_text2)
sample_tokens = default_wt(sample_text3)
sample_tokens = remove_repeated_characters(sample_tokens)
print(correct_spellings(sample_tokens))

['text', ':', 'You', 'are', 'happy', 'now', '.', 'Yes', '5408', 'sometimes', 'Happier', 'an', 'Sal', '!']


In [68]:
# Main function for cleaning each sentence
def clean_review(review, accent = True, contract = True, lower = True, whitespaces = True,
                 newlines = True, remove_special = True, remove_digit = True,
                 repeat = True, spelling = True):
        
        # Clean accented characters
        if accent:
            review = remove_accented_chars(review)
            
        # Expand contractions
        if contract:
            review = fix_contractions(review)
            
        # lowercase characters
        if lower:
            review = lower_cleaning(review)
            
        # extra whitespaces
        if whitespaces:
            review = whitespaces_cleaning(review)
        
        # extra newlines
        if newlines:
            review = newlines_cleaning(review)
            
        # Special characters & digits
        if remove_special and remove_digit:
            review = remove_special_characters(review, True)
        elif remove_special and not remove_digit:
            review = remove_special_characters(review, False)
            
        # Tokenize review with nltk word tokenizer
        default_wt = nltk.word_tokenize
        tokens = default_wt(review)
        
        # Remove repeating characters from tokens
        if repeat:
            tokens = remove_repeated_characters(tokens)
            
        # Correct spellings
        if spelling:
            tokens = correct_spellings(tokens)
            
        return tokens
            
print(clean_review(sample_text))

['text', 'you', 'are', 'happy', 'now', 'yes', 'sometimes', 'happier', 'than', 'usual']


In [69]:
# 3b. Saving as column text_cleaned
cleaned_df = df.copy()
cleaned_df['reviews.text_cleaned'] = cleaned_df['reviews.text'].apply(clean_review)
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34128 entries, 0 to 34127
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   asins                 34128 non-null  string
 1   name                  27440 non-null  string
 2   brand                 34128 non-null  string
 3   categories            34128 non-null  string
 4   reviews.doRecommend   34128 non-null  bool  
 5   reviews.numHelpful    34128 non-null  int64 
 6   reviews.rating        34128 non-null  int64 
 7   reviews.text          34128 non-null  string
 8   reviews.title         34128 non-null  string
 9   reviews.text_cleaned  34128 non-null  object
dtypes: bool(1), int64(2), object(1), string(6)
memory usage: 2.4+ MB


In [70]:
cleaned_df.sample(5)

Unnamed: 0,asins,name,brand,categories,reviews.doRecommend,reviews.numHelpful,reviews.rating,reviews.text,reviews.title,reviews.text_cleaned
27501,"B00L9EPT8O,B01E6AO69U",,Amazon,"Stereos,Remote Controls,Amazon Echo,Audio Dock...",True,0,5,"I use I daily for news, weather and music. It'...",Great voice assistant,"[i, use, i, daily, for, news, weather, and, mu..."
13353,B018Y229OU,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",Amazon,"Fire Tablets,Tablets,Computers & Tablets,All T...",True,0,5,Great tablet for kids I would buy more if I ne...,Great tablet for the money,"[great, tablet, for, kiss, i, would, buy, more..."
18193,B00OQVZDJM,Amazon Kindle Paperwhite - eBook reader - 4 GB...,Amazon,"Walmart for Business,Office Electronics,Tablet...",True,1,5,"It is very easy to read, no glare at all, easy...",I love this Kindle!,"[it, is, very, easy, to, read, no, glare, at, ..."
4258,B018Y229OU,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",Amazon,"Fire Tablets,Tablets,Computers & Tablets,All T...",True,0,3,Somewhat substandard. iPads are certainly bett...,An OK Tablet,"[somewhat, substandard, pads, are, certainly, ..."
33412,B00U3FPN4U,,Amazon Fire Tv,"Back To College,College Electronics,College Tv...",True,0,5,I bought this main to give me control of my Ec...,Great companion to Amazon Echo(Alexa),"[i, bought, this, main, to, give, me, control,..."


In [77]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34128 entries, 0 to 34127
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   asins                 34128 non-null  string
 1   name                  27440 non-null  string
 2   brand                 34128 non-null  string
 3   categories            34128 non-null  string
 4   reviews.doRecommend   34128 non-null  bool  
 5   reviews.numHelpful    34128 non-null  int64 
 6   reviews.rating        34128 non-null  int64 
 7   reviews.text          34128 non-null  string
 8   reviews.title         34128 non-null  string
 9   reviews.text_cleaned  34128 non-null  object
dtypes: bool(1), int64(2), object(1), string(6)
memory usage: 2.4+ MB


In [82]:
# 3c. Export as q3.csv
final_df.to_csv('./data/q3.csv', index = False)

TypeError: __init__() got an unexpected keyword argument 'line_terminator'

In [127]:
# Identify examples where there are no text in reviews
regex = r'^\W+$'
zero_text_df = final_df[final_df['reviews.text'].str.match(regex)]

zero_text_df.head()

Unnamed: 0,asins,name,brand,categories,reviews.doRecommend,reviews.numHelpful,reviews.rating,reviews.text,reviews.title,reviews.text_cleaned
9283,B018Y229OU,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",Amazon,"Fire Tablets,Tablets,Computers & Tablets,All T...",True,0,5,.................................................,Great,[]
