In [3]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

In [4]:
# Load the reviews CSV into the working DataFrame.
# NOTE(review): the path is specific to the Kaggle runtime; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/amazon-product-reviews/Reviews.csv')

In [5]:
# Preview the first rows of the loaded frame.
df.head()

In [5]:
# Print column names, dtypes and non-null counts.
df.info()

In [7]:
# Number of missing values in each column.
df.isnull().sum()

In [8]:
# Summary statistics of the frame (describe() with its default arguments).
df.describe()

In [6]:
# Discard every row that has a missing value in any column.
# NOTE(review): rows whose only missing value is in 'ProfileName' (a column this
# notebook drops later) are discarded too; consider dropna(subset=[...]).
df = df.dropna(axis=0, how='any')

## 1- Feature Extraction(1)
* We extract some features before cleaning the data, because cleaning may discard the information these features depend on.
### 1- Calculating the Number of Stop Words
* A stop word is a commonly used word (such as "the", "a", "an", "in") that a search engine has been programmed to ignore.

In [80]:
!pip install -q wordcloud

In [7]:
import wordcloud 
from nltk.corpus import stopwords
import nltk
import string
# Fetch the NLTK resources.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# The resource id is 'averaged_perceptron_tagger'; 'average_perceptron_tagger'
# does not exist, so the original call failed to download anything.
nltk.download('averaged_perceptron_tagger')
# English stop-word list used by the feature-extraction and cleaning cells.
stop = stopwords.words('english')

In [8]:
# Count stop words per review on the raw (uncleaned) text.
# The NLTK English list is lowercase, so each token is lowercased for the
# comparison; otherwise capitalised stop words ("The", "And") are not counted.
# The list is turned into a set once so each membership test is O(1).
stop_lookup = set(stop)
df['stopwords'] = df['Text'].apply(
    lambda text: sum(word.lower() in stop_lookup for word in text.split())
)
df[['Text', 'stopwords']].head()

### 2- Number of Punctuation
Another feature that can no longer be obtained once punctuation has been removed by data cleaning.

In [9]:
def count_punct(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    return sum(1 for ch in text if ch in string.punctuation)

# Punctuation count of every review, computed on the raw text.
df['punctuation'] = df['Text'].apply(count_punct)

In [10]:
# Show the text next to the two features extracted so far.
df[['Text', 'punctuation', 'stopwords']].head()

### 3- Number of Hashtags
Hashtags may carry useful signal, so we count them before the cleaning process strips the `#` character.

In [11]:
# Number of whitespace-separated tokens that begin with '#'.
df['hashtags'] = df['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('#'))
)
df[['Text', 'hashtags']].head()

In [12]:
# How many reviews contain at least one hashtag token.
df['hashtags'].gt(0).sum()

### 4- Number of Numeric Tokens
The number of purely numeric tokens in a review might be a helpful feature.

In [13]:
# Number of whitespace-separated tokens made up only of digits (str.isdigit).
df['numerics'] = df['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
df[['Text', 'numerics']].head()

### 5- Number of Uppercase Words 
Emotions such as anger or rage are quite often expressed in uppercase words, so it is worth counting them.

In [14]:
# Number of whitespace-separated tokens for which str.isupper() is true.
# NOTE(review): single-letter tokens such as "I" and "A" also satisfy isupper().
df['upper'] = df['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
df[['Text', 'upper']].head()

## 2- Text Cleaning

### 1- Make the whole text lowercase 

In [16]:
# Lowercase every token; split()/join also collapses whitespace runs to single spaces.
df['Text'] = df['Text'].apply(lambda text: " ".join(map(str.lower, text.split())))
df['Text'].head()

### 2- Removing Punctuation
Punctuation creates noise in the data, so we remove it.

In [21]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The pattern is a raw string so that \w and \s are not invalid string escapes.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)
df.Text.head()

### 3- Removing Stop Words


In [23]:
# Drop stop-word tokens from every review.
# `stop` is a list, so `in` scans it for every token; build a set once so each
# lookup is O(1). The result is identical.
stop_lookup = set(stop)
df['Text'] = df.Text.apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
df.Text.sample(10)

### Adding your own stopwords
Checking the most frequent words

In [73]:
# The twenty most frequent tokens across all reviews.
all_tokens = ' '.join(df['Text']).split()
freq = pd.Series(all_tokens).value_counts().head(20)

In [74]:
# Display the most frequent tokens and their counts.
freq

In [70]:
# Extend the NLTK English stop words with frequent tokens from this corpus,
# then drop all of them from the review text.
add_words = ["br", "get", "also"]
stop_words = set(stopwords.words("english"))
stop_added = stop_words | set(add_words)
df['Text'] = df['Text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_added)
)
df['Text'].sample(10)

### 4- Removing URLs

In [26]:
import re
import string
def remove_url(text):
    """Return ``text`` with every ``http(s)://...`` or ``www.`` URL removed.

    A URL runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [27]:
# Remove all URLs from the review text.
# NOTE(review): this cell runs after the punctuation-removal cell; once ':', '/'
# and '.' have been stripped the URL pattern can no longer match. Consider
# removing URLs before punctuation.
df['Text'] = df['Text'].apply(remove_url)

### 5- Removing HTML tags

In [28]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match, single line) removed."""
    return re.sub(r'<.*?>', r'', text)

In [29]:
# Remove all HTML tags from the review text.
# NOTE(review): this cell runs after the punctuation-removal cell; once '<' and
# '>' have been stripped no tag can match. Consider removing tags before punctuation.
df['Text'] = df['Text'].apply(remove_html)

### 6- Removing Emojis

In [30]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with every run of characters in the ranges below removed.

    NOTE(review): the last range spans U+24C2 through U+1F251, so it also
    matches many non-emoji characters (CJK ideographs, for example).
    """
    emoji_ranges = "".join([
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    ])
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

In [31]:
# Example: the two trailing U+1F614 (pensive face) emoji are removed.
# The literal was mis-encoded in the file and no longer contained any emoji;
# escapes are used so it survives future re-encoding.
remove_emoji("Omg another Earthquake \U0001F614\U0001F614")

In [32]:
# Strip emojis from every review.
df['Text'] = df['Text'].apply(remove_emoji)

### 7- Removing Emoticons
* __Emoji__ : 😔
* __Emoticon__ : :-)

In [68]:
import re
try:
    import cPickle as pickle
except ImportError:
    import pickle

# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# this file from a source you trust.
with open('../input/emoticons/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

# Build and compile the alternation of all emoticon keys once, instead of
# re-joining the whole dictionary for every row passed to remove_emoticons.
# NOTE(review): keys are inserted into the regex verbatim, so they are
# presumably already regex-escaped inside the pickle — confirm.
_EMOTICON_PATTERN = re.compile(u'(' + u'|'.join(k for k in Emoticon_Dict) + u')')


def remove_emoticons(text):
    """Return ``text`` with every match of an ``Emoticon_Dict`` key removed."""
    return _EMOTICON_PATTERN.sub(r'', text)


remove_emoticons("Good Morning :-)")

In [69]:
# Strip emoticons from every review.
df['Text'] = df['Text'].apply(remove_emoticons)

### 8- Spell Correction

In [75]:
from textblob import TextBlob
# Demo only: spell-correct the first five reviews with TextBlob.
# The result is displayed, not written back to df.
df['Text'].head(5).apply(lambda text: str(TextBlob(text).correct()))

In [76]:
# We could also combine several of the cleaning steps into a single function, like this:

# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Removed spans are deleted, not replaced by a space,
        so runs of spaces may remain.
    '''
    text = text.lower()
    # Regex patterns are raw strings: '\[', '\w' and '\d' are invalid escape
    # sequences in a normal string literal (SyntaxWarning on recent Python).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round1(x):
    """Apply clean_text_round1 to ``x``; a named callable for ``Series.apply``."""
    return clean_text_round1(x)

In [77]:
# Run the first cleaning round over every review and show the result.
df['Text'] = df['Text'].apply(round1)
df['Text']

In [78]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Removes curly single/double quotes (U+2018, U+2019, U+201C, U+201D), the
    horizontal ellipsis (U+2026) and newline characters.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    '''
    # The character class is written with escapes: the original literal was
    # mis-encoded, so it matched letters such as 'ò' and 'ù' instead of the
    # typographic quotes and ellipsis.
    text = re.sub('[\u2018\u2019\u201c\u201d\u2026]', '', text)
    text = re.sub('\n', '', text)
    return text


def round2(x):
    """Apply clean_text_round2 to ``x``; a named callable for ``Series.apply``."""
    return clean_text_round2(x)

In [79]:
# Run the second cleaning round over every review and show the result.
df['Text'] = df['Text'].apply(round2)
df['Text']

## 3- Feature Extraction(2)
Some features are extracted after text cleaning, because they are more meaningful when computed on the cleaned text.

### 1- Number of Words

In [83]:
# Count words with split() (no argument): split(" ") yields an empty string for
# every extra space left behind by the cleaning steps and reports 1 word for an
# empty review. This also matches the tokenisation used by avg_word.
df['word_count'] = df['Text'].apply(lambda text: len(str(text).split()))

In [84]:
# Show the cleaned text next to its word count.
df[['Text', 'word_count']].head()

### 2- Number of Characters

In [87]:
# Length of each cleaned review in characters (spaces included).
text_lengths = df['Text'].str.len()
df['char_count'] = text_lengths

In [90]:
# Show the cleaned text next to its character count.
df[['Text', 'char_count']].head()

### 3- Average Word Length

In [91]:
def avg_word(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Args:
        text: the string to measure.

    Returns:
        The exact mean word length as a float, or 0.0 when ``text`` contains
        no words (empty or whitespace-only).
    """
    words = text.split()
    # Guard the empty case explicitly instead of adding an epsilon to the
    # denominator, which slightly biased every result (e.g. 4.999995 for "hello").
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [92]:
# Average word length per review, rounded to one decimal place.
df['avg_word'] = df['Text'].apply(avg_word).round(1)
df[['Text', 'avg_word']].head()

In [93]:
# Inspect the first three rows with all features added so far.
df.head(3)

In [94]:
# Interpret 'Time' as seconds since the Unix epoch and convert it to datetimes.
df['Time'] = pd.to_datetime(df['Time'], unit='s')
df.head()

In [95]:
# Remove the 'ProfileName' column.
df = df.drop(columns=['ProfileName'])

### Applying round1 and round2 of cleaning on 'Summary' Feature

In [97]:
# Run the first cleaning round over the 'Summary' column and show the result.
df['Summary'] = df['Summary'].apply(round1)
df['Summary']

In [98]:
# Run the second cleaning round over the 'Summary' column and show the result.
df['Summary'] = df['Summary'].apply(round2)
df['Summary']