# Amazon products reviews analysis
> "There are many text reviews available for amazon product for which we perform both unsupervised analysis (for finding topics) and supervised analysis to find how each word influences the polarity of the sentiment."

- toc: true
- branch: master
- badges: true
- comments: true
- author: Ashish Kashav
- categories: [machine learning, deep learning,NLP,jupyter]

In [None]:
# Core numeric / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd

In [9]:
import pandas as pd

# Data imports and manipulations

In [10]:
# Load the raw reviews CSV (the path is relative to the notebook's working directory).
full=pd.read_csv('../input/amazon-fine-food-reviews/Reviews.csv')

In [11]:
# Preview the first rows of the raw table.
full.head()

In [12]:
# List the available columns before selecting the ones used in the analysis.
full.columns

In [13]:
# Keep only the identifier, text and rating columns. `.copy()` gives `data`
# its own storage, so later column assignments on `data` do not write into a
# slice of `full` (which is what triggers pandas' SettingWithCopyWarning).
data=full[['ProductId','UserId','Text','Score']].copy()

In [14]:
# Preview the reduced table.
data.head()

In [15]:
# Binary sentiment label: Score above 3 -> 1, Score of 3 or below -> 0.
# The first assignment creates the `Sentiment` column; rows matching neither
# condition (e.g. a missing Score) are left as NaN.
data.loc[data['Score']>3,'Sentiment']=1
data.loc[data['Score']<=3,'Sentiment']=0

In [16]:
# Number of labelled reviews before subsampling.
len(data)

We draw a 20% sample stratified by sentiment, so the ratio of positive to negative reviews stays the same as in the original full data.

In [17]:
# Stratified 20% subsample: sampling inside each Sentiment group keeps the
# positive/negative ratio of the full data. `random_state` makes the subsample
# repeatable, and `GroupBy.sample` keeps the `Sentiment` column in the result
# without relying on `GroupBy.apply` operating on the grouping column.
# NOTE: this cell overwrites `data`, so re-running it subsamples the sample again.
data=data.groupby('Sentiment').sample(frac=0.2, random_state=42)

In [18]:
import seaborn as sns

In [19]:
# Class balance of the sampled reviews. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=data.Sentiment)

# Text data augmentation

We show how to augment text data by replacing words in the original text with their antonyms and synonyms.

In [20]:
!pip install nlpaug

In [21]:
# `unicodedata` is part of the Python standard library, so no pip install is
# needed; it is simply imported in the next cell.

In [22]:
import unicodedata as uni


In [23]:
!pip install demoji

In [24]:
import demoji


def handle_emoji(string):
    """Replace every emoji in `string` with a space followed by its short name.

    `demoji.findall` supplies a mapping from each emoji found in the text to
    its description; only the part of the description before the first ":"
    is kept as the replacement.

    Parameters
    ----------
    string : str
        Text that may contain emoji.

    Returns
    -------
    str
        The text with each emoji occurrence replaced by ``" " + name``.
    """
    emojis = demoji.findall(string)

    # Replace the longest emoji first. A compound emoji can contain a shorter
    # emoji as a substring; replacing the short one first would tear the
    # compound apart and leave stray characters behind.
    for emoji in sorted(emojis, key=len, reverse=True):
        string = string.replace(emoji, " " + emojis[emoji].split(":")[0])

    return string

In [25]:
import re

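We perform some text cleaning: Unicode normalisation, emoji-to-text conversion, URL and @mention removal, lower-casing, and stripping of non-letter characters.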

In [26]:
def preproc(data):
    """Clean the review text in ``data['Text']`` in place.

    Each text goes through, in order: NFKD Unicode normalisation, emoji to
    text via `handle_emoji`, removal of ``http...`` URLs, lower-casing,
    removal of whitespace-separated tokens that start with ``@``, and
    replacement of every run of characters outside ``a-zA-Z`` by one space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``Text``; that column is overwritten.

    Returns
    -------
    None
        The frame is modified in place.
    """
    def _clean(text):
        # Per-text pipeline; the step order matches the docstring.
        text = uni.normalize('NFKD', text)
        text = handle_emoji(text)
        text = re.sub(r"http\S+", "", text).lower()
        text = " ".join(tok for tok in text.split() if tok[0] != "@")
        return " ".join(re.sub("[^a-zA-Z]+", " ", text).split())

    # One pass over the column with Series.map instead of five row-wise
    # DataFrame.apply(axis=1) passes: no per-row Series is built, and an empty
    # frame yields an empty column instead of a DataFrame-shaped result.
    data["Text"] = data["Text"].map(_clean)

In [27]:
%%time
# Clean the `Text` column of the sampled reviews in place and time the run.
preproc(data)

In [28]:
import nlpaug.augmenter.word as naw

In [29]:
from nltk.corpus import stopwords

# English stop words as a set; passed to the augmenters via their `stopwords=` argument.
# NOTE(review): requires the NLTK stopwords corpus to be available locally — confirm for the target environment.
en_stopwords = set(stopwords.words('english'))

In [30]:
# Build synthetic negative reviews: take half of the positive reviews and run
# them through the antonym augmenter.
aug = naw.AntonymAug(name='Antonym_Aug', aug_min=1, aug_max=20, aug_p=0.4, lang='eng', stopwords=en_stopwords, verbose=1)
# `random_state` fixes which positive reviews are selected on every run.
# NOTE(review): the augmenter's own word choices are presumably still random —
# seed them separately if fully repeatable output is needed.
text=data[data['Sentiment']==1]['Text'].sample(frac=.5, random_state=42).to_list()
aug_negative = aug.augment(text)

In [31]:
# Number of positive reviews selected for augmentation.
len(text)

In [32]:
# Number of antonym-augmented texts returned by the augmenter.
len(aug_negative)

In [33]:
# Second pass: run the antonym-augmented texts through a WordNet synonym augmenter.
# `aug` is rebound here, so the antonym augmenter is no longer reachable under that name.
aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=20, aug_p=0.4, lang='eng', stopwords=en_stopwords, verbose=1)
augmented_text = aug.augment(aug_negative)

In [34]:
# Number of synonym-augmented texts (the output of the second augmentation pass).
len(augmented_text)

In [35]:
# Label every augmented review as negative (0). The label list is sized from
# `augmented_text` itself so the two columns always have matching lengths.
df_negative = pd.DataFrame({"Text" : augmented_text, 'Sentiment' : [0]*len(augmented_text)})

In [36]:
# Append the synthetic negatives to the sampled reviews and drop repeated texts.
# `ignore_index=True` renumbers the rows so the combined frame has no duplicate
# index labels. Columns missing from `df_negative` (ProductId, UserId, Score)
# are filled with NaN for the synthetic rows.
# NOTE(review): the topic-mining and classification cells read `data`, not
# `dataAug`, so the augmented rows are not used for modelling — confirm intent.
dataAug=pd.concat([data,df_negative], ignore_index=True).drop_duplicates(subset='Text')

In [37]:
# Size of the augmented data set after de-duplication.
len(dataAug)

In [38]:
# Class balance after augmentation. The column is passed as `x=` because newer
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=dataAug.Sentiment)

# Topic mining

In [31]:
!pip install bertopic

In [32]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# Topic words are drawn from unigrams and bigrams, with English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
# Topic model using the "paraphrase-MiniLM-L12-v2" embedding model and a minimum topic size of 50.
topic_model = BERTopic(verbose=True, embedding_model="paraphrase-MiniLM-L12-v2", min_topic_size=50,vectorizer_model=vectorizer_model)
# NOTE(review): `topics` is later indexed as `topics[0]` to get one topic id per review, so
# `fit_transform` presumably returns a tuple whose first item is the per-document topic list — confirm.
topics = topic_model.fit_transform(data.Text.to_list())

In [33]:
# Topic overview table; display its first ten rows.
freq = topic_model.get_topic_info()
freq.head(10)

In [34]:
# Inspect the terms that describe topic 0.
topic_model.get_topic(0)

The inter-topic map below is a convenient way to visualise the major topics in the data set. A few topics overlap, and these could be merged together.

In [35]:
# Topic visualisation restricted to the top 50 topics.
topic_model.visualize_topics(top_n_topics=50)

In [36]:
# Bar charts for the top 10 topics.
topic_model.visualize_barchart(top_n_topics=10)

In [37]:
# Fetch the representative documents the model reports for topic 0.
representative_docs = topic_model.get_representative_docs(0)

In [38]:
# Display the representative documents for topic 0.
representative_docs

In [41]:
# Pair each cleaned review with `topics[0]` (presumably its assigned topic id — confirm).
# The frame keeps the index of `data.Text`.
results=pd.DataFrame({'Text':data.Text,'Topic':topics[0]})

In [42]:
# Reviews assigned to topic 0. The variable name suggests this topic is about
# coffee; that depends on the fitted model and is not fixed by the code.
coffee=results[results['Topic']==0]

In [43]:
# Preview the reviews in topic 0.
coffee.head()

# Text classification

In [39]:
!  pip install -Uqq fastai  # upgrade fastai on colab

In [40]:
from fastai.text.all import *


In [41]:
# Summary statistics of review length in whitespace-separated words.
data['Text'].str.split().str.len().describe()

## Language modelling to pretrain

In [42]:
# Language-model dataloaders over the review text (`is_lm=True`): 20% held out
# for validation, sequence length 144, batch size 32.
dls = TextDataLoaders.from_df(data, text_col='Text', label_col='Sentiment', valid_pct=0.2,seq_len=144,is_lm=True,bs=32)

# Show up to two sample items from a batch.
dls.show_batch(max_n=2)

In [60]:
# AWD_LSTM language-model learner tracking accuracy and perplexity, with weight
# decay 0.1, converted to fp16 via `to_fp16()`.
learn = language_model_learner(dls, AWD_LSTM, metrics=[accuracy, Perplexity()], wd=0.1).to_fp16()

In [61]:
# One epoch with the one-cycle schedule at a maximum learning rate of 1e-2,
# run before `unfreeze()` is called on the learner.
learn.fit_one_cycle(1, 1e-2)

In [62]:
# Unfreeze the model and train five more epochs at a lower learning rate (1e-3).
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3)

## Fine-tuning for text classification using the encoder from the language model

In [65]:
# Save the language model's encoder under the name 'finetuned' for reuse by the classifier.
learn.save_encoder('finetuned')

In [66]:
# Classification dataloaders (`is_lm=False`) labelled by `Sentiment`. They reuse the
# language model's vocabulary via `text_vocab=dls.vocab`, presumably so the saved
# encoder's embeddings match the classifier's token ids — confirm.
dls_clas = TextDataLoaders.from_df(data, text_col='Text', label_col='Sentiment', valid_pct=0.2,seq_len=144,is_lm=False,bs=32,text_vocab=dls.vocab)

In [67]:
# AWD_LSTM text classifier with `drop_mult=0.5`, tracking accuracy.
# `learn` is rebound here: it no longer refers to the language-model learner.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [68]:
# Load the encoder weights stored under 'finetuned' (the name used with `save_encoder`).
learn = learn.load_encoder('finetuned')


In [69]:
# First classifier epoch at a maximum learning rate of 2e-2, before any layers are unfrozen explicitly.
learn.fit_one_cycle(1, 2e-2)

In [70]:
# `freeze_to(-2)` presumably leaves the last two parameter groups trainable — confirm.
# Learning rates are given as a slice from 1e-2/2.6**4 up to 1e-2.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [71]:
# `freeze_to(-3)` presumably leaves the last three parameter groups trainable — confirm.
# Learning rates are given as a slice from 5e-3/2.6**4 up to 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [72]:
# Unfreeze the whole classifier and train two epochs with learning rates from 1e-3/2.6**4 up to 1e-3.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

In [None]:
!pip install fastinference

In [74]:
from fastinference.inference.inference import _decode_loss
import matplotlib.cm as cm
import html
from IPython.display import display, HTML

In [75]:
def _intrinsic_attention(learn, text, class_id=None):
    "Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`."
    # Returns `(tok, attn)`: the decoded input and one max-normalised score per
    # input position, derived from the gradient of the chosen class probability
    # with respect to the input embeddings.
    # NOTE(review): `_eval_dropouts` is not defined or imported in the visible
    # cells — confirm where it comes from. It presumably switches dropout
    # layers back to eval mode while the rest of the model stays in train mode.
    learn.model.train()
    _eval_dropouts(learn.model)
    # Clear stale gradients and reset the model state before the single forward pass.
    learn.model.zero_grad()
    learn.model.reset()
    # Build a test dataloader for the one text and take the first element of its first batch.
    dl = learn.dls.test_dl([text])
    batch = next(iter(dl))[0]
    # Embed the batch, detach the result, and make it a tensor that records gradients.
    emb = learn.model[0].module.encoder(batch).detach().requires_grad_(True)
    emb.retain_grad()
    # Run the encoder module on the embeddings; the second positional argument
    # presumably tells it the input is already embedded — TODO confirm.
    lstm = learn.model[0].module(emb, True)
    learn.model.eval()
    # Classifier head on the encoder output with an all-False mask shaped like
    # `batch`; softmax over the last dimension.
    cl = learn.model[1]((lstm, torch.zeros_like(batch).bool(),))[0].softmax(dim=-1)
    # Default to the class the model itself predicts.
    if class_id is None: class_id = cl.argmax()
    # Back-propagate the selected class probability down to the embeddings.
    cl[0][class_id].backward()
    # Per-position score: sum of absolute embedding gradients over the last dimension.
    attn = emb.grad.squeeze().abs().sum(dim=-1)
    # Scale so the largest score is 1.
    # NOTE(review): if every gradient is zero this divides 0 by 0 — confirm that cannot happen.
    attn /= attn.max()
    tok, _ = learn.dls.decode_batch((*tuplify(batch), *tuplify(cl)))[0]
    return tok, attn

In [76]:
def intrinsic_attention(x:TextLearner, text:str, class_id:int=None, **kwargs):
    """Show the intrinsic attention of a text classifier for `text`.

    `x` is the learner, `text` the input string, and `class_id` the optional
    class to explain (passed through to `_intrinsic_attention`). Extra keyword
    arguments are forwarded to `_show_piece_attn`. Raises `Exception` when `x`
    is an `LMLearner`.
    """
    # NOTE(review): `_show_piece_attn` is not defined or imported in the visible
    # cells, and this function is not attached to the learner class here even
    # though the notebook calls `learn.intrinsic_attention(...)` — confirm
    # which implementation those calls actually use.
    if isinstance(x, LMLearner):
        raise Exception("Language models are not supported")
    tokens, scores = _intrinsic_attention(x, text, class_id)
    pieces = tokens.split()
    return _show_piece_attn(pieces, to_np(scores), **kwargs)

## Aspect based analysis

In [77]:
# Show the per-word attention of the classifier for a sample sentence.
learn.intrinsic_attention('I like dog food for my dogs.')

In [80]:
# Classifier prediction for the same sentence.
learn.predict("I like dog food for my dogs.")

In [78]:
# Show the per-word attention of the classifier for a second sample sentence.
learn.intrinsic_attention('I hate this cat food because it stinks badly.')

In [81]:
# Classifier prediction for the second sentence.
learn.predict("I hate this cat food because it stinks badly.")