<a href="https://colab.research.google.com/github/amythemirror/Springboard-Capstone-Three/blob/main/ABSA_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
# Mount Google Drive under /content/drive; the fine-tuned classifier loaded
# further down is read from a path inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): no versions are pinned. The code below passes a plain function
# to nlp.add_pipe and compares lemmas against "-PRON-", which looks like the
# spaCy 2.x API - confirm the runtime's versions and pin them for reproducibility.
# !pip install pycontractions
!pip install unidecode
!pip install word2number
!pip install contractions
!python -m spacy download en_core_web_lg
!pip install simpletransformers
!pip install transformers -U

# Natural Language Processing

In [1]:
from bs4 import BeautifulSoup
import spacy
from spacy.symbols import ORTH, LEMMA, POS
import unidecode
from word2number import w2n
# from pycontractions import Contractions
import contractions
# from gensim.models import KeyedVectors
import gensim.downloader as api
from tqdm import tqdm
from IPython.display import Markdown, display
import numpy as np

# load spacy pretrained model; it is used later for tokenization, sentence
# segmentation, POS tags and lemmas
nlp = spacy.load('en_core_web_lg')

# load w2v keyedvectors model; aspect_extraction queries it for word/aspect similarity
# (earlier alternative, kept for reference: load a local KeyedVectors file)
# wv = KeyedVectors.load('word2vec-google-news-300', mmap='r')
wv = api.load('word2vec-google-news-300')

# earlier approach, disabled: pass w2v keyedvectors model into pycontractions
# (the `contractions` package is used instead, see expand_contractions)
# cont = Contractions(kv_model=wv)
# cont.load_models()



In [2]:
# negation words must not be flagged as stop words by the spacy vocabulary
deselect_stop_words = ['no', 'not']
for negation in deselect_stop_words:
    lexeme = nlp.vocab[negation]
    lexeme.is_stop = False


def set_custom_sentence_end_points(doc):
    '''mark the token following every ';' as a sentence start

    The last token is never inspected because nothing follows it.
    Returns the same doc object so the function can be used as a pipeline step.
    '''
    for tok in doc[:-1]:
        if tok.text != ';':
            continue
        # a semicolon closes the current sentence; the next token opens a new one
        doc[tok.i + 1].is_sent_start = True
    return doc

# run the custom boundary rule before the parser assigns sentence starts
nlp.add_pipe(set_custom_sentence_end_points, before='parser')


# add special cases to spacy tokenizer for handling child sizes:
# each size stays a single token, is its own lemma and is tagged as a noun
for child_size in (u'2t', u'3t', u'4t', u'5t'):
    nlp.tokenizer.add_special_case(child_size, [{ORTH: child_size, LEMMA: child_size, POS: u'NOUN'}])


In [3]:
def remove_urls(text):
    '''remove urls from text

    Deletes http:// and https:// links as well as bare addresses starting
    with "www." (case-insensitive). Punctuation that trails a url, such as a
    sentence-ending period or comma, is left in place. Text without urls is
    returned unchanged.
    '''
    import re  # local import so the helper does not depend on the imports cell

    # str.replace treats its argument as literal text, so a regex passed to it
    # never matches; re.sub is required to actually strip the links.
    # \S* is greedy and the final [\w/] makes the match end on a word character
    # or slash, which keeps trailing punctuation out of the match.
    return re.sub(r'(?:https?://|\bwww\.)\S*[\w/]', '', text, flags=re.IGNORECASE)
    
    
def strip_html_tags(text):
    """parse text as html and return only its text content,
    with the pieces of text joined by a single space"""
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """collapse every run of whitespace into one space and drop
    leading/trailing whitespace"""
    # split() without arguments already ignores leading and trailing whitespace
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """replace accented characters with their unidecode transliteration,
    e.g. café"""
    return unidecode.unidecode(text)


# def expand_contractions(text):
#     """expand shortened words, e.g. don't to do not"""
#     text = list(cont.expand_texts([text], precise=False))[0]
#     return text

def expand_contractions(text):
    """expand shortened words with the contractions package,
    e.g. don't to do not"""
    return contractions.fix(text)

# Aspect Extraction

In [4]:
# aspects a review sentence can be assigned to
aspects = [
    'color', 'design', 'material',
    'price', 'quality', 'sizing',
]

# keywords that map a token directly onto the 'material' aspect
material_words = [
    'fabric', 'cotton', 'leather', 'suede', 'polyester',
    'nylon', 'spandex', 'silk', 'wool', 'cashmere',
]

# keywords that map a token directly onto the 'price' aspect
price_words = [
    'cheap', 'expensive', 'deal', 'promotion', 'coupon',
]

# keywords that map a token directly onto the 'sizing' aspect
sizing_words = [
    'size', 'fit', 'small', 'medium', 'large', 'big',
    'xs', 'xxs', 'xl', 'xxl',
    '2t', '3t', '4t', '5t',
]


def aspect_extraction(words, threshold=0.40):
    '''extract aspects from tokens using keyword lists and cosine similarity

    Each word contributes aspects in two independent ways:
      * keyword match - membership in material_words, price_words or
        sizing_words adds the corresponding aspect;
      * embedding match - if the word is in the word2vec vocabulary, every
        aspect whose absolute cosine similarity with the word is at least
        `threshold` is added.

    words: iterable of token strings (the caller passes lemmas).
    threshold: minimum absolute similarity for an embedding match.

    Returns the aspects in the order they were first found, without
    duplicates, or ['general'] when nothing matched.
    '''
    keyword_aspects = (
        ('material', material_words),
        ('price', price_words),
        ('sizing', sizing_words),
    )

    aspect_list = []
    for word in words:
        for aspect, keywords in keyword_aspects:
            if word in keywords and aspect not in aspect_list:
                aspect_list.append(aspect)

        # The embedding lookup used to hang off the sizing check as an `elif`,
        # so it was skipped only for the first sizing keyword of a sentence
        # and the result depended on word order; it now runs for every word.
        # `word in wv` replaces `wv.vocab`, which no longer exists in gensim 4.
        if word in wv:
            for aspect in aspects:
                if aspect not in aspect_list and abs(wv.similarity(word, aspect)) >= threshold:
                    aspect_list.append(aspect)

    if not aspect_list:
        aspect_list.append('general')

    return aspect_list

# Sentiment Prediction

In [5]:
def convert_list_to_string(org_list, seperator=' '):
    """ Join the items of org_list into one string, placing seperator
        between consecutive items. Returns the joined string """
    joined = seperator.join(org_list)
    return joined

**Transformer Model**

In [6]:
from simpletransformers.classification import ClassificationModel

# Arguments handed to the ClassificationModel constructor below.
# NOTE(review): this notebook only calls model.predict, so the training-related
# entries (num_train_epochs, save_steps) are presumably leftovers - confirm.
model_args = {
   'use_multiprocessing': False,
   'fp16': False,
   'num_train_epochs': 2,
   'save_steps': -1, # prevent the model from saving intermediary checkpoints
   'silent': True
}

# Create a ClassificationModel with our trained model.
# The weights are read from the mounted Google Drive, so the drive.mount cell
# has to run first; num_labels=3 matches the three classes given to the label
# encoder.
model = ClassificationModel(
     "bert",
     '/content/drive/MyDrive/Springboard/Capstone Three/transformer/bert_uncased/outputs/',
     num_labels=3,
     args=model_args
 )

In [7]:
from sklearn.preprocessing import LabelEncoder

# Provide the classes to the label encoder directly instead of fitting it:
# inverse_transform maps a predicted index i to classes_[i].
# NOTE(review): assumes the classifier was trained with 0 = Negative,
# 1 = Neutral, 2 = Positive - confirm against the training notebook.
# The middle label was misspelled 'Neutural', which leaked into every
# rendered result; only the display text changes, the index order does not.
encoder = LabelEncoder()
encoder.classes_ = np.array(['Negative', 'Neutral', 'Positive'])

# Pipeline

In [8]:
def printmd(string):
    '''render the string as markdown in the cell output'''
    rendered = Markdown(string)
    display(rendered)

In [48]:
def _report_segment(sentence, tokens, segment_aspects):
    '''predict the sentiment of one review segment and pretty print the result'''
    pred, _ = model.predict([convert_list_to_string(tokens)])  # predict the sentiment
    sentiment = encoder.inverse_transform(pred)[0]  # convert to text sentiment
    printmd('<br>**Review:** {}<br>**Aspects:** {}<br>**Sentiment:** {}'.format(sentence, ', '.join(segment_aspects), sentiment))


def aspect_sentiment(text, accented_chars=True, contractions=True,
                       convert_num=False, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=False,
                       remove_html=True, remove_num=False, special_chars=True,
                       stop_words=False, urls=True):
    """split a review into aspect segments and print the sentiment of each

    The text is cleaned, split into sentences, and every sentence is assigned
    aspects via aspect_extraction. Consecutive sentences are merged into one
    segment while the aspects of one side contain the aspects of the other;
    when neither contains the other, the running segment is sent to the
    classifier and printed, and a new segment starts.

    Text-level switches (applied in this order before tokenization):
        urls             - remove urls (also drops url-like tokens later)
        remove_html      - strip html tags
        extra_whitespace - collapse extra whitespace
        accented_chars   - transliterate accented characters
        contractions     - expand contractions
        lowercase        - lowercase the text

    Token-level switches (affect only the text passed to the classifier):
        stop_words    - drop stop words, except tokens tagged as numbers
        punctuations  - drop punctuation tokens
        special_chars - drop symbol tokens
        remove_num    - drop number tokens
        convert_num   - replace number words with digits where word2number
                        can parse them
        lemmatization - replace tokens with their lemma

    Returns None; the result of every segment is rendered with printmd.
    Empty input (no sentences) prints nothing.
    """
    if urls:  # remove urls
        text = remove_urls(text)
    if remove_html:  # remove html tags
        text = strip_html_tags(text)
    if extra_whitespace:  # remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars:  # remove accented characters
        text = remove_accented_chars(text)
    if contractions:  # expand contractions
        text = expand_contractions(text)
    if lowercase:  # convert all characters to lowercase
        text = text.lower()

    doc = nlp(text)  # tokenize text
    req_tag = ['NN', 'NNS', 'NNP', 'JJ', 'JJR', 'JJS']  # noun and adjective tags used for aspect extraction
    sizes = ['xs', 'xxs', 'xl', 'xxl', '2t', '3t', '4t', '5t']  # size tokens are extracted regardless of tag

    # running segment: original text, cleaned tokens and aspects
    segment_text = None
    segment_tokens = []
    segment_aspects = []

    for sent in doc.sents:
        clean_text = []
        extracted_words = []

        for token in sent:
            drop = (
                (stop_words and token.is_stop and token.pos_ != 'NUM')
                or (urls and token.like_url)
                or (punctuations and token.pos_ == 'PUNCT')
                or (special_chars and token.pos_ == 'SYM')
                or (remove_num and (token.pos_ == 'NUM' or token.text.isnumeric()))
            )

            if not drop:
                edit = token.text
                converted = False
                # convert number words to numeric numbers; keep them as strings
                # so the tokens can still be joined into the classifier input
                if convert_num and token.pos_ == 'NUM':
                    try:
                        edit = str(w2n.word_to_num(token.text))
                        converted = True
                    except ValueError:
                        # not parseable as a number word (e.g. "6.5"): keep the token
                        pass
                # convert tokens to base form, unless a number was just converted
                if lemmatization and not converted and token.lemma_ != "-PRON-":
                    edit = token.lemma_
                if edit != "":
                    clean_text.append(edit)

            # extract token if it's a noun, an adjective or a size;
            # single lowercase letters (shape 'x') are ignored
            if (token.tag_ in req_tag or token.text in sizes) and token.shape_ != 'x':
                extracted_words.append(token.lemma_)

        extracted_aspects = aspect_extraction(extracted_words)

        if segment_text is None:
            # first sentence opens the first segment
            segment_text = sent.text
            segment_tokens = clean_text
            segment_aspects = extracted_aspects
            continue

        previous, current = set(segment_aspects), set(extracted_aspects)
        if previous.issuperset(current):
            # same or fewer aspects: extend the segment, keep its aspects
            segment_text = segment_text + ' ' + sent.text
            segment_tokens = segment_tokens + clean_text
        elif current.issuperset(previous):
            # the sentence adds aspects: extend the segment and adopt them
            segment_text = segment_text + ' ' + sent.text
            segment_tokens = segment_tokens + clean_text
            segment_aspects = extracted_aspects
        else:
            # neither side contains the other: close the running segment
            # and start a new one with the current sentence
            _report_segment(segment_text, segment_tokens, segment_aspects)
            segment_text = sent.text
            segment_tokens = clean_text
            segment_aspects = extracted_aspects

    if segment_text is None:
        # no sentences at all (e.g. empty text): nothing to classify
        return

    _report_segment(segment_text, segment_tokens, segment_aspects)

# Examples

## Positive Reviews

Review link: https://www.amazon.com/gp/customer-reviews/R2D9JK13WPNO5V/

In [54]:
review = '''I recently moved to a cold climate and needed shoes to protect my 
          feet. These are definitely warm and comfortable. The build is a little 
          bit stiffer than I expected and not as fluffy and soft on the inside 
          but it definitely does what it says it will do. I found the fit to be 
          comfortable and true to what I expected. I usually wear a 6.5 woman's 
          but I ordered a 7 just in case and it has just enough room for thick 
          socks. Walked around an outdoor mall for a few hours with no issues.'''

aspect_sentiment(review)

<br>**Review:** i recently moved to a cold climate and needed shoes to protect my feet. these are definitely warm and comfortable. the build is a little bit stiffer than i expected and not as fluffy and soft on the inside but it definitely does what it says it will do.<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** i found the fit to be comfortable and true to what i expected.<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** i usually wear a 6.5 woman's but i ordered a 7 just in case and it has just enough room for thick socks. walked around an outdoor mall for a few hours with no issues.<br>**Aspects:** general<br>**Sentiment:** Positive

Review link: https://www.amazon.com/gp/customer-reviews/R3HZT3MXM51016

In [55]:
review = '''Just got the pair described as Matte Havana / Blue. The colors are 
          not even close to the photos. In the photos on Amazon they look royal 
          blue and orange. They are in fact slate blue, almost gray, on the 
          outside of the ear pieces, and basically coffee or caramel colored on 
          the inside. While the look is still very cool, it is not accurate on 
          this site at all. I own another pair of these that I have enjoyed for 
          many years, and I am going to keep these, but be aware that the colors 
          are pretty hard to pick out on this site and they are WAY off, like, 
          not even close.'''

aspect_sentiment(review)

<br>**Review:** just got the pair described as matte havana / blue. the colors are not even close to the photos. in the photos on amazon they look royal blue and orange. they are in fact slate blue, almost gray, on the outside of the ear pieces, and basically coffee or caramel colored on the inside.<br>**Aspects:** color<br>**Sentiment:** Negative

<br>**Review:** while the look is still very cool, it is not accurate on this site at all.<br>**Aspects:** general<br>**Sentiment:** Negative

<br>**Review:** i own another pair of these that i have enjoyed for many years, and i am going to keep these, but be aware that the colors are pretty hard to pick out on this site and they are way off, like, not even close.<br>**Aspects:** color<br>**Sentiment:** Negative

Review link: https://www.amazon.com/gp/customer-reviews/R2O0GVQHXIPVMJ/

In [56]:
review = '''I love this dress and will definitely buy a couple more. The “red” 
is the true color on the ad, but I’d say it’s more of a maroon/wine color, 
wouldn’t you? :) The scoop neck is a little too scooped for modest me, so I 
added a scarf, and love that look. I’m 5’2”, 106 lbs, bought a small, and I love 
the fit. It comes about 2” below my knees, perfect length for my taste.'''

aspect_sentiment(review)

<br>**Review:** i love this dress and will definitely buy a couple more.<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** the "red" is the true color on the ad, but i would say it is more of a maroon/wine color, would not you?<br>**Aspects:** color<br>**Sentiment:** Neutural

<br>**Review:** :) the scoop neck is a little too scooped for modest me, so i added a scarf, and love that look.<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** i am 5'2", 106 lbs, bought a small, and i love the fit.<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** it comes about 2" below my knees, perfect length for my taste.<br>**Aspects:** general<br>**Sentiment:** Positive

Review link: https://www.amazon.com/gp/customer-reviews/RW9WUK98V8G74

In [57]:
review = '''Love these dresses! They are great quality for the price. I was 
          about to buy one similar Hannah Andersson dress that was on sale for 
          $30 but since these are way cheaper I thought I would give them a try. 
          I am so glad I did!!! These are a way better deal and more color 
          options. I really struggled with what size to buy these dresses in 
          since several reviews said they shrink. My daughter is in a 5T for 
          dresses right now and they are a tiny bit big so I bought these in a 
          XS(4/5). I washed the coral colored one first, with a white shirt I 
          didn’t care about to test if the color would bleed & if it shrinks. I 
          was very pleased to see the color did not bleed and it didn’t shrink! 
          I washed it on normal w/cold water and medium heat for the dryer.'''

aspect_sentiment(review)

<br>**Review:** love these dresses!<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** they are great quality for the price. i was about to buy one similar hannah andersson dress that was on sale for $30 but since these are way cheaper i thought i would give them a try.<br>**Aspects:** quality, price<br>**Sentiment:** Positive

<br>**Review:** i am so glad i did!!!<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** these are a way better deal and more color options.<br>**Aspects:** price, color<br>**Sentiment:** Positive

<br>**Review:** i really struggled with what size to buy these dresses in since several reviews said they shrink. my daughter is in a 5t for dresses right now and they are a tiny bit big<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** so i bought these in a xs(4/5).<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** i washed the coral colored one first, with a white shirt i did not care about to test if the color would bleed & if it shrinks. i was very pleased to see the color did not bleed and it did not shrink!<br>**Aspects:** color<br>**Sentiment:** Positive

<br>**Review:** i washed it on normal w/cold water and medium heat for the dryer.<br>**Aspects:** sizing<br>**Sentiment:** Positive

Review link: https://www.amazon.com/gp/customer-reviews/R2SNCMUD6NQ1W3/

In [58]:
review = '''This is a great flannel shirt. I purchased a Medium which fits nice. 
          I wear size 42 T-shirts (LGG) and size 44 coats. I stand 5 feet 8 
          inches. My pants size is 34 inch waist and this flannel shirt fits 
          nicely inside the pants as well as on the outside. You will get one 
          extra button plus a smaller button? Maybe not as warm as a coat, but 
          cuts the cold when you wear it. It is relatively thick but not really 
          thick and heavy. It is good as a multi-layered clothing...maybe a black 
          T-shirt plus this flannel shirt and maybe a nice sized coat will keep 
          you nice and comfortably warm. I wear a size 16 neck size dress shirt 
          and when this flannel shirt has the top button secured, you have a 
          thumb thickness spacing..and feels comfortable. Hope this helps.'''

aspect_sentiment(review)

<br>**Review:** this is a great flannel shirt.<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** i purchased a medium which fits nice. i wear size 42 t-shirts (lgg) and size 44 coats.<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** i stand 5 feet 8 inches.<br>**Aspects:** general<br>**Sentiment:** Positive

<br>**Review:** my pants size is 34 inch waist and this flannel shirt fits nicely inside the pants as well as on the outside. you will get one extra button plus a smaller button?<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** maybe not as warm as a coat, but cuts the cold when you wear it. it is relatively thick but not really thick and heavy.<br>**Aspects:** general<br>**Sentiment:** Neutural

<br>**Review:** it is good as a multi-layered clothing...maybe a black t-shirt plus this flannel shirt and maybe a nice sized coat will keep you nice and comfortably warm.<br>**Aspects:** color<br>**Sentiment:** Positive

<br>**Review:** i wear a size 16 neck size dress shirt and when this flannel shirt has the top button secured, you have a thumb thickness spacing..and feels comfortable.<br>**Aspects:** sizing<br>**Sentiment:** Positive

<br>**Review:** hope this helps.<br>**Aspects:** general<br>**Sentiment:** Positive

## Neutral Reviews

Review link: https://www.amazon.com/gp/customer-reviews/RT3QMJNXV7E0Q/

In [59]:
review = '''Kind of pissed. First one i got. Ended up being bleached from sweat 
          which seems stupid as its an athletic hat and i only had it for two 
          months in the winter.

          Ordered another and the logo was off center.

          Dont know if ill buy again if i cant even be guarenteed quality and 
          cant speek to anyone about making sure the hats logo is centered. As 
          it should be.'''

aspect_sentiment(review)

<br>**Review:** kind of pissed. first one i got. ended up being bleached from sweat which seems stupid as its an athletic hat and i only had it for two months in the winter. ordered another and the logo was off center.<br>**Aspects:** general<br>**Sentiment:** Negative

<br>**Review:** do not know if ill buy again if i can not even be guarenteed quality and can not speek to anyone about making sure the hats logo is centered.<br>**Aspects:** quality<br>**Sentiment:** Negative

<br>**Review:** as it should be.<br>**Aspects:** general<br>**Sentiment:** Positive

## Negative Reviews

Review link: https://www.amazon.com/gp/customer-reviews/R3E1GV6HPZ32XE/

In [60]:
review = '''Garbage quality product and nothing like the picture. Fabric looks 
          thick and nice color on the picture. The real thing is dull and very 
          cheap fabric that it looks secondhand. False advertisement'''

aspect_sentiment(review)

<br>**Review:** garbage quality product and nothing like the picture.<br>**Aspects:** quality<br>**Sentiment:** Negative

<br>**Review:** fabric looks thick and nice color on the picture.<br>**Aspects:** material, color<br>**Sentiment:** Positive

<br>**Review:** the real thing is dull and very cheap fabric that it looks secondhand.<br>**Aspects:** price, material<br>**Sentiment:** Negative

<br>**Review:** false advertisement<br>**Aspects:** general<br>**Sentiment:** Negative

Review link: https://www.amazon.com/gp/customer-reviews/R3H6VDOWEE3ZB6/

In [61]:
review = '''I really wanted to like this suit! The material was great and nicely 
          made. I'm 5'9 with a long torso and 200 lbs. I do have a thicker butt 
          and thighs. This was tight around my legs... like cut into my legs. 
          Fit nicely in my mid section and in the chest. But because I'm so long 
          waisted when i life my right arm my whole boob almost comes out the top 
          of the suit! It was a no for me'''

aspect_sentiment(review)

<br>**Review:** i really wanted to like this suit!<br>**Aspects:** general<br>**Sentiment:** Negative

<br>**Review:** the material was great and nicely made.<br>**Aspects:** material<br>**Sentiment:** Positive

<br>**Review:** i am 5'9 with a long torso and 200 lbs. i do have a thicker butt and thighs. this was tight around my legs... like cut into my legs. fit nicely in my mid section and in the chest. but because i am so long waisted when i life my right arm my whole boob almost comes out the top of the suit! it was a no for me<br>**Aspects:** general<br>**Sentiment:** Neutural