<a href="https://colab.research.google.com/github/amythemirror/Springboard-Capstone-Three/blob/main/Application_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the text-preprocessing packages and download the large English spaCy model.
# NOTE(review): no versions are pinned. A later cell passes a plain function to
# nlp.add_pipe, which looks like the spaCy 2.x calling convention — confirm and pin.
# !pip install pycontractions
!pip install unidecode
!pip install word2number
!pip install contractions
!python -m spacy download en_core_web_lg

In [None]:
# install simpletransformers (provides the ClassificationModel used for sentiment prediction)
!pip install simpletransformers
# upgrade transformers to the newest available release (-U)
!pip install transformers -U

# Natural Language Processing

In [1]:
import re

import contractions
import gensim.downloader as api
import numpy as np
import spacy
import unidecode
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from tqdm import tqdm
from word2number import w2n
# from pycontractions import Contractions
# from gensim.models import KeyedVectors

# load spacy pretrained model (the large English pipeline downloaded in the install cell)
nlp = spacy.load('en_core_web_lg')

# load w2v keyedvectors model through the gensim downloader;
# `wv` is used further down for word-to-aspect similarity lookups
# wv = KeyedVectors.load('word2vec-google-news-300', mmap='r')
wv = api.load('word2vec-google-news-300')

# pass w2v keyedvectors model into Contractions
# (disabled: the pycontractions-based expansion is commented out in this notebook)
# cont = Contractions(kv_model=wv)
# cont.load_models()



In [2]:
# exclude words from spacy stopwords list
# (presumably kept because 'no' / 'not' carry negation that matters for sentiment)
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def set_custom_sentence_end_points(doc):
    '''add extra sentence starts for sentence segmentation

    A new sentence is started at every 'but' token and at the token that
    follows a ',' or a ';'. The last token of the doc is never inspected,
    so the lookup of the following token cannot run past the end.

    doc: the spacy Doc being processed (modified in place)
    returns: the same doc, so it can be used as a pipeline component
    '''
    for token in doc[:-1]:
        if token.text == 'but':
            token.is_sent_start = True
        if token.text in (',', ';'):
            doc[token.i + 1].is_sent_start = True
    return doc


# Register the component only once: adding a component whose name is already in
# the pipeline raises, which made this cell fail when it was run a second time.
# After editing the function above, reload `nlp` before re-running this cell.
if 'set_custom_sentence_end_points' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_sentence_end_points, before='parser')

In [3]:
# A url is an http(s):// or www. prefix followed by non-space characters. The
# final character class keeps trailing punctuation, quotes and closing brackets
# out of the match so "see http://x.com." keeps its full stop.
_URL_PATTERN = re.compile(
    r'''(?:https?://|www\.)[^\s<>]*[^\s<>.,;:!?)\]}'"]''',
    re.IGNORECASE,
)


def remove_urls(text):
    '''remove urls from text

    Matches anything starting with http://, https:// or www. (case-insensitive)
    and deletes it. Surrounding whitespace is left untouched, so a removed url
    can leave a double space behind.

    text: the string to clean
    returns: the string with every matched url deleted
    '''
    # str.replace() treats its argument as a literal substring, so the regex
    # has to go through re.sub() to have any effect
    return _URL_PATTERN.sub('', text)
    
    
def strip_html_tags(text):
    """return the text content of an html fragment with the tags dropped

    the pieces of text found between tags are joined with a single space
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """collapse every run of whitespace into one space and trim both ends"""
    # split() with no arguments already discards leading/trailing whitespace
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """replace accented characters via unidecode, e.g. café"""
    return unidecode.unidecode(text)


# def expand_contractions(text):
#     """expand shortened words, e.g. don't to do not"""
#     text = list(cont.expand_texts([text], precise=False))[0]
#     return text

def expand_contractions(text):
    """spell out contracted forms with the contractions package, e.g. don't to do not"""
    return contractions.fix(text)

# Aspect Extraction

In [58]:
# list of related aspects to extract
aspects = ['color', 'design', 'material', 'price', 'quality', 'sizing']

def aspect_extraction(words):
    '''extract aspects using cosine similarity between tokens and each aspect'''
    aspect_list = []
    for word in words:
        if word in wv.vocab:
            for aspect in aspects:
                if abs(wv.similarity(word, aspect)) >= 0.40 and aspect not in aspect_list:
                    aspect_list.append(aspect)
                else: pass
        else: pass
    
    if len(aspect_list) == 0:
        aspect_list.append('General')
    
    return aspect_list

# Sentiment Prediction

## Transformer Model

In [5]:
from simpletransformers.classification import ClassificationModel

# options handed to ClassificationModel through its `args` parameter
model_args = dict(
    use_multiprocessing=False,
    fp16=False,
    num_train_epochs=2,
    save_steps=-1,  # prevent the model from saving intermediary checkpoints
)

# Build the 3-label BERT classifier from the trained weights stored on Drive
model = ClassificationModel(
    "bert",
    '/content/drive/MyDrive/Springboard/Capstone Three/transformer/bert_uncased/outputs/',
    num_labels=3,
    args=model_args,
)

In [7]:
from sklearn.preprocessing import LabelEncoder

# provide classes to the label encoder
# the position of each label is the integer that inverse_transform maps back from
# NOTE(review): assumes the model was trained with 0=Negative, 1=Neutral, 2=Positive — confirm
encoder = LabelEncoder()
encoder.classes_ = np.array(['Negative', 'Neutral', 'Positive'])

# Pipeline

In [8]:
def printmd(string):
    '''render the given string as markdown in the cell output'''
    rendered = Markdown(string)
    display(rendered)

In [59]:
def _report_sentiment(review_text, review_aspects):
    '''predict the sentiment of one review segment and display it with its aspects'''
    pred, _ = model.predict([review_text])  # predict the sentiment
    sentiment = encoder.inverse_transform(pred)[0]  # convert to text sentiment
    printmd('<br>**Review:** {}<br>**Aspects:** {}<br>**Sentiment:** {}'.format(
        review_text, ', '.join(review_aspects), sentiment))


def aspect_sentiment(text, accented_chars=True, contractions=True,
                       convert_num=False, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=False,
                       remove_html=True, remove_num=False, special_chars=True,
                       stop_words=False, urls=True):
    """split a review into aspect segments and display the sentiment of each

    The text is cleaned, split into sentences by the spacy pipeline, and the
    lemmas of nouns and adjectives in every sentence are mapped to aspects.
    A sentence whose aspects include all aspects of the sentence before it is
    appended to the running segment; otherwise the running segment is
    predicted and displayed and a new segment starts. Each segment is shown
    with the aspects of its last sentence. Nothing is displayed when the text
    contains no sentence (e.g. an empty string).

    text: raw review text
    urls: remove urls before tokenizing
    remove_html: strip html tags
    extra_whitespace: collapse repeated whitespace
    accented_chars: replace accented characters
    contractions: expand contractions (this parameter shadows the
        `contractions` module inside this function only)
    lowercase: convert the text to lowercase
    convert_num, lemmatization, punctuations, remove_num, special_chars,
    stop_words: accepted for backward compatibility; they only fed an
        intermediate token list that was never used, so they do not change
        the displayed result

    returns: None (results are rendered with printmd)
    """
    if urls:  # remove urls
        text = remove_urls(text)
    if remove_html:  # remove html tags
        text = strip_html_tags(text)
    if extra_whitespace:  # remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars:  # remove accented characters
        text = remove_accented_chars(text)
    if contractions:  # expand contractions
        text = expand_contractions(text)
    if lowercase:  # convert all characters to lowercase
        text = text.lower()

    doc = nlp(text)  # tokenize text
    req_tag = ('NN', 'NNS', 'JJ', 'JJR', 'JJS')  # noun and adjective POS tags

    segment_text = None     # sentences merged so far into the current segment
    segment_aspects = None  # aspects of the most recent sentence in the segment

    for sent in doc.sents:
        # keep nouns/adjectives, skipping one- and two-letter lowercase tokens
        extracted_words = [
            token.lemma_ for token in sent
            if token.tag_ in req_tag and token.shape_ not in ('x', 'xx')
        ]
        current_aspects = aspect_extraction(extracted_words)

        if segment_text is None:
            segment_text = sent.text
        elif set(current_aspects).issuperset(segment_aspects):
            # combine with the former sentence if it covers the same aspects
            segment_text = segment_text + ' ' + sent.text
        else:
            # current sentence drops an earlier aspect: close the former segment
            _report_sentiment(segment_text, segment_aspects)
            segment_text = sent.text
        segment_aspects = current_aspects

    # flush the last open segment; skipped when the text had no sentences
    if segment_text is not None:
        _report_sentiment(segment_text, segment_aspects)

In [60]:
# Example 1: run the pipeline on a multi-sentence review of a coat
review1 = '''I love this coat! It is perfect for fall weather in New York when it’s rainy and chilly but not yet too cold. 
I am able to layer comfortable underneath without it being too tight or too warm. The fabric is really durable so I don’t 
sorry about it getting dirty too quickly.'''

aspect_sentiment(review1)

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** i love this coat! it is perfect for fall weather in new york when it is rainy and chilly but not yet too cold. i am able to layer comfortable underneath without it being too tight or too warm. the fabric is really durable so i do not sorry about it getting dirty too quickly.<br>**Aspects:** General<br>**Sentiment:** Positive

In [61]:
# Example 2: run the pipeline on a review that discusses color and fit
review2 = '''I got the blue women's style and wanted to love them. My first impression was that the color was different than I expected -- both the blue and green parts were lighter and more saturated than they appear on my computer. Think royal blue instead of navy. The green straps aren't adjustable and don't have any stretch, so they felt way too tight over my feet, which have high arches. You can adjust the back with the bungee cord, but that doesn't help if the straps are too tight on you. I took them off a few minutes ago and my right pinky toe is still hurting. The soles are really comfortable though.'''

aspect_sentiment(review2)

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** i got the blue women's style and wanted to love them. my first impression was that the color was different than i expected -- both the blue and green parts were lighter and more saturated than they appear on my computer. think royal blue instead of navy.<br>**Aspects:** color<br>**Sentiment:** Neutural

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** the green straps are not adjustable and do not have any stretch, so they felt way too tight over my feet, which have high arches. you can adjust the back with the bungee cord, but that does not help if the straps are too tight on you. i took them off a few minutes ago and my right pinky toe is still hurting. the soles are really comfortable though.<br>**Aspects:** General<br>**Sentiment:** Neutural

In [62]:
# Example 3: run the pipeline on a review about a gift, its material and the return process
review3 = '''I purchased this as a gift. It didn't fit well and I was told the material was not as soft and comfortable as it looks. When I wanted to return it, I was given a mailing label for China and I have to pay the postage. Beware.'''

aspect_sentiment(review3)

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** i purchased this as a gift. it did not fit well<br>**Aspects:** General<br>**Sentiment:** Negative

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** and i was told the material was not as soft and comfortable as it looks.<br>**Aspects:** material<br>**Sentiment:** Neutural

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** when i wanted to return it, i was given a mailing label for china and i have to pay the postage. beware.<br>**Aspects:** General<br>**Sentiment:** Negative

In [63]:
# Example 4: run the pipeline on a short review about quality, color and fabric
review4 = '''Garbage quality product and nothing like the picture. Fabric looks thick and nice color on the picture. The real thing is dull and very cheap fabric that it looks secondhand. False advertisement'''

aspect_sentiment(review4)

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** garbage quality product and nothing like the picture.<br>**Aspects:** quality<br>**Sentiment:** Negative

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** fabric looks thick and nice color on the picture.<br>**Aspects:** color<br>**Sentiment:** Positive

  0%|          | 0/1 [00:00<?, ?it/s]

<br>**Review:** the real thing is dull and very cheap fabric that it looks secondhand. false advertisement<br>**Aspects:** General<br>**Sentiment:** Negative