# Lexicon-based Sentiment Analysis

## Data preparation

### Loading the data

In [5]:
import pandas as pd 

# The data was scraped from Amazon with scrapy and stored in a plain JSON text file.
# Load that JSON file into a pandas dataframe:
amazon_data_path = '/home/adelo/1-system/1-disco_local/1-mis_archivos/1-pe/1-ciencia/1-computer_science_an_IT/2-data_science/1-Amazon_Laptops_Dashboard/amazon_data.json'
my_reviews = pd.read_json(amazon_data_path)

# Columns of interest: ASIN, price, average_customer_reviews, number_reviews, number_ratings,
# tech_details and reviews.
my_reviews

Unnamed: 0,url,ASIN,price,average_customer_reviews,number_reviews,number_ratings,tech_details,reviews_link,reviews
0,https://www.amazon.com/HP-Convertible-Processo...,B07T5RGLZV,$689.90,4.5 out of 5 stars,17,18,"{'Screen Size': '14 inches', 'Max Screen Resol...",https://www.amazon.com/HP-Convertible-Processo...,"[{'name': 'Bobby D', 'rating': '5.0 out of 5 s..."
1,https://www.amazon.com/Lenovo-Convertible-i3-8...,B07S2SG2PG,$490.00,4.0 out of 5 stars,14,18,"{'Screen Size': '14 inches', 'Max Screen Resol...",https://www.amazon.com/Lenovo-Convertible-i3-8...,"[{'name': 'Kerri', 'rating': '1.0 out of 5 sta..."
2,https://www.amazon.com/Lenovo-Chromebook-Media...,B07Z1KJ2D8,$99.00,4.1 out of 5 stars,15,17,"{'Screen Size': '11.6 inches', 'Screen Resolut...",https://www.amazon.com/Lenovo-Chromebook-Media...,"[{'name': 'Amazing Warehouse Deal', 'rating': ..."
3,https://www.amazon.com/Acer-Lightweight-i7-856...,B07JLBJZD3,$927.16,4.0 out of 5 stars,24,25,"{'Screen Size': '15.6 inches', 'Max Screen Res...",https://www.amazon.com/Acer-Ultra-Thin-Lightwe...,"[{'name': 'Amazon Customer', 'rating': '5.0 ou..."
4,https://www.amazon.com/ASUS-Chromebook-Clamshe...,B07VT254P6,,3.9 out of 5 stars,43,48,"{'Screen Size': '14 inches', 'Max Screen Resol...",https://www.amazon.com/ASUS-Chromebook-Clamshe...,"[{'name': 'Nova', 'rating': '4.0 out of 5 star..."
5,https://www.amazon.com/Dell-Inspiron-Touchscre...,B07P4LFDT7,$399.61,3.5 out of 5 stars,57,66,"{'Screen Size': '14 inches', 'Processor': '3.9...",https://www.amazon.com/Dell-Inspiron-Touchscre...,"[{'name': 'Tim Butterfield', 'rating': '1.0 ou..."
6,https://www.amazon.com/Lenovo-Chromebook-Media...,B07GLV1VC7,,3.9 out of 5 stars,88,100,"{'Screen Size': '14 inches', 'Max Screen Resol...",https://www.amazon.com/Lenovo-Chromebook-Media...,"[{'name': 'Jpags', 'rating': '4.0 out of 5 sta..."
7,https://www.amazon.com/HP-Touch-Screen-Noteboo...,B07S6ZS35Q,$357.48,3.7 out of 5 stars,82,100,"{'Screen Size': '15.6 inches', 'Screen Resolut...",https://www.amazon.com/HP-Touch-Screen-Noteboo...,"[{'name': 'Mike Donnell ', 'rating': '2.0 out ..."
8,https://www.amazon.com/HP-14-inch-Celeron-Wind...,B07WMDV7CW,$229.43,3.9 out of 5 stars,25,29,"{'Screen Size': '14 inches', 'Max Screen Resol...",https://www.amazon.com/HP-14-inch-Celeron-Wind...,"[{'name': 'Nerd Alert', 'rating': '2.0 out of ..."
9,https://www.amazon.com/Lenovo-Ideapad-i5-9300H...,B07VC55LF5,$624.99,4.1 out of 5 stars,42,58,"{'Screen Size': '15.6 inches', 'Screen Resolut...",https://www.amazon.com/Lenovo-Ideapad-i5-9300H...,"[{'name': 'Maverick Vailladin', 'rating': '5.0..."


### Formatting

In [6]:
def format_cleaner(val):
    """Convert a numeric string such as '$1,234.50' into a float.

    Every comma and dollar sign is removed from ``val`` and the remaining
    text is passed to ``float``.

    Parameters
    ----------
    val : str
        Numeric string, optionally containing ',' and/or '$' characters.

    Returns
    -------
    float
        The numeric value of the cleaned string.
    """
    cleaned = val
    for symbol in (',', '$'):
        cleaned = cleaned.replace(symbol, '')
    return float(cleaned)

# After loading the JSON file, every «reviews» entry holds the reviews of one product; each
# review is a dictionary with several fields (customer name, rating, title, review text, ...).
# Here we flatten that structure into a simplified dataframe with one row per review and
# only the columns we need, which makes the rest of the analysis easier to handle.
title      = []
text       = []
title_text = []
rating     = []
brand      = []
price      = []
length_title_text  = []

n_products = my_reviews.shape[0]
for product_idx in range(n_products):
    product_reviews = my_reviews['reviews'][product_idx]
    for review in product_reviews:
        review_title  = review['title']
        review_body   = review['review_text']
        review_joined = review_title + ' ' + review_body

        # The rating is a string whose first token is the numeric score
        # (e.g. '5.0 out of 5 stars' -> 5.0)
        review_stars = format_cleaner(review['rating'].split()[0])

        product_brand = my_reviews['tech_details'][product_idx]['Brand Name'].title()
        raw_price = my_reviews['price'][product_idx]
        if pd.notnull(raw_price):
            # Prices are rounded to the nearest whole unit
            product_price = round(format_cleaner(raw_price))
        else:
            # Missing prices are kept as they are
            product_price = raw_price

        title.append(review_title)
        text.append(review_body)
        title_text.append(review_joined)
        rating.append(review_stars)
        brand.append(product_brand)
        price.append(product_price)
        length_title_text.append(len(review_joined))

column_names  = ['title', 'text', 'title_text', 'length_title_text', 'rating', 'brand', 'price']
column_values = [title, text, title_text, length_title_text, rating, brand, price]
my_reviews = pd.DataFrame(dict(zip(column_names, column_values)))

my_reviews

Unnamed: 0,title,text,title_text,length_title_text,rating,brand,price
0,An everyday person's review,I have a few friends who work in IT and will r...,An everyday person's review I have a few frien...,2846,5.0,Hp,690.0
1,Fantastic performance for the price! Perfect ...,I went without a laptop for probably 10 years ...,Fantastic performance for the price! Perfect ...,3254,5.0,Hp,690.0
2,Powerful little dude,After years of dealing with electronics that w...,Powerful little dude After years of dealing wi...,1585,4.0,Hp,690.0
3,Some Nice Bells & Whistles,I was looking to replace my chromebook and wan...,Some Nice Bells & Whistles I was looking to re...,954,2.0,Hp,690.0
4,Great laptop unless you are a serious gamer.,"Quality laptop with solid performance specs, t...",Great laptop unless you are a serious gamer. Q...,2760,4.0,Hp,690.0
...,...,...,...,...,...,...,...
2926,Small and cute light weight,Nice,Small and cute light weight Nice,32,4.0,Acer,210.0
2927,Decent cheap laptop,Works like it should,Decent cheap laptop Works like it should,40,4.0,Acer,210.0
2928,The battery charger it's bad quilty,Used for school,The battery charger it's bad quilty Used for s...,51,4.0,Acer,210.0
2929,Not good enough,Was hoping for a more responsive touch screen ...,Not good enough Was hoping for a more responsi...,139,1.0,Acer,210.0


## Exploratory Data Analysis

In [None]:
# Summary statistics of the reviews dataframe (pandas describe())
my_reviews.describe()

In [10]:
# Visualizing the data by plotting a histogram

# https://plotly.com/python/distplot/#combined-statistical-representations-with-pxhistogram
# https://plotly.com/python/histograms/#choosing-the-number-of-bins
# https://plotly.github.io/plotly.py-docs/generated/plotly.express.histogram.html
# https://plotly.github.io/plotly.py-docs/generated/plotly.graph_objects.Histogram.html

# This allows us to use plotly in a Jupyter notebook
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.express as px
import plotly.graph_objs as go

def review_length_histogram(df,column_rating,column_length,labeled=None):
    """Plot an overlaid histogram of review lengths, one colour per sentiment label.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews dataframe. It is not modified: labelling is done on a copy.
    column_rating : str
        Name of the numeric rating column. Only read when ``labeled`` is None.
    column_length : str
        Name of the column plotted on the x axis (review length).
    labeled : optional
        When None (default) the rows whose rating equals 3 are dropped and a
        «label» column is derived from the rating (> 3 -> 'positive', otherwise
        'negative'). Any other value means ``df`` already has a «label» column
        holding 'positive' / 'negative'.

    Returns
    -------
    The plotly figure created by ``px.histogram`` with the layout applied.
    """
    if labeled is None:
        # Work on a copy of the filtered rows: adding a column to a filtered slice
        # triggers pandas' SettingWithCopyWarning and may leak into the caller's frame.
        df = df[df[column_rating] != 3].copy()
        df['label'] = ['positive' if rating > 3 else 'negative' for rating in df[column_rating] ]

    df = df.sort_values(by=['label'], ascending=False)
    n_reviews = len(df)
    n_pos = len(df[df['label'] == 'positive'])
    n_neg = len(df[df['label'] == 'negative'])
    # Guard against an empty dataframe (no division by zero)
    n_pos_percent = round((n_pos*100)/n_reviews) if n_reviews else 0
    n_neg_percent = round((n_neg*100)/n_reviews) if n_reviews else 0

    fig = px.histogram(df, x=column_length, color="label",  barmode="overlay", nbins=150, opacity=0.75, 
                    marginal="box",)  # or violin, rug)
    fig.update_layout(
            margin=go.layout.Margin(l=0, r=40, t=0, b=0), xaxis_tickangle=0,
            uniformtext_minsize=8, uniformtext_mode='hide', coloraxis_showscale=False, height=500, width=1000,
            # Axis titles use title.text / title.font (the «titlefont» shortcut is deprecated)
            yaxis=dict(
                title=dict(text='Count', font=dict(size=15)),
                tickfont=dict(size=15),
            ),
            xaxis=dict(
                title=dict(text='Review length (number of characters)', font=dict(size=15)),
                tickfont=dict(size=15),
            ),
            font=dict(
                size=15,
                color="black"
            ),
            legend=dict(
                y=0.7,
                x=0.81,
                # The legend title doubles as a summary: total reviews, then count and share per label
                title='<span style="font-weight:bold; font-size:18px">{}</span> <span style="font-weight:normal; font-size:18px"> customer reviews</span><br /><span style="color:blue">   {} ({}%)</span><br /><span style="color:red">   {} ({}%)</span>'.format(n_reviews, n_pos, n_pos_percent, n_neg, n_neg_percent),
                font=dict(
                    size=15,
                ),
            ),
        )
    return fig


# Histogram of the review length («length_title_text»), labelled from the «rating» column
fig = review_length_histogram(my_reviews,'rating','length_title_text')
fig

## Text Pre-processing
* **Removing punctuation:**
 * Punctuation: We will remove all punctuation characters listed in the «string» library (`string.punctuation`).
 
* **Removing stopwords:**
 * Our stopwords will be composed of:
  - The common stopwords defined in the nltk library 
  - Some particular stopwords related to our data:
    * Brand names: There is no point in analyzing brand names. For instance, in a Lenovo review, the customer will use the word "Lenovo" many times, but this fact does not contribute anything to the analysis. 
    * Laptop synonyms: laptop, computer, machine, etc.
    * Some unofficial contractions that are not in the nltk library: Im, dont, Ive, etc.

* **Tokenization**


In [11]:
# Defining our stopwords list:
import nltk
import string
nltk.data.path.append('/home/adelo/.nltk/nltk_data')
from nltk.corpus import stopwords

# Domain-specific stopwords added on top of the nltk English list: laptop synonyms, generic
# nouns and apostrophe-less contractions («im», «dont», «ive»).
# NOTE(review): despite its name, this list contains no brand names — confirm whether they should be added.
stopwords_brands_additionals = ['computer','computers','laptop','laptops','thing','things','machine','machines','im','dont','ive']
stopwords_total  = stopwords.words('english') + stopwords_brands_additionals

def pre_processing(texto,tokenize=None):
    """Remove punctuation and stopwords from a string.

    Punctuation (every character in ``string.punctuation``) is removed first;
    the remaining whitespace-separated words are then dropped when their
    lower-cased form is in the module-level ``stopwords_total`` list.

    Parameters
    ----------
    texto : str
        The raw text to clean.
    tokenize : optional
        * If not specified (None), the text is tokenized: a list of the
          remaining words is returned.
        * If any other value is given (e.g. 'no_tokenize'), the text is NOT
          tokenized: the remaining words are returned as one space-joined string.

    Returns
    -------
    list of str, or str
        The cleaned words, as a list or as a single string (see ``tokenize``).
    """
    # Removing punctuation:
    text_no_punctuation = ''.join(char for char in texto if char not in string.punctuation)
    # Removing stopwords. A set gives constant-time membership tests instead of
    # scanning the whole stopword list once per word.
    stopword_set = set(stopwords_total)
    words = [word for word in text_no_punctuation.split() if word.lower() not in stopword_set]
    if tokenize is None:
        return words
    return ' '.join(words)
    

# Example of applying the function «pre_processing()» to the first rows of «title_text»
# (original text first, then the cleaned, non-tokenized version):
display(my_reviews['title_text'].head())
display(my_reviews['title_text'].head().apply(lambda val: pre_processing(val,'no_tokenize')))

# Here is how we would apply the function «pre_processing()» to a column over the entire dataframe.
# However, we won't do that at this stage because we need the raw text for the Sentiment Analysis:
# my_reviews['title']      = my_reviews['title'].apply(lambda val: pre_processing(val,'no_tokenize'))
# my_reviews['text']       = my_reviews['text'].apply(lambda val: pre_processing(val,'no_tokenize'))
# my_reviews['title_text'] = my_reviews['title_text'].apply(lambda val: pre_processing(val,'no_tokenize'))
# display(my_reviews)

0    An everyday person's review I have a few frien...
1    Fantastic performance for the price!  Perfect ...
2    Powerful little dude After years of dealing wi...
3    Some Nice Bells & Whistles I was looking to re...
4    Great laptop unless you are a serious gamer. Q...
Name: title_text, dtype: object

0    everyday persons review friends work rant rave...
1    Fantastic performance price Perfect average us...
2    Powerful little dude years dealing electronics...
3    Nice Bells Whistles looking replace chromebook...
4    Great unless serious gamer Quality solid perfo...
Name: title_text, dtype: object

## Performing the Lexicon-based Sentiment Analysis
* We are performing a Lexicon-based Sentiment Analysis using two popular Python libraries: **TextBlob** and **Vader Sentiment**.

In [12]:
from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# Sentiment polarity and Subjectivity using TextBlob.
# Each review is analysed only once; both scores are read from the same result
# instead of building and analysing a TextBlob twice per review.
textblob_sentiments = [TextBlob(review).sentiment for review in my_reviews['title_text']]
my_reviews['polarity_title_text_textblob']     = [round(sentiment.polarity, 5)     for sentiment in textblob_sentiments]
my_reviews['subjectivity_title_text_textblob'] = [round(sentiment.subjectivity, 5) for sentiment in textblob_sentiments]

# Sentiment polarity using Vader Sentiment (the «compound» score)
my_reviews['polarity_title_text_vader']        = [analyser.polarity_scores(review)['compound']      for review in my_reviews['title_text']]
my_reviews

Unnamed: 0,title,text,title_text,length_title_text,rating,brand,price,polarity_title_text_textblob,subjectivity_title_text_textblob,polarity_title_text_vader
0,An everyday person's review,I have a few friends who work in IT and will r...,An everyday person's review I have a few frien...,2846,5.0,Hp,690.0,0.23604,0.43632,0.9968
1,Fantastic performance for the price! Perfect ...,I went without a laptop for probably 10 years ...,Fantastic performance for the price! Perfect ...,3254,5.0,Hp,690.0,0.29919,0.58010,0.9992
2,Powerful little dude,After years of dealing with electronics that w...,Powerful little dude After years of dealing wi...,1585,4.0,Hp,690.0,0.28183,0.65478,0.9956
3,Some Nice Bells & Whistles,I was looking to replace my chromebook and wan...,Some Nice Bells & Whistles I was looking to re...,954,2.0,Hp,690.0,0.33333,0.51000,0.9118
4,Great laptop unless you are a serious gamer.,"Quality laptop with solid performance specs, t...",Great laptop unless you are a serious gamer. Q...,2760,4.0,Hp,690.0,0.24851,0.51235,0.9980
...,...,...,...,...,...,...,...,...,...,...
2926,Small and cute light weight,Nice,Small and cute light weight Nice,32,4.0,Acer,210.0,0.31250,0.77500,0.7003
2927,Decent cheap laptop,Works like it should,Decent cheap laptop Works like it should,40,4.0,Acer,210.0,0.28333,0.68333,0.3612
2928,The battery charger it's bad quilty,Used for school,The battery charger it's bad quilty Used for s...,51,4.0,Acer,210.0,-0.70000,0.66667,-0.5423
2929,Not good enough,Was hoping for a more responsive touch screen ...,Not good enough Was hoping for a more responsi...,139,1.0,Acer,210.0,0.01000,0.54000,0.6691


In [13]:
def getSentimentAnalysis(reviews_sent_polarities, sentiment_library):
    """Build the (true label, predicted label) dataframe for one sentiment library.

    The function:
     * Removes the neutral reviews based on the customer rating (an Amazon customer
       rating of 3 is considered a neutral review).
     * Labels the rating as «positive» (rating > 3) or «negative».
     * Labels the score of the chosen library:
         - polarity (TextBlob or Vader): «positive» when the score is > 0, otherwise
           «negative». Note that a polarity of exactly 0 can still occur after the
           neutral ratings are removed; it is labelled «negative».
         - subjectivity (TextBlob): «subjective» when the score is > 0.5, otherwise
           «objective» (TextBlob's subjectivity goes from 0 = objective to 1 = subjective).

    Parameters
    ----------
    reviews_sent_polarities : pandas.DataFrame
        Reviews dataframe with a numeric «rating» column and the score columns
        «polarity_title_text_textblob», «polarity_title_text_vader» and
        «subjectivity_title_text_textblob». It is not modified.
    sentiment_library : str
        'textblob' or 'vader' select the corresponding polarity column; any other
        value (e.g. 'subjective') selects the TextBlob subjectivity column.

    Returns
    -------
    pandas.DataFrame
        A new dataframe (original index kept) with 2 columns:
         - «rating»: the true sentiment given by the Amazon customer rating (y_true)
         - the score column: the label given by the Sentiment Analysis library (y_pred)
    """
    # (score column, label above the threshold, label otherwise, threshold)
    if sentiment_library == 'textblob':
        score_column, label_above, label_other, threshold = 'polarity_title_text_textblob', 'positive', 'negative', 0
    elif sentiment_library == 'vader':
        score_column, label_above, label_other, threshold = 'polarity_title_text_vader', 'positive', 'negative', 0
    else:
        score_column, label_above, label_other, threshold = 'subjectivity_title_text_textblob', 'subjective', 'objective', 0.5

    # Here we remove the neutral entries. The explicit copy keeps the caller's dataframe
    # untouched and avoids pandas' SettingWithCopyWarning on the assignments below.
    is_not_neutral = reviews_sent_polarities['rating'] != 3
    sent_analysis_df = reviews_sent_polarities.loc[is_not_neutral, ['rating', score_column]].copy()

    sent_analysis_df['rating'] = ['positive' if rating > 3 else 'negative'
                                  for rating in sent_analysis_df['rating'].tolist()]
    sent_analysis_df[score_column] = [label_above if score > threshold else label_other
                                      for score in sent_analysis_df[score_column].tolist()]

    return sent_analysis_df

# True label (from the rating) next to the label derived from the TextBlob polarity
my_reviews_sent_textblob = getSentimentAnalysis(my_reviews, 'textblob')
my_reviews_sent_textblob

Unnamed: 0,rating,polarity_title_text_textblob
0,positive,positive
1,positive,positive
2,positive,positive
3,negative,positive
4,positive,positive
...,...,...
2926,positive,positive
2927,positive,positive
2928,positive,negative
2929,negative,positive


## Evaluation of the Lexicon-based Sentiment Analysis 

### Confusion Matrix

In [14]:
# https://plotly.com/python/annotated-heatmap/#simple-annotated-heatmap
# https://stackoverflow.com/questions/60860121/plotly-how-to-make-an-annotated-confusion-matrix-using-a-heatmap

import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

def ConfusionMatrixHeatmap(y_true, y_pred):
    """Return a heatmap visualization of the binary confusion matrix.

    Parameters
    ----------
    y_true : The Amazon customer rating labels («positive» / «negative»)
    y_pred : The predicted Sentiment polarity labels («positive» / «negative»)

    Returns
    -------
    A plotly annotated heatmap. Rows are the actual values and columns the
    predicted values, «Positive» first. Each cell is annotated with its name
    (TP / FN / FP / TN) and its count.
    """
    cm = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=['positive','negative']), 
        index   = ['Actual:positive',    'Actual:negative'   ], 
        columns = ['Predicted:positive', 'Predicted:negative'],
    )

    # Explicit (row, column) lookups instead of chained indexing
    TP = cm.loc['Actual:positive', 'Predicted:positive']
    FN = cm.loc['Actual:positive', 'Predicted:negative']
    FP = cm.loc['Actual:negative', 'Predicted:positive']
    TN = cm.loc['Actual:negative', 'Predicted:negative']

    cm_anotations = [['TP<br />{}'.format(TP),'FN<br />{}'.format(FN)],
                    ['FP<br />{}'.format(FP),'TN<br />{}'.format(TN)]]
    # The heatmap values are only colour codes (0 = correct prediction, 1 = error);
    # the counts themselves are shown through the annotations.
    cm_colors = [[0,1],[1,0]]
    fig = ff.create_annotated_heatmap(
        cm_colors, 
        x=['Positive', 'Negative'], 
        y=['Positive', 'Negative'], 
        annotation_text=cm_anotations, 
        colorscale = [[0, '#6495ED'], [1, '#b35050']],
    )
    # Reverse the y axis so that the first row («Positive») is drawn at the top
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(
        # Axis titles use title.text / title.font (the «titlefont» shortcut is deprecated).
        # The rows hold the actual (true) labels, hence the y-axis title.
        yaxis=dict(
                title=dict(text='Actual  value', font=dict(size=20)),
                tickfont=dict(size=20),
                color="black",
        ),
        xaxis=dict(
                title=dict(text='Predicted  value', font=dict(size=20)),
                tickfont=dict(size=20),
                color="black",
        ),
        font=dict(
            size=25,
        )
    )
    fig.update_layout(margin=dict(t=1, b=1, r=1, l=1), yaxis_tickangle=-90)

    return fig

In [15]:
# Labelled predictions of each library (neutral reviews removed)
my_reviews_sent_textblob = getSentimentAnalysis(my_reviews, 'textblob')
my_reviews_sent_vader    = getSentimentAnalysis(my_reviews, 'vader')

# Confusion matrix heatmap: TextBlob first, then Vader
fig_conf_matrix_textblob = ConfusionMatrixHeatmap(my_reviews_sent_textblob['rating'], my_reviews_sent_textblob['polarity_title_text_textblob'])
display(fig_conf_matrix_textblob)
fig_conf_matrix_vader = ConfusionMatrixHeatmap(my_reviews_sent_vader['rating'], my_reviews_sent_vader['polarity_title_text_vader'])
display(fig_conf_matrix_vader)

#### Confusion Matrix pie chart

In [16]:
def confusion_matrix_pie_chart(y_true, y_pred):
    """Return a pie chart with the share of TP, TN, FP and FN predictions.

    Parameters
    ----------
    y_true : the true labels («positive» / «negative»)
    y_pred : the predicted labels («positive» / «negative»)

    Returns
    -------
    A plotly figure with one slice per confusion-matrix cell, drawn clockwise
    in the fixed order TP, TN, FP, FN.
    """
    counts = pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=['positive','negative']),
        index=['Actual:positive', 'Actual:negative'],
        columns=['Predicted:positive', 'Predicted:negative'],
    )

    true_pos  = counts.loc['Actual:positive', 'Predicted:positive']
    false_neg = counts.loc['Actual:positive', 'Predicted:negative']
    false_pos = counts.loc['Actual:negative', 'Predicted:positive']
    true_neg  = counts.loc['Actual:negative', 'Predicted:negative']

    # Correct predictions first (blue tones), then the errors (red tones)
    slice_labels = ['TP', 'TN', 'FP', 'FN']
    slice_values = [true_pos, true_neg, false_pos, false_neg]
    slice_colors = ['#6495ED', '#007bff', '#b35050', '#9c2828']

    pie = go.Pie(labels=slice_labels, values=slice_values, direction='clockwise', sort=False)
    fig = go.Figure(data=[pie])
    fig.update_layout(
        margin=go.layout.Margin(l=0, r=0, t=10, b=10),
        showlegend=False,
    )
    fig.update_traces(textinfo='label+percent', textfont_size=18,
                      marker=dict(colors=slice_colors))

    return fig

In [17]:
# Labelled predictions of each library (neutral reviews removed)
my_reviews_sent_textblob = getSentimentAnalysis(my_reviews, 'textblob')
my_reviews_sent_vader    = getSentimentAnalysis(my_reviews, 'vader')

# Confusion matrix pie chart: TextBlob first, then Vader
fig = confusion_matrix_pie_chart(my_reviews_sent_textblob['rating'], my_reviews_sent_textblob['polarity_title_text_textblob'])
display(fig)
fig = confusion_matrix_pie_chart(my_reviews_sent_vader['rating'], my_reviews_sent_vader['polarity_title_text_vader'])
display(fig)

### Classification report

In [18]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# Note that in binary classification, recall of the positive class is also known as "sensitivity"; recall of the negative class is "specificity".
   
import numpy as np
import plotly.figure_factory as ff

from sklearn.metrics import classification_report

def classif_report_heatmap(y_true, y_pred):
    """Return sklearn's classification report rendered as an annotated heatmap.

    Parameters
    ----------
    y_true : the true labels («positive» / «negative»)
    y_pred : the predicted labels («positive» / «negative»)

    Returns
    -------
    A plotly annotated heatmap with one row per report line (positive, negative,
    a blank spacer, accuracy, macro avg, weighted avg) and one column per metric
    (precision, recall, f1-score, support). Metric cells are coloured by their
    value rounded to 2 decimals; support cells are not coloured and only show
    their value.

    Side effects
    ------------
    Prints the heading 'Classification report:'.
    """
    print('\nClassification report:')

    # The report is computed once (only the dictionary form is needed)
    report = classification_report(y_true, y_pred, labels=['positive','negative'], output_dict=True)

    metrics = ['precision', 'recall', 'f1-score']

    def metric_row(report_key):
        # Three metric values followed by a 0 placeholder for the «support» column
        return [report[report_key][metric] for metric in metrics] + [.0]

    # Values that drive the cell colours; 0 maps to white in the colorscale below
    classif_report_colors = np.round(np.array([
        metric_row('positive'),
        metric_row('negative'),
        [.0, .0, .0, .0],                   # blank spacer row
        [.0, .0, report['accuracy'], .0],   # accuracy is shown under «f1-score»
        metric_row('macro avg'),
        metric_row('weighted avg'),
    ]), 2)

    classif_report_anotations = [[str(value) for value in row] for row in classif_report_colors]

    # Blank out the cells that carry no information
    classif_report_anotations[2]    = [' ' for _ in classif_report_anotations[2]]
    classif_report_anotations[3][0] = ' '
    classif_report_anotations[3][1] = ' '

    # The «support» column shows the counts (its colour value stays 0).
    # The accuracy row reuses the «macro avg» support.
    classif_report_anotations[0][3] = report['positive']['support']
    classif_report_anotations[1][3] = report['negative']['support']
    classif_report_anotations[3][3] = report['macro avg']['support']
    classif_report_anotations[4][3] = report['macro avg']['support']
    classif_report_anotations[5][3] = report['weighted avg']['support']

    fig = ff.create_annotated_heatmap(
        classif_report_colors,
        x=['precision','recall','f1-score','support'],
        y=['positive','negative',' ','accuracy','macro avg','weighted avg'],
        annotation_text=classif_report_anotations,
        colorscale=[[ 0,  'white'],
                    [.01, 'red'],
                    [.3,  '#f75454'],
                    [.5,  '#eb8d8d'],
                    [.7,  '#a5bdfa'],
                    [.8,  '#779bf7'],
                    [.9,  '#366eff'],
                    [ 1,  '#0048ff']]
    )
    # Reverse the y axis so that the first row is drawn at the top
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(
        # title.font replaces the deprecated «titlefont» shortcut
        yaxis=dict(
            title=dict(font=dict(size=25)),
            tickfont=dict(size=18),
            color="black",
        ),
        xaxis=dict(
            title=dict(font=dict(size=25)),
            tickfont=dict(size=18),
            color="black",
        ),
        font=dict(
            size=20,
        ),
        margin=dict(t=20, b=20, r=20, l=20)
    )
    fig['data'][0]['showscale'] = True
    return fig

In [19]:
# Labelled predictions of each library (neutral reviews removed)
my_reviews_sent_textblob = getSentimentAnalysis(my_reviews, 'textblob')
my_reviews_subj_textblob = getSentimentAnalysis(my_reviews, 'subjective')
my_reviews_sent_vader    = getSentimentAnalysis(my_reviews, 'vader')

# Classification report heatmap: TextBlob first, then Vader
fig = classif_report_heatmap(my_reviews_sent_textblob['rating'], my_reviews_sent_textblob['polarity_title_text_textblob'])
display(fig)
fig = classif_report_heatmap(my_reviews_sent_vader['rating'], my_reviews_sent_vader['polarity_title_text_vader'])
display(fig)


Classification report:



Classification report:


### A common visualization of sentiment polarities

In [20]:
import plotly.graph_objects as go

def sentimetsDonutChart(y, labels, title1, title1_size, colors):
    """Return a donut chart with the share of the two classes found in ``y``.

    Parameters
    ----------
    y : label values supporting element-wise comparison and boolean masking
        (a pandas Series in this notebook).
    labels : two display labels for the slices. When the first one is 'Pos.'
        the 'positive' / 'negative' values are counted; otherwise the
        'objective' / 'subjective' values are counted.
    title1 : text written in the centre of the donut.
    title1_size : font size of the centre text.
    colors : the two slice colours.

    Returns
    -------
    The plotly figure.
    """
    if labels[0] == 'Pos.':
        first_class, second_class = 'positive', 'negative'
    else:
        first_class, second_class = 'objective', 'subjective'

    values = [len(y[y == first_class]), len(y[y == second_class])]

    donut = go.Pie(labels=labels, values=values, hole=.5)
    centre_caption = dict(text=title1, x=0.5, y=0.5, font_size=title1_size, showarrow=False)

    fig = go.Figure(data=[donut])
    fig.update_layout(
        margin=go.layout.Margin(l=0, r=0, t=10, b=10),
        showlegend=False,
        annotations=[centre_caption])
    fig.update_traces(textinfo='label+percent', textfont_size=18,
                      marker=dict(colors=colors) )
    return fig

In [21]:
# Labelled predictions of each library (neutral reviews removed)
my_reviews_sent_textblob = getSentimentAnalysis(my_reviews, 'textblob')
my_reviews_subj_textblob = getSentimentAnalysis(my_reviews, 'subjective')
my_reviews_sent_vader    = getSentimentAnalysis(my_reviews, 'vader')

# Donut 1: true labels derived from the customer rating
print('True polarities given by the Amazon rating score')
fig1 = sentimetsDonutChart(my_reviews_sent_textblob['rating'], ['Pos.','Neg.'], 'Customer<br />rating', 20, ['#007bff','#9c2828'] )
display(fig1)

# Donut 2: labels derived from the TextBlob polarity
print('TextBlob Sentiment Polarities')
fig2 = sentimetsDonutChart(my_reviews_sent_textblob['polarity_title_text_textblob'], ['Pos.','Neg.'], 'Polarity', 20, ['#6495ed','#b35050'] )
display(fig2)

# Donut 3: labels derived from the TextBlob subjectivity
print('TextBlob Sentiment subjectivity')
fig3 = sentimetsDonutChart(my_reviews_subj_textblob['subjectivity_title_text_textblob'], ['Obj.','Subj.'], 'Subjectivity', 20, ['#6868a3','#9cba95'] )
display(fig3)

# Donut 4: labels derived from the Vader polarity
print('Vader Sentiment Polarities')
fig4 = sentimetsDonutChart(my_reviews_sent_vader['polarity_title_text_vader'], ['Pos.','Neg.'], 'Polarity', 20, ['#6495ed','#b35050'] )
display(fig4)

True polarities given by the Amazon rating score


TextBlob Sentiment Polarities


TextBlob Sentiment subjectivity


Vader Sentiment Polarities


## Emotion analysis using the NRC Lexicon

In [22]:
# https://plotly.com/python/bar-charts/

import numpy as np
import plotly.graph_objs as go

def emotionsBarChart(text):
    """Return a horizontal bar chart with the emotion counts found in ``text``.

    The NRC word-level emotion lexicon is read from a tab-separated file
    (columns: word, emotion, association) and pivoted into one row per word and
    one column per emotion. The text is split on whitespace and, for every token
    that matches a lexicon word exactly, the association values of that word are
    added to the emotion totals (once per occurrence).

    Tokens are matched verbatim: no lower-casing or punctuation removal is done
    here, so e.g. a token with a trailing comma will not match.
    NOTE(review): presumably the lexicon words are lower-case — consider
    normalizing the text before calling this function.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    A plotly figure with one bar per emotion, sorted by ascending count and
    labelled with the count.

    Notes
    -----
    The lexicon file is read (relative to the working directory) on every call.
    """
    filepath = ('NRC-Sentiment-Emotion-Lexicons/'
                'NRC-Emotion-Lexicon-v0.92/'
                'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')

    lexiEmo_df0  = pd.read_csv(filepath,
                            names=["word", "emotion", "association"],
                            sep='\t')

    # One row per word (the index), one column per emotion
    lexiEmo_df  = lexiEmo_df0.pivot(index='word',
                                    columns='emotion',
                                    values='association')

    # Number of occurrences of every distinct token, restricted to lexicon words.
    # The index lookup replaces rebuilding and scanning the full word list per token.
    token_counts   = pd.Series(text.split(), dtype=object).value_counts()
    matched_counts = token_counts[token_counts.index.isin(lexiEmo_df.index)]

    # Weighted sum: (occurrences per word) x (word-by-emotion associations)
    emotion_totals = matched_counts.to_numpy() @ lexiEmo_df.loc[matched_counts.index].to_numpy()

    counterEmo_df = pd.DataFrame({'emotion': lexiEmo_df.columns, 'count': emotion_totals})
    counterEmo_df = counterEmo_df.sort_values(by=['count'],ascending=True)

    fig = go.Figure([
        go.Bar(
            # text must hold the values to print; the literal string 'count'
            # would be written on every bar
            x=counterEmo_df['count'], y=counterEmo_df['emotion'], orientation='h', text=counterEmo_df['count']
        )
    ])
    fig.update_layout(
        margin=go.layout.Margin(l=5, r=5, t=5, b=5),
        uniformtext_minsize=8, uniformtext_mode='hide',
        font=dict(
            size=17,
            color="black"
        )
    )
    fig.update_yaxes(automargin=True)
    fig.update_traces(marker_color='#007bff')

    return fig

### Emotions in HP Laptops reviews

In [23]:
# Join every review of the selected brand(s) into one text and chart its emotions
brand  = ['Hp']
reviews_text_brand = ' '.join(my_reviews.query('brand in @brand')['title_text'].tolist())

emoChart = emotionsBarChart(reviews_text_brand)
emoChart

### Emotions in Lenovo Laptops reviews

In [24]:
# Join every review of the selected brand(s) into one text and chart its emotions
brand  = ['Lenovo']
reviews_text_brand = ' '.join(my_reviews.query('brand in @brand')['title_text'].tolist())

emoChart = emotionsBarChart(reviews_text_brand)
emoChart

# Supervised Machine Learning Model for Sentiment Analysis

In [25]:
# This function takes our raw review dataframe, removes the neutral reviews
# and returns a simplified dataframe with only the columns we need for this phase:
#  * A column named «review» that is the original «title_tex» columns in our raw dataframe
#  * A column named «label»  that is the labeled columns («positive» / «negative») of the 
#    original «rating» column of our raw dataframe
#  * A column named «length» that is the original «title_text_length» in our raw dataframe

def labeling_data(data, column_review, column_rating, column_length=None):
    """Label reviews as «positive» / «negative» and drop the neutral ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw reviews dataframe. It is not modified.
    column_review : str
        Name of the column holding the review text; returned as «review».
    column_rating : str
        Name of the column holding the numeric rating. Rows whose rating is
        exactly 3 are considered neutral and removed.
    column_length : str, optional
        Name of the column holding the review length; returned as «length».
        When omitted, the result has no «length» column.

    Returns
    -------
    pandas.DataFrame
        Columns «review», «label» and (optionally) «length», keeping the
        original index of the rows that were not neutral. «label» is
        'positive' for ratings above 3 and 'negative' otherwise.
    """
    # Collecting, in one place, what depends on the optional length column
    selected_columns = [column_review, column_rating]
    new_names = {column_review: 'review'}
    output_columns = ['review', 'label']
    if column_length is not None:
        selected_columns.append(column_length)
        new_names[column_length] = 'length'
        output_columns.append('length')

    # We get rid of neutral reviews. The explicit .copy() makes the result an
    # independent dataframe, so adding the «label» column below neither touches
    # the caller's data nor raises pandas' SettingWithCopyWarning.
    data_labeled = data.loc[data[column_rating] != 3, selected_columns].copy()

    # «positive» or «negative» depending on the customer rating
    data_labeled['label'] = [
        'positive' if rating > 3 else 'negative'
        for rating in data_labeled[column_rating]
    ]

    data_labeled = data_labeled.rename(columns=new_names)
    return data_labeled[output_columns]


my_reviews_ml = labeling_data(my_reviews, 'title_text', 'rating', 'length_title_text')
my_reviews_ml

Unnamed: 0,review,label,length
0,An everyday person's review I have a few frien...,positive,2846
1,Fantastic performance for the price! Perfect ...,positive,3254
2,Powerful little dude After years of dealing wi...,positive,1585
3,Some Nice Bells & Whistles I was looking to re...,negative,954
4,Great laptop unless you are a serious gamer. Q...,positive,2760
...,...,...,...
2926,Small and cute light weight Nice,positive,32
2927,Decent cheap laptop Works like it should,positive,40
2928,The battery charger it's bad quilty Used for s...,positive,51
2929,Not good enough Was hoping for a more responsi...,negative,139


## A Supervised Model using «my_reviews» dataset
* We will first create a model using our small Amazon Reviews Dataset («my_reviews»)

### Building a Supervised Naive Bayes Model step by step
* We will first build the model step by step, performing a series of tests along the way so we can understand the process

In [26]:
# Text pre-processing and Tokenization
# ====================================
# We will use the «pre_processing» function created above; «pre_processing» will be passed as an argument
# to the CountVectorizer object



# Labeling the data
# =================
# Labeling the data as 'positive' or 'negative' depending on the customer rating (amazon rating score [1-5]) 
# using the function defined above
my_reviews_ml = labeling_data(my_reviews, 'title_text', 'rating')
my_reviews_ml



# Data splitting
# ==============
from sklearn.model_selection import train_test_split
train, test = train_test_split(my_reviews_ml, test_size=.3, random_state=0)
# display(train)
# display(test)



# Creating a «Bag-of-Words»
# =========================
from sklearn.feature_extraction.text import CountVectorizer

# This creates a «Bag-of-Words (bow) transformer object» (It is not the resulting DTM yet)
# There are a lot of arguments and parameters that can be passed to the CountVectorizer. In this case we will just specify the analyzer to be our own previously defined «pre_processing» function:
# Might take a while...

# This creates a bow transformed object
train_bow_transformer = CountVectorizer(analyzer=pre_processing).fit(train['review'])

## Let's perform some tests on the bow transformer object
##* Print total number of vocab words:
#print(len(train_bow_transformer.vocabulary_))

##* Let's take one review and get its bag-of-words counts as a vector, putting to use our new train_bow_transformer:
#train_review4 = train['review'][3]
#print(train_review4)

##* Now let's see its vector representation:
#train_bow4 = train_bow_transformer.transform([train_review4])
#print(train_bow4 )
#print(train_bow4.shape)

##* Let's go ahead and check and confirm which ones appear twice:
#print(train_bow_transformer.get_feature_names()[15989])

# Now we can use «.transform» on our «Bag-of-Words (bow) transformed object» and transform the entire DataFrame of reviews.
train_bow = train_bow_transformer.transform(train['review'])

## Let's go ahead and check out how the bag-of-words counts for the entire review corpus is a large, sparse matrix:
#print('Shape of Sparse Matrix: ',        train_bow.shape)
#print('Amount of Non-Zero occurences: ', train_bow.nnz)

#sparsity = (100.0 * train_bow.nnz / (train_bow.shape[0] * train_bow.shape[1]))
#print('sparsity: {}'.format(round(sparsity)))



# Term weighting and Normalization using TF-IDF
# =============================================

#* Using TfidfTransformer method from Scikit-learn to compute the TF-IDF
#  Term weighting and Normalization can be done with TF-IDF, using scikit-learn's TfidfTransformer

from sklearn.feature_extraction.text import TfidfTransformer

# This creates a train_tfidf_transformer object
train_tfidf_transformer = TfidfTransformer().fit(train_bow)

## Let's perform some tests on the train_tfidf_transformer object
##train_tfidf4 = train_tfidf_transformer.transform(train_bow4)
##print(train_tfidf4)

## We'll go ahead and check what is the IDF (inverse document frequency) of the word "looking"?
#print(train_tfidf_transformer.idf_[train_bow_transformer.vocabulary_['looking']])

# To transform the entire bag-of-words corpus into TF-IDF corpus:
train_tfidf = train_tfidf_transformer.transform(train_bow)
#print(train_tfidf.shape)



# Training the model
# ==================
# Naive Bayes classifier using scikit-learn

from sklearn.naive_bayes import MultinomialNB

# Creating the naive bayes model
naive_bayes_model = MultinomialNB().fit(train_tfidf, train['label'])

## Let's try classifying our single random review and checking how we do:
#print('Predicted:', naive_bayes_model.predict(train_tfidf4)[0])
#print('Expected:',  train.label[3])



# Let's classify our train data
# =============================
train_predictions = naive_bayes_model.predict(train_tfidf)


# Let's classify our test data
# ============================
# The test reviews must go through the very same transformers that were fitted
# on the training data. Fitting a new TfidfTransformer on the test set would
# compute IDF weights from the test documents, so the features given to the
# model would be weighted differently from the ones it was trained on (and the
# test data would leak into the feature computation).
test_bow = train_bow_transformer.transform(test['review'])
test_tfidf = train_tfidf_transformer.transform(test_bow)
test_predictions = naive_bayes_model.predict(test_tfidf)


### Building a Supervised Naive Bayes Model using «Pipeline»
* To simplify the process above, we can use «Pipeline»

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


# Preparing the dataset: labeling the reviews as «positive» / «negative» and removing
# the neutral ones with the labeling_data() function defined above
my_reviews_ml = labeling_data(my_reviews, 'title_text', 'rating')


# Data splitting: 70% of the reviews for training and 30% for testing.
# «random_state» is fixed so the split is the same every time the cell is run.
x = my_reviews_ml['review']
y = my_reviews_ml['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)


# Creating a Naive Bayes classifier Pipeline.
# The Pipeline chains the three steps, so a single call to fit() / predict() runs all of them in order.
nbayes_model_my_reviews = Pipeline([
    ('bow',        CountVectorizer(analyzer=pre_processing)),  # Creating a «Bag-of-Words»; text pre-processing and tokenization are done by «pre_processing»
    ('tfidf',      TfidfTransformer()), # Term weighting and Normalization using TF-IDF
    ('classifier', MultinomialNB()),    # Training Naive Bayes classifier using scikit-learn
])


# Training the Naive Bayes classifier (only the training portion of the data is used)
nbayes_model_my_reviews.fit(x_train,y_train)


# y_train_pred = nbayes_model_my_reviews.predict(x_train)
#
# display(ConfusionMatrixHeatmap(y_train, y_train_pred))
# display(confusion_matrix_pie_chart(y_train, y_train_pred))
# display(classif_report_heatmap(y_train, y_train_pred))

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function pre_processing at 0x7fa7494449e0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

### Classifying our test data

In [28]:
# Using the model created to classify our test data
y_test_pred  = nbayes_model_my_reviews.predict(x_test)
y_test_pred

array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positi

### Model evaluation

In [29]:
display(ConfusionMatrixHeatmap(y_test, y_test_pred))
display(confusion_matrix_pie_chart(y_test, y_test_pred))
display(classif_report_heatmap(y_test, y_test_pred))


Classification report:


## A Supervised Model using the «reviews_wang» dataset
* In this second part we will create a Supervised Naive Bayes model using an Amazon review dataset from [Wang et al (2010)]
  
  The dataset has been taken from http://sifaka.cs.uiuc.edu/~wang296/Data/index.html
    * We have taken the dataset named «Six Categories of Amazon Product Reviews» in the above link.
    * To simplify the data, from that dataset, we have only taken the Laptop reviews
    * There is a total of 40,762 Laptop reviews

### Data preparation

#### Loading the data

In [36]:
# https://stackoverflow.com/questions/30539679/python-read-several-json-files-from-a-folder
    
import os, json
import pandas as pd

path_to_json = '/home/adelo/1-system/1-disco_local/1-mis_archivos/1-pe/1-ciencia/1-computer_science_an_IT/2-data_science/1-Amazon_Laptops_Dashboard/0AmazonLaptopsDashboard/DataAnalysis/AmazonReviews-Hongning_Wang_2010/laptops/'

# os.listdir() returns the entries in arbitrary (filesystem dependent) order, so the
# file names are sorted to get the same row order every time the notebook is run
json_files = sorted(pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json'))

# Each json file holds the reviews of one product under the 'Reviews' key.
# The lists of reviews are first collected into a plain Python list and the
# dataframe is created once at the end, which is much faster than adding the
# rows one by one with «.loc».
reviews_per_file = []
for js in json_files:
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text = json.load(json_file)
    reviews = json_text['Reviews']
    reviews_per_file.append(reviews)

# dtype=object so that every cell holds the whole list of reviews of one file
reviews_wang = pd.DataFrame({'reviews': pd.Series(reviews_per_file, dtype=object)})

print(reviews_wang)

                                                reviews
0     [{'Title': 'HP Laptop 15-d035dx "MINUS 10 Star...
1     [{'Title': 'So disappointed :(', 'Author': 'Ja...
2     [{'Title': 'product', 'Author': 'imerialadrian...
3                                                    []
4     [{'Title': 'Good laptop', 'Author': 'Felix', '...
...                                                 ...
2468  [{'Title': 'Speed Demon', 'Author': 'K. Young ...
2469  [{'Title': 'Not so fast*****', 'Author': 'Aunt...
2470  [{'Title': 'I finally see the point of the Mac...
2471                                                 []
2472  [{'Title': 'Good business laptop', 'Author': '...

[2473 rows x 1 columns]


#### Formatting

In [37]:
# Flattening the data: «reviews_wang» has one row per product (each row holding a
# list of review dictionaries); the result has one row per review.
title      = []
text       = []
title_text = []
rating     = []
length     = []
for lista in reviews_wang['reviews']:
    # Some products have no reviews at all (an empty list, or None); skipping them.
    # (The former check «lista != 0» compared a list with a number, so it was always true.)
    if not lista:
        continue

    for dic in lista:
        title_value      = dic['Title']
        text_value       = dic['Content']
        # The title and the body of the review are analysed together as one text
        title_text_value = '{} {}'.format(title_value,text_value)
        rating_value     = dic['Overall']
        # Length of the review, in characters
        length_value     = len(title_text_value)

        title.append(title_value)
        text.append(text_value)
        title_text.append(title_text_value)
        rating.append(rating_value)
        length.append(length_value)

# NOTE: «reviews_wang» is replaced by the flattened dataframe, so this cell can not be
# run twice in a row without loading the data again.
reviews_wang = pd.DataFrame({'title'      : title,
                             'text'       : text,
                             'title_text' : title_text,
                             'rating'     : rating,
                             'length'     : length})

# The raw rating is cleaned with the «format_cleaner» function and rounded to an integer
reviews_wang['rating'] = reviews_wang['rating'].apply(lambda val: round(format_cleaner(val)))
reviews_wang

Unnamed: 0,title,text,title_text,rating,length
0,"HP Laptop 15-d035dx ""MINUS 10 Stars!""","""MINUS 10 Stars!""This thing is a TOTAL NIGHTMA...","HP Laptop 15-d035dx ""MINUS 10 Stars!"" ""MINUS 1...",1,2049
1,Very Good Laptop,This is a Very Good Laptop for anyone age 12 -...,Very Good Laptop This is a Very Good Laptop fo...,5,382
2,Wonderful,Great laptop! Thank you so much!! The only p...,Wonderful Great laptop! Thank you so much!! ...,5,132
3,Not Working Properly,The computer has a lot of glitches in it. It'...,Not Working Properly The computer has a lot of...,1,572
4,Good to go!,Good notebook for simple tasks (everything but...,Good to go! Good notebook for simple tasks (ev...,4,147
...,...,...,...,...,...
40757,Amazing!,I bought this laptop for $300 and its a very g...,Amazing! I bought this laptop for $300 and its...,5,2184
40758,Worst touchpad EVER,Not only are the touchpad buttons stiff and di...,Worst touchpad EVER Not only are the touchpad ...,2,391
40759,"DM3 Series are GREAT laptops, BUY ONE!!!","I've had many laptops in my day, and nothing c...","DM3 Series are GREAT laptops, BUY ONE!!! I've ...",5,760
40760,Good business laptop,I bought this computer about 2 years ago. It s...,Good business laptop I bought this computer ab...,4,1097


#### Removing the neutral reviews and labeling the data («positive» / «negative»)
* We are using the **labeling_data()** function created above

In [38]:
reviews_wang = labeling_data(reviews_wang,'title_text', 'rating','length')
reviews_wang

Unnamed: 0,review,label,length
0,"HP Laptop 15-d035dx ""MINUS 10 Stars!"" ""MINUS 1...",negative,2049
1,Very Good Laptop This is a Very Good Laptop fo...,positive,382
2,Wonderful Great laptop! Thank you so much!! ...,positive,132
3,Not Working Properly The computer has a lot of...,negative,572
4,Good to go! Good notebook for simple tasks (ev...,positive,147
...,...,...,...
40757,Amazing! I bought this laptop for $300 and its...,positive,2184
40758,Worst touchpad EVER Not only are the touchpad ...,negative,391
40759,"DM3 Series are GREAT laptops, BUY ONE!!! I've ...",positive,760
40760,Good business laptop I bought this computer ab...,positive,1097


### Exploratory Data Analysis

In [39]:
review_length_histogram(reviews_wang,'label','length','labeled')

### Balancing the data

In [40]:
# This function takes a dataframe with the format returned by the labeling_data() function and 
# balances the «positive» and «negative» reviews. To do so, 
#  * It counts the number of positive and negative reviews
#  * It removes rows from the class that has more reviews (it removes the shortest reviews) so both 
#    classes will have the same number of reviews
# This is done because in many cases the balance between classes is an important factor when building
# a Supervised Model

def balance_pos_neg(data):
    """Return a copy of «data» with as many «positive» as «negative» reviews.

    «data» must have a «label» column ('positive' / 'negative') and a
    «length» column. The larger class is cut down to the size of the smaller
    one by keeping its longest reviews. The rows of the result keep their
    original index and are returned sorted by that index.
    """
    # Each class sorted from the longest review to the shortest one
    sorted_by_class = {}
    for class_name in ('positive', 'negative'):
        class_rows = data[data['label'] == class_name]
        sorted_by_class[class_name] = class_rows.sort_values(by=['length'], ascending=False)

    # Size of the smaller class: that is how many reviews we keep from each class
    keep = min(len(sorted_by_class['positive']), len(sorted_by_class['negative']))

    longest_per_class = [
        sorted_by_class['positive'][0:keep],
        sorted_by_class['negative'][0:keep],
    ]

    # Putting both classes together again, in the original row order
    return pd.concat(longest_per_class, axis=0, ignore_index=False).sort_index()

In [41]:
reviews_wang_balanced = balance_pos_neg(reviews_wang)
reviews_wang_balanced
review_length_histogram(reviews_wang_balanced,'label','length','labeled')

### Building the Naive Bayes Model using the «reviews_wang» dataset

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


# Data splitting: 70% of the balanced reviews for training and 30% for testing.
# «random_state» is fixed so the split is the same every time the cell is run.
x = reviews_wang_balanced['review']
y = reviews_wang_balanced['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)


# Creating a Naive Bayes classifier Pipeline
nbayes_model_wang = Pipeline([
    ('bow',        CountVectorizer(analyzer=pre_processing)),  # Creating a «Bag-of-Words»; «pre_processing» is used as the analyzer
    ('tfidf',      TfidfTransformer()),                        # Term weighting and Normalization using TF-IDF
    ('classifier', MultinomialNB()),                           # Naive Bayes classifier
])


# Training the Naive Bayes classifier (only the training portion of the data is used)
nbayes_model_wang.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function pre_processing at 0x7fa7494449e0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

### Classifying our test data

In [43]:
# Using the model created to classify our test data
y_test_pred  = nbayes_model_wang.predict(x_test)
y_test_pred

array(['negative', 'positive', 'negative', ..., 'negative', 'negative',
       'positive'], dtype='<U8')

### Model evaluation

In [44]:
display(ConfusionMatrixHeatmap(y_test, y_test_pred))
display(confusion_matrix_pie_chart(y_test, y_test_pred))
display(classif_report_heatmap(y_test, y_test_pred))


Classification report:


### Classifying «my_reviews» dataset and evaluating the results
* Here we are using the Model created with the «reviews_wang» dataset to classify «my_reviews» dataset
* Testing a model using another dataset (not a portion of the same dataset reserved as test data) is always a better way to evaluate the performance of a model.

In [None]:
# Labeling the whole «my_reviews» dataset («positive» / «negative», neutral reviews removed)
my_reviews_ml = labeling_data(my_reviews,'title_text', 'rating')

# Here the complete «my_reviews» dataset is used as test data: none of it was used to
# train «nbayes_model_wang». Note that «x_test» and «y_test» from the previous cells
# are overwritten.
x_test = my_reviews_ml['review']
y_test = my_reviews_ml['label']

# Classifying the reviews with the model trained on the «reviews_wang» dataset
y_test_pred = nbayes_model_wang.predict(x_test)

# Comparing the real labels with the predicted ones, using the evaluation
# functions defined earlier in the notebook
display(ConfusionMatrixHeatmap(y_test, y_test_pred))
display(confusion_matrix_pie_chart(y_test, y_test_pred))
display(classif_report_heatmap(y_test, y_test_pred))