# Data preparation & Text Pre-processing

## Loading the data

In [None]:
import pandas as pd 

# The listings scraped from Amazon with scrapy were stored in a plain JSON text file.
# Read that file into a pandas dataframe:
amazon_data = pd.read_json(
    path_or_buf='/home/adelo/1-system/1-disco_local/1-mis_archivos/1-pe/1-ciencia/1-computer_science_an_IT/2-data_science/1-Amazon_Laptops_Dashboard/amazon_data.json'
)

# my_amazon_data[['ASIN','price','average_customer_reviews','number_reviews','number_ratings','tech_details','reviews']]
amazon_data

## Formatting 

In [None]:
# The following function takes a numeric string (<class 'str'>), removes any comma or dollar characters ("," "$") and
# returns a numeric float value (<class 'float'>):
def format_cleaner(val):
    """Convert a formatted numeric string (e.g. '$1,234.50') into a float.

    Every comma and dollar sign is removed before the conversion.
    """
    cleaned = val
    for symbol in (',', '$'):
        cleaned = cleaned.replace(symbol, '')
    return float(cleaned)

# After loading the data from the json file, every «review» entry is a dictionary composed of
# several fields: customer name, rating, date, title, and the text of the review itself.
# Here we extract the relevant details and create one new column per detail, each holding one
# value per product: «number_of_reviews», «review_title», «review_text», «review_title_text»,
# «review_one_string» (all titles and texts joined into a single string) and «review_rating».
number_of_reviews = []
review_title      = []
review_text       = []
review_title_text = []
review_one_string = []
review_rating     = []

for reviews in amazon_data['reviews']:
    titles       = [review['title'] for review in reviews]
    texts        = [review['review_text'] for review in reviews]
    titles_texts = [title + ' ' + text for title, text in zip(titles, texts)]
    # The first token of the «rating» string is the numeric value.
    ratings      = [format_cleaner(review['rating'].split()[0]) for review in reviews]

    # Count the reviews directly so that a product without reviews yields 0
    # instead of reusing the loop index left over from the previous product.
    number_of_reviews.append(len(reviews))
    review_title.append(titles)
    review_text.append(texts)
    review_title_text.append(titles_texts)
    review_one_string.append(' '.join(titles_texts).rstrip())
    review_rating.append(ratings)

amazon_data['number_of_reviews'] = number_of_reviews
amazon_data['review_title']      = review_title
amazon_data['review_text']       = review_text
amazon_data['review_title_text'] = review_title_text
amazon_data['review_one_string'] = review_one_string
amazon_data['review_rating']     = review_rating


# Here we make sure that the first character of the brand name is uppercase and 
# remaining characters lowercase. This is important because we are going to perform
# filtering and searching function using the brand name so we need to make sure 
# that the writing is consistent.
# «HP» is an acronym, so it is kept fully uppercase however it was scraped
# (HP, hp, Hp, hP); every other brand name is title-cased.
amazon_data['brand'] = [
    name.upper() if name.upper() == 'HP' else name.title()
    for name in (details['Brand Name'] for details in amazon_data['tech_details'])
]

# After loading the data from the json file, all technical details are in a dictionary type entry.
# In the following block we are extracting the tech details that are important for our analysis («series» and «model_number») and creating new columns for each of these relevant tech details
# Series:
amazon_data['series'] = [details['Series'] for details in amazon_data['tech_details']]
# Model number:
amazon_data['model_number'] = [details['Item model number'] for details in amazon_data['tech_details']]

# After extracting the data from the web page, the numeric values ("average_customer_reviews" and "price") are actually of «string» type, so we need to convert each entry to a numeric type (float). This is necessary because we will perform mathematical operations with these values:

# A raw «average_customer_reviews» entry looks like this: "4.5 out of 5 stars"  (<class 'str'>)
# We only need the first value as a numeric float type: 4.5  (<class 'float'>)
# The first whitespace-separated token ("4.5" in the above example) is passed through
# «format_cleaner()». Missing entries are left untouched, as is done for «price»,
# instead of failing when the value is indexed.
amazon_data['average_customer_reviews'] = [
    format_cleaner(val.split()[0]) if pd.notnull(val) else val
    for val in amazon_data['average_customer_reviews']
]

# A raw «price» entry looks like this: "$689.90"  (<class 'str'>)
# Missing prices are kept as they are; every other entry is cleaned with «format_cleaner()»
# and then rounded to the nearest whole number (689.90 -> 690):
amazon_data['price'] = amazon_data['price'].apply(
    lambda val: val if pd.isnull(val) else round(format_cleaner(val))
)
amazon_data

## Text Pre-processing
This function allows performing the pre-processing directly over the «amazon_data» dataframe. Remember that, in this dataframe, the reviews for a particular series are in a list that is inside the dataframe. The «pre-processing()» function included in «data_analysis2.ipynb» is, on the other hand, designed to perform the pre-processing over the «reviews» dataframe.

* **Removing punctuation:**
 * Punctuation: We will remove all punctuation characters found in the «string» library.
 
* **Removing stopwords:**
 * Our stopwords will be composed of:
  - The common stopwords defined in the nltk library 
  - Some particular stopwords related to our data:
    * Brand names: There is no point in analyzing brand names. For instance, in a Lenovo review, the customer will use the word "Lenovo" many times, but this fact does not contribute anything to the analysis. 
    * Laptop synonyms: laptop, computer, machine, etc.
    * Some unofficial contractions that are not in the nltk library: Im, dont, Ive, etc.

In [None]:
import nltk
import string
nltk.data.path.append('/home/adelo/.nltk/nltk_data')
from nltk.corpus import stopwords

# Building the stopwords list: nltk's English stopwords, the lowercased brand names
# found in the data, and a few extra domain words and apostrophe-less contractions.
stopwords_brands = [brand_name.lower() for brand_name in set(amazon_data['brand'])]
stopwords_brands_additionals = ['computer','computers','laptop','laptops','thing','things','machine','machines','im','dont','ive']
stopwords_total = [
    *stopwords.words('english'),
    *stopwords_brands,
    *stopwords_brands_additionals,
]

# Strips punctuation and stopwords from a single string.
def pre_processing(texto):
    """Return «texto» without punctuation characters or stopwords.

    Punctuation is every character listed in «string.punctuation». A word is
    dropped when its lowercase form appears in «stopwords_total»; the remaining
    words keep their original case and are joined with single spaces.
    """
    # NOTE(review): punctuation is removed before the stopword test, so an nltk
    # stopword spelled with an apostrophe (e.g. "isn't") presumably never matches.
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = texto.translate(punctuation_table).split()
    kept_words = []
    for word in words:
        if word.lower() in stopwords_total:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# List version of «pre_processing()»: cleans every string of a list.
def pre_processing_lista(lista):
    """Return a new list holding «pre_processing(texto)» for each string in «lista»."""
    return list(map(pre_processing, lista))

# The list columns below could be cleaned with «pre_processing_lista()», but they are
# left raw at this stage because the per-review Sentiment Analysis needs the raw text:
# amazon_data['review_title']      = amazon_data['review_title'].apply(pre_processing_lista)
# amazon_data['review_text']       = amazon_data['review_text'].apply(pre_processing_lista)
# amazon_data['review_title_text'] = amazon_data['review_title_text'].apply(pre_processing_lista)
# Only the single joined string of each product is cleaned here.
# NOTE(review): «review_one_string» is scored later on, so those scores are
# computed on this cleaned text, not on raw text — confirm this is intended.
amazon_data['review_one_string'] = amazon_data['review_one_string'].map(pre_processing)

amazon_data

# Lexicon-based Sentiment Analysis
* We are performing a Lexicon-based Sentiment Analysis using two popular Python libraries: **TextBlob** and **Vader Sentiment**.

## A first example using TextBlob and Vader Sentiment

In [None]:
# Sentiment Analysis with TextBlob
# ================================

# https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis

# The polarity score is a float within the range [-1.0, 1.0].
# The subjectivity is a float within the range   [ 0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

from textblob import TextBlob

# Example
TextBlob('I am great but the impotant thing is that I am not undarstanding what I have to do but I am not loving it ok and it is really bad bad bad bad bad bad bad bad bad bad lov').sentiment
# The column created during the formatting step is «review_one_string».
TextBlob(amazon_data['review_one_string'].loc[1]).sentiment
# We can create a TextBlob object. TextBlob only accepts a string, so a list of
# words has to be joined into one string first:
lista = ['hola','como','estas']
analis = TextBlob(' '.join(lista))
analisis = TextBlob("TextBlob sure looks like it has some interesting features")
# TextBlob is not only a sentiment analysis library. Check all the methods a TextBlob object has
# print(dir(analisis))
# For example, we can use it for translation:
# print(analisis.translate(to='es'))
# We can also do Part-of-speech tagging
# nltk.data.path.append('/home/adelo/.nltk/nltk_data')
print(analisis.tags)
# Now let's do our sentiment analysis
print(analisis.sentiment)



# Sentiment Analysis with Vader Sentiment
# =======================================

# https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f
# https://pypi.org/project/vaderSentiment/
# https://github.com/cjhutto/vaderSentiment
# https://www.researchgate.net/profile/Cj_Hutto/publication/275828927_VADER_A_Parsimonious_Rule-based_Model_for_Sentiment_Analysis_of_Social_Media_Text/links/554775be0cf26a7bf4d90840/VADER-A-Parsimonious-Rule-based-Model-for-Sentiment-Analysis-of-Social-Media-Text.pdf

# Polarity_scores:
    # The «Compound score» is a metric that calculates the sum of all the lexicon ratings which have been normalized between:
    # [-1, 1]    -1 (most extreme negative) and +1 (most extreme positive).
    #
    # The «Positive», «Negative» and «Neutral» scores represent the proportion of text that falls in these categories.
    # This means our sentence was rated as 67% Positive, 33% Neutral and 0% Negative. Hence all these should add up to 1.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# Example: score one sentence and read its compound value
frase = 'I love to eat but sometimes it is better not to eat so much, love love love bad'
pola_scores = analyser.polarity_scores(frase)
# display(pola_scores)
pola_scores['compound']

## Performing the Sentiment Analysis over the entire dataframe

In [None]:
from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# Sentiment polarity and subjectivity of the single joined string («review_one_string»)
# that holds all the customer reviews of a particular series.
# Each string is scored once by TextBlob and both fields are read from that result,
# instead of building and scoring a second TextBlob for the subjectivity.
sentiments_one_string = [ TextBlob(review).sentiment for review in amazon_data['review_one_string'] ]
amazon_data['avg_polarity_textblob']     = [ round(sentiment.polarity, 5)     for sentiment in sentiments_one_string ]
amazon_data['avg_subjectivity_textblob'] = [ round(sentiment.subjectivity, 5) for sentiment in sentiments_one_string ]
amazon_data['avg_polarity_vader']        = [ analyser.polarity_scores(review)['compound'] for review in amazon_data['review_one_string'] ]


# «polarity_title_textblob», «subjectivity_title_textblob» and «polarity_title_vader» hold,
# for every product, one list with the score of each customer review title.
polarity_title_textblob = []
subjectivity_title_textblob = []
polarity_title_vader = []
for review_title_list in amazon_data['review_title']:
    # Score every title once with TextBlob and reuse the result for both fields.
    sentiments = [TextBlob(review_title).sentiment for review_title in review_title_list]
    polarity_title_textblob.append([round(sentiment.polarity, 5) for sentiment in sentiments])
    subjectivity_title_textblob.append([round(sentiment.subjectivity, 5) for sentiment in sentiments])
    polarity_title_vader.append([analyser.polarity_scores(review_title)['compound'] for review_title in review_title_list])
amazon_data['polarity_title_textblob'] = polarity_title_textblob
amazon_data['subjectivity_title_textblob'] = subjectivity_title_textblob
amazon_data['polarity_title_vader'] = polarity_title_vader


# «polarity_text_textblob», «subjectivity_text_textblob» and «polarity_text_vader» hold,
# for every product, one list with the score of each customer review text
# (the body of the review, without its title).
polarity_text_textblob = []
subjectivity_text_textblob = []
polarity_text_vader = []
for review_text_list in amazon_data['review_text']:
    # Score every review text once with TextBlob and reuse the result for both fields.
    sentiments = [TextBlob(review_text).sentiment for review_text in review_text_list]
    polarity_text_textblob.append([round(sentiment.polarity, 5) for sentiment in sentiments])
    subjectivity_text_textblob.append([round(sentiment.subjectivity, 5) for sentiment in sentiments])
    polarity_text_vader.append([analyser.polarity_scores(review_text)['compound'] for review_text in review_text_list])
amazon_data['polarity_text_textblob'] = polarity_text_textblob
amazon_data['subjectivity_text_textblob'] = subjectivity_text_textblob
amazon_data['polarity_text_vader'] = polarity_text_vader


# «polarity_title_text_textblob», «subjectivity_title_text_textblob» and
# «polarity_title_text_vader» hold, for every product, one list with the score of each
# customer review taken as title + text. «length_title_text» holds the length in
# characters of each of those strings.
polarity_title_text_textblob = []
subjectivity_title_text_textblob = []
polarity_title_text_vader = []
length_title_text = []
for review_title_text_list in amazon_data['review_title_text']:
    # Score every title + text string once with TextBlob and reuse the result for both fields.
    sentiments = [TextBlob(review_title_text).sentiment for review_title_text in review_title_text_list]
    polarity_title_text_textblob.append([round(sentiment.polarity, 5) for sentiment in sentiments])
    subjectivity_title_text_textblob.append([round(sentiment.subjectivity, 5) for sentiment in sentiments])
    polarity_title_text_vader.append([analyser.polarity_scores(review_title_text)['compound'] for review_title_text in review_title_text_list])
    length_title_text.append([len(review_title_text) for review_title_text in review_title_text_list])

amazon_data['polarity_title_text_textblob'] = polarity_title_text_textblob
amazon_data['subjectivity_title_text_textblob'] = subjectivity_title_text_textblob
amazon_data['polarity_title_text_vader'] = polarity_title_text_vader
amazon_data['length_title_text'] = length_title_text
amazon_data

# Emotion analysis using the NRC Lexicon

In [None]:
import numpy as np
import pandas as pd

# Reading the word-level NRC Emotion Lexicon: a tab-separated file that is loaded
# with the column names (word, emotion, association).
filepath = ('NRC-Sentiment-Emotion-Lexicons/'
            'NRC-Emotion-Lexicon-v0.92/'
            'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
lexiEmo_df0 = pd.read_csv(filepath, sep='\t', names=["word", "emotion", "association"])

# Reshaping to one row per word and one column per emotion; «word» goes back
# to being a regular column afterwards.
lexiEmo_df = lexiEmo_df0.pivot(index='word', columns='emotion', values='association')
lexiEmo_df = lexiEmo_df.reset_index()


# This function returns a list that contains the emotions count in a text
def getEmotionsDf(text, lexiEmo_df):
    """Count the lexicon emotions associated with the words of «text».

    Parameters
    ----------
    text : str
        Text to analyse. It is split on whitespace and every token is looked
        up exactly as written (the match is case-sensitive).
    lexiEmo_df : pandas.DataFrame
        Lexicon with a «word» column plus one numeric column per emotion.

    Returns
    -------
    list
        One total per emotion column, in the column order of «lexiEmo_df»
        (the «word» column excluded). Every occurrence of a word found in
        the lexicon adds that word's row to the totals.
    """
    # Index the lexicon by word once per call, so that each token is a hash
    # lookup instead of a rebuild and scan of the whole word list. When a word
    # appears more than once in the lexicon, its first row is the one used.
    lexicon = lexiEmo_df.drop_duplicates(subset='word').set_index('word')

    # One counter per emotion column, whatever their number.
    counterEmo = pd.Series(0, index=lexicon.columns)

    for palabra in text.split():
        if palabra in lexicon.index:
            counterEmo = counterEmo + lexicon.loc[palabra]

    # If we want returning a dataframe:
    # counterEmo_df = pd.DataFrame(counterEmo)
    # counterEmo_df = counterEmo_df.rename(columns={0:'count'})
    # counterEmo_df = counterEmo_df.reset_index()
    # counterEmo_df = counterEmo_df.sort_values(by=['count'],ascending=True)

    # return counterEmo_df
    return counterEmo.tolist()

## Performing the Emotion analysis over the entire data

In [None]:
# For every product, one emotion-count list per review (title + text).
emotions = [
    [getEmotionsDf(review_title_text, lexiEmo_df) for review_title_text in review_title_text_list]
    for review_title_text_list in amazon_data['review_title_text']
]

amazon_data['emotions'] = emotions
amazon_data

# Saving the data to disk

In [None]:
# Keeping only the columns that are relevant for our analysis, in order of importance:
cols = [
    'brand', 'series', 'model_number', 'price', 'number_of_reviews',
    'review_title', 'review_text', 'review_title_text', 'review_one_string',
    'average_customer_reviews', 'review_rating',
    'avg_polarity_textblob', 'avg_subjectivity_textblob', 'avg_polarity_vader',
    'polarity_title_textblob', 'subjectivity_title_textblob', 'polarity_title_vader',
    'polarity_text_textblob', 'subjectivity_text_textblob', 'polarity_text_vader',
    'polarity_title_text_textblob', 'subjectivity_title_text_textblob', 'polarity_title_text_vader',
    'emotions',
    'length_title_text',
]

amazon_data = amazon_data.loc[:, cols]
# Total number of reviews across all products, followed by the trimmed dataframe.
display(amazon_data['number_of_reviews'].sum())
display(amazon_data)

In [None]:
# Writing the processed dataframe as JSON into the working directory.
amazon_data.to_json(path_or_buf=r'./data.json')

# Loading the prepared and processed data

In [None]:
import pandas as pd

# Reload the processed dataframe from the file written by the save step («./data.json»).
# The raw scraped file is named «amazon_data.json» and lacks the columns created
# during the processing (e.g. «number_of_reviews»), which the cells below rely on.
amazon_data = pd.read_json('./data.json', precise_float=True)
amazon_data

# Preparing a reviews dataset

In [None]:
# https://stackoverflow.com/questions/38895856/python-pandas-how-to-compile-all-lists-in-a-column-into-one-unique-list

# Flattens a Series of lists into one plain list
def concaLists(serie_entry):
    """Concatenate, in order, every list stored in the Series «serie_entry»."""
    flattened = []
    for lista in serie_entry.tolist():
        flattened.extend(lista)
    return flattened

# Repeat each product-level value (brand, series, price) once per review of that
# product, so that the three lists line up with the flattened per-review lists.
brand, series, price = [], [], []
for row in range(amazon_data.shape[0]):
    repeats = amazon_data['number_of_reviews'].iloc[row]
    brand.extend([amazon_data['brand'].iloc[row]] * repeats)
    series.extend([amazon_data['series'].iloc[row]] * repeats)
    price.extend([amazon_data['price'].iloc[row]] * repeats)

# This function returns a dataset with only the review details and their sentiment analysis
def createSentimentDf(data):
    """Build a dataframe with one row per customer review.

    Parameters
    ----------
    data : pandas.DataFrame
        Product-level dataframe whose review columns hold one list per product
        (one element per review), plus the scalar columns «brand», «series»
        and «price».

    Returns
    -------
    pandas.DataFrame
        The per-review lists flattened with «concaLists()», followed by the
        «brand», «series» and «price» of the product each review belongs to.
    """
    # Output column name -> product-level column holding one list per product.
    review_columns = {
        'title'                            : 'review_title',
        'text'                             : 'review_text',
        'title_text'                       : 'review_title_text',
        'rating'                           : 'review_rating',
        'polarity_title_textblob'          : 'polarity_title_textblob',
        'subjectivity_title_textblob'      : 'subjectivity_title_textblob',
        'polarity_title_vader'             : 'polarity_title_vader',
        'polarity_text_textblob'           : 'polarity_text_textblob',
        'subjectivity_text_textblob'       : 'subjectivity_text_textblob',
        'polarity_text_vader'              : 'polarity_text_vader',
        'polarity_title_text_textblob'     : 'polarity_title_text_textblob',
        'subjectivity_title_text_textblob' : 'subjectivity_title_text_textblob',
        'polarity_title_text_vader'        : 'polarity_title_text_vader',
        'emotions'                         : 'emotions',
        'length_title_text'                : 'length_title_text',
    }
    columns = {name: concaLists(data[source]) for name, source in review_columns.items()}

    # The product-level values are derived from «data» itself (not from module-level
    # lists), so the function also works on a filtered or reordered dataframe.
    # Each value is repeated once per review of its row, using the actual length
    # of the row's review list so that it always matches the flattened columns.
    reviews_per_row = [len(titles) for titles in data['review_title']]
    for column in ('brand', 'series', 'price'):
        columns[column] = [value
                           for value, n_reviews in zip(data[column], reviews_per_row)
                           for _ in range(n_reviews)]

    return pd.DataFrame(columns)

# Build the per-review dataset from the processed product-level dataframe and show it.
my_reviews = createSentimentDf(amazon_data)
my_reviews

# Testing

In [None]:
# Mean price and mean customer rating per brand. Only the column of interest is
# described, instead of describing every column of the dataframe twice.
display(amazon_data.groupby('brand')['price'].describe()['mean'])
display(amazon_data.groupby('brand')['average_customer_reviews'].describe()['mean'])
# Spot check of two particular series.
amazon_data.loc[amazon_data['series'].isin(["Aspire E series", "L340 Gaming"]),
                ['brand', 'series', 'price', 'average_customer_reviews']]

In [None]:
# Price and average rating of every Asus product.
amazon_data.loc[amazon_data['brand'] == 'Asus', ['brand', 'price', 'average_customer_reviews']]

In [None]:
# Count how many times each word of interest appears in the «Flex 14» reviews.
reviews_flex14 = ' '.join(amazon_data.loc[amazon_data['series'] == 'Flex 14', 'review_one_string'])
theWords = ['use', 'good', 'screen']
flex14_tokens = reviews_flex14.split()
for theWord in theWords:
    count = flex14_tokens.count(theWord)
    print(count)

In [23]:
# Acer reviews rated above 3 (positive) and below 3 (negative); a rating of exactly 3 is in neither group.
acer_reviews = my_reviews[my_reviews['brand'] == 'Acer']
acer_pos_reviews = acer_reviews[acer_reviews['rating'] > 3]
acer_neg_reviews = acer_reviews[acer_reviews['rating'] < 3]
print(len(acer_pos_reviews), len(acer_neg_reviews))

992 260


In [25]:
# Summary statistics of the review length (title + text) for the positive Acer reviews.
acer_pos_reviews['length_title_text'].describe()

count      992.000000
mean       925.718750
std       1121.461209
min         14.000000
25%        285.000000
50%        627.500000
75%       1141.000000
max      10490.000000
Name: length_title_text, dtype: float64