In [None]:
import pandas as pd 

# The product data scraped from Amazon with scrapy was saved to a plain JSON text file.
# Load that file into a pandas dataframe:
amazon_data = pd.read_json('/home/adelo/amazon_data.json')

# Preview a subset of the columns of the freshly loaded data:
amazon_data.loc[:, [
    'ASIN',
    'price',
    'average_customer_reviews',
    'number_reviews',
    'number_ratings',
    'tech_details',
    'reviews',
]]

In [None]:
import pandas as pd


# The data extracted from Amazon with scrapy was stored in a simple JSON text file.
# Load that file into a pandas dataframe. Note that the path is an absolute local path,
# so it has to be adjusted when the notebook is run from a different location.
amazon_data = pd.read_json('/home/adelo/amazon_data.json')


# After loading the data from the json file, every «reviews» entry is a list holding one
# dictionary per customer review. Each dictionary is composed of several fields (customer name,
# rating, date, title, and the text of the review itself); only 'title' and 'review_text' are
# read here.
# We extract those two fields and create 3 new columns to facilitate the handling of the
# reviews: «reviews_title», «reviews_text» and «reviews_one_string».
reviews_title = []
reviews_text  = []
reviews_one_string = []
# Iterate over the column values themselves instead of looking every row up through a
# 0..n-1 counter: the counter-based lookup is label-based and therefore only correct when
# the dataframe carries the default integer index.
for item_reviews in amazon_data['reviews']:
    reviews_title_per_item = [ review['title'] for review in item_reviews ]
    reviews_text_per_item  = [ review['review_text'] for review in item_reviews ]
    reviews_title.append(reviews_title_per_item)
    reviews_text.append(reviews_text_per_item)
    # "title text title text ..." for the whole item, without trailing whitespace
    # (an item without reviews yields an empty string):
    reviews_one_string.append(
        ' '.join( title+' '+text for title, text in zip(reviews_title_per_item, reviews_text_per_item) ).rstrip()
    )

# «reviews_title»: for every item, the list of all its review titles:
amazon_data['reviews_title'] = reviews_title
# «reviews_text»: for every item, the list of all its review comments (without the title):
amazon_data['reviews_text']  = reviews_text
# «reviews_one_string»: for every item, ONE string made of all its review titles and
# comments concatenated:
amazon_data['reviews_one_string']  = reviews_one_string


# Here we make sure that the first character of every word of the brand name is uppercase
# and the remaining characters lowercase (str.title()). This is important because we are going
# to filter and search by brand name, so the spelling has to be consistent.
# The only exception is HP, which is always written fully in uppercase. The check is
# case-insensitive so that every spelling ('HP', 'hp', 'Hp', 'hP') ends up as 'HP'.
# The column values are iterated directly, so the result does not depend on the dataframe
# having the default 0..n-1 index.
amazon_data['brand'] = [
    brand_name.upper() if brand_name.lower() == 'hp' else brand_name.title()
    for brand_name in ( details['Brand Name'] for details in amazon_data['tech_details'] )
]


# After loading the data from the json file, all technical details of an item are stored in a
# single dictionary («tech_details»). Here we pull out the details that matter for our analysis
# and give each one its own column. The column values are iterated directly (rather than being
# looked up through a 0..n-1 counter) so the result does not depend on the dataframe index.
# A missing key still raises KeyError, exactly as before.
# Series:
amazon_data['series'] = [ details['Series']  for  details  in  amazon_data['tech_details'] ]
# Model number:
amazon_data['model_number'] = [ details['Item model number']  for  details  in  amazon_data['tech_details'] ]


# After extracting the data from the web page, the numeric values ("average_customer_reviews" and "price") are actually of «string» type, so we need to convert them to a numeric type (float). This is necessary because we will perform mathematical operations with these values:

def format_cleaner(entry):
    """Turn a numeric string such as "$1,689.90" into a float.

    Every comma and dollar sign is dropped from «entry» (<class 'str'>) and the
    remaining text is handed to float(), which raises ValueError when that text
    is not a valid number.
    """
    without_symbols = entry.translate(str.maketrans('', '', ',$'))
    return float(without_symbols)

# A raw «average_customer_reviews» entry looks like this: "4.5 out of 5 stars"  (<class 'str'>)
# We only need the first value as a numeric float type: 4.5  (<class 'float'>)
# The column is split on whitespace, the first token ("4.5" in the above example) is selected
# and «format_cleaner()» is applied to it.
# Missing entries are left as they are (the same treatment the «price» column gets below)
# instead of raising a TypeError when trying to index a missing value.
amazon_data['average_customer_reviews'] = (
    amazon_data['average_customer_reviews']
    .str.split()
    .str[0]
    .apply(lambda val: format_cleaner(val) if pd.notnull(val) else val)
)

# A raw «price» entry is text such as "$689.90"  (<class 'str'>).
# Missing prices are kept untouched; every other entry is parsed with «format_cleaner()» and the
# result is passed through round() without a number of digits, i.e. it is rounded to an integer.
# NOTE(review): the original description of this step announced 689.90 as the result, but
# round() drops the decimals (689.90 becomes 690) — confirm which of the two is intended.
amazon_data['price'] = amazon_data['price'].map(
    lambda raw_price: raw_price if pd.isnull(raw_price) else round(format_cleaner(raw_price))
)




In [None]:
# Keeping only the columns that are most relevant for our analysis, listed in order of importance:
cols = [
    'brand',
    'series',
    'model_number',
    'price',
    'average_customer_reviews',
    'number_reviews',
    'number_ratings',
    'reviews_title',
    'reviews_text',
    'reviews_one_string',
    'url',
]
amazon_data = amazon_data.loc[:, cols]

In [None]:
import nltk
import string
nltk.data.path.append('/home/adelo/.nltk/nltk_data')
from nltk.corpus import stopwords

# Removing punctuation and stopwords:
# * Punctuation: We will remove all punctuation characters listed in the «string» library (string.punctuation).
# * Our stopwords will be composed of:
#   - The common English stopwords defined in the nltk library
#   - Some particular stopwords related to our data:
#     * Brand names: There is no point in analyzing brand names. For instance, in a Lenovo review, the customer will use the word "Lenovo" many times, but this fact does not contribute anything to the analysis.
#     * Laptop synonyms: laptop, computer, machine, etc.
#     * Some unofficial contractions that are not in the nltk library: im, dont, ive, etc.

# Defining our stopwords list:
import nltk
import string
nltk.data.path.append('/home/adelo/.nltk/nltk_data')
from nltk.corpus import stopwords

# Every distinct brand found in the data, lower-cased:
stopwords_brands = [ brand_name.lower() for brand_name in set(amazon_data['brand']) ]
# Laptop synonyms, generic nouns and contractions written without apostrophe:
stopwords_brands_additionals = [
    'computer', 'computers',
    'laptop', 'laptops',
    'thing', 'things',
    'machine', 'machines',
    'im', 'dont', 'ive',
]
# Full stopword list: nltk's English stopwords followed by the two lists above.
stopwords_total  = [ *stopwords.words('english'), *stopwords_brands, *stopwords_brands_additionals ]

def pre_processing(texto, stop_words=None):
    """Return «texto» without punctuation characters and without stopwords.

    Parameters
    ----------
    texto : str
        The text to clean.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the module-level «stopwords_total»
        list is used, so existing calls «pre_processing(texto)» keep working.

    Returns
    -------
    str
        The remaining words, in their original casing, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords_total
    # A set gives constant-time membership tests instead of scanning a list for every word.
    stop_word_set = set(stop_words)
    remove_punctuation = str.maketrans('', '', string.punctuation)

    kept_words = []
    for token in texto.split():
        # Stopwords that contain an apostrophe ("don't", "you're") can only be recognised
        # while the apostrophe is still there, so the token is first compared with only its
        # surrounding punctuation trimmed.
        if token.strip(string.punctuation).lower() in stop_word_set:
            continue
        # Removing punctuation:
        word = token.translate(remove_punctuation)
        # Removing stopwords (and tokens that consisted of punctuation only):
        if word and word.lower() not in stop_word_set:
            kept_words.append(word)
    return ' '.join(kept_words)

# Example of applying the function «pre_processing()»:
frase = "Here! A sentence: It has $SOME punctuation and stopwords..."
frase_clean = pre_processing(frase)
# A bare expression is only echoed by the notebook when it is the last statement of a cell.
# This one is followed by more code, so it is printed explicitly to actually show the result.
print(frase_clean)

# Clean every entry of the «reviews_one_string» column with «pre_processing()» and store the
# result back in the same column:
amazon_data['reviews_one_string'] = amazon_data['reviews_one_string'].map(pre_processing)

In [None]:
# Render the dataframe as it stands after the previous cells
# (display() is made available by IPython/Jupyter without an import):
display(amazon_data)

In [None]:
# Save the dataframe as JSON to './data.json' (relative to the current working directory);
# the next cell reads this file back:
amazon_data.to_json(r'./data.json')

In [None]:
import pandas as pd

# Read the JSON file './data.json' into a new dataframe called «data»:
data = pd.read_json('./data.json')
# data

In [None]:
from textblob import TextBlob
# Sentiment of a hand-written sample sentence. The sentence contains misspelled words
# (presumably on purpose — confirm), so its text is kept exactly as it was.
# The result is printed explicitly: as a bare expression in the middle of the cell it
# would be computed and then silently discarded.
print(TextBlob('I am great but the impotant thing is that I am not undarstanding what I have to do but I am not loving it ok and it is really bad bad bad bad bad bad bad bad bad bad lov').sentiment)

# Sentiment of the cleaned review text of the row with index label 1
# (last expression of the cell, so the notebook echoes it):
TextBlob(data['reviews_one_string'].loc[1]).sentiment

In [None]:
# Show the dataframe that was reloaded from './data.json':
data