BeautifulSoup: Beautiful Soup is a Python package for parsing HTML and XML documents. It builds a parse tree for each document, which makes it easy to extract data.

The requests module allows you to send HTTP requests using Python. Each request returns a Response object containing all the response data (content, encoding, status, etc.).

In [None]:
#https://medium.com/voice-tech-podcast/web-scraping-using-python-a89fc1609736


# Preview the paginated review URLs: the page number is inserted both into
# the "ref" path segment and into the pageNumber query parameter.
for i in range(1, 1000):
    L = (
        'https://www.amazon.in/Samsung-Storage-Processor-Purchased-Separately'
        '/product-reviews/B09XJ5LD6L/pageNumber=3/ref='
        'cm_cr_getr_d_paging_btm_next_{0}?pageNumber={0}'
    ).format(i)
    print(L)

In [None]:
from bs4 import BeautifulSoup as bs
import requests
# Text of every review body found on the scraped pages
review_content = []

for i in range(1, 100):
    # Download the HTML of review page i. The timeout makes a stalled
    # connection raise instead of blocking the loop indefinitely.
    page = requests.get(
        'https://www.amazon.in/Oppo-Dynamic-Additional-Exchange-CPH2179/product-reviews/B08LRCMWKD/pageNumber=3/ref='
        + 'cm_cr_getr_d_paging_btm_next_' + str(i) + '?pageNumber=' + str(i),
        timeout=30,
    )
    # Parse the response body with Python's built-in 'html.parser'
    soup = bs(page.content, 'html.parser')
    # Select every <span> whose data-hook attribute is "review-body"
    review = soup.find_all("span", {"data-hook": "review-body"})
    # Iterate over the tags directly so the page counter `i` is not rebound
    # by an inner index loop.
    review_content.extend(tag.get_text() for tag in review)


In [None]:
# Number of entries currently held in review_content
len(review_content)

In [None]:
# Remove leading newline characters from every review. str.lstrip('\n')
# returns a copy without the leading '\n' characters, and the slice
# assignment writes the results back into the existing list object.
review_content[:] = [text.lstrip('\n') for text in review_content]
review_content

In [None]:
# Remove trailing newline characters from every review. str.rstrip('\n')
# returns a copy without the trailing '\n' characters, and the slice
# assignment writes the results back into the existing list object.
review_content[:] = [text.rstrip('\n') for text in review_content]
review_content

In [None]:
import pandas as pd
# Single-column table with one review text per row
df = pd.DataFrame({'Reviews': review_content})
df

#                             Text Cleaning

In [None]:
# Create a function to clean the Reviews
import re

def cleanTxt(text):
    """Strip social-media markup from a piece of text.

    Removes @mentions, '#' characters, standalone "RT" markers and
    http/https URLs, in that order.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace surrounding a removed item is kept.
    """
    # '+' consumes the whole handle; without it only the first character
    # after '@' was removed.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Drop the '#' symbol but keep the tag word itself.
    text = re.sub(r'#', '', text)
    # \b restricts the match to a standalone "RT" so that words merely
    # ending in those letters (e.g. "SMART phone") stay intact.
    text = re.sub(r'\bRT\s+', '', text)
    # \S+ matches the rest of the URL; the earlier pattern used a literal
    # 'S' and therefore left ordinary links in place.
    text = re.sub(r'https?://\S+', '', text)
    return text
    

In [None]:
# Apply cleanTxt to every stored review and show the cleaned column
df['Reviews']=df['Reviews'].apply(cleanTxt)
df['Reviews']

In [None]:
 # loading in all the essentials for data manipulation

import pandas as pd
import numpy as np
# Load the NLTK stopwords to remove articles, prepositions and other words that are not actionable
from nltk.corpus import stopwords
# This allows us to create individual tokens from a bag of words
from nltk.tokenize import word_tokenize
# Lemmatizer helps to reduce words to the base form
from nltk.stem import WordNetLemmatizer
# ngrams lets us group words into common pairs, trigrams, etc.
from nltk import ngrams
# We can use counter to count the objects
from collections import Counter
# This is our visual library
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import string
# Join the review texts into one string. Series.to_string() renders a
# printable table, so it would mix the row index labels (0, 1, 2, ...) and
# column padding into the text that is tokenized and counted afterwards.
sentence = ' '.join(df['Reviews'].astype(str))
sentence

In [None]:
import nltk
# Fetch the NLTK resources this notebook uses: 'punkt' for word_tokenize,
# 'wordnet' for WordNetLemmatizer and 'stopwords' for
# stopwords.words('english'), which raises LookupError when the corpus
# has not been downloaded.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Tokenize the text, lowercase the tokens, drop English stopwords and
# lemmatize what remains.
new_tokens = word_tokenize(sentence)
new_tokens = [t.lower() for t in new_tokens]
# Build the stopword set once: evaluating stopwords.words('english') inside
# the comprehension repeated that call for every token and searched the
# resulting list linearly.
english_stopwords = set(stopwords.words('english'))
new_tokens = [t for t in new_tokens if t not in english_stopwords]
lemmatizer = WordNetLemmatizer()
new_tokens = [lemmatizer.lemmatize(t) for t in new_tokens]

# Count single words, word pairs and trigrams
counted = Counter(new_tokens)
counted_2 = Counter(ngrams(new_tokens, 2))
counted_3 = Counter(ngrams(new_tokens, 3))
counted_3

In [None]:

# Turn each counter into a two-column table, ordered from the most to the
# least frequent entry.
word_freq = pd.DataFrame(counted.items(), columns=['word', 'frequency'])
word_freq = word_freq.sort_values(by='frequency', ascending=False)

word_pairs = pd.DataFrame(counted_2.items(), columns=['pairs', 'frequency'])
word_pairs = word_pairs.sort_values(by='frequency', ascending=False)

trigrams = pd.DataFrame(counted_3.items(), columns=['trigrams', 'frequency'])
trigrams = trigrams.sort_values(by='frequency', ascending=False)


In [None]:
# Create WordCloud
! pip install wordcloud
from wordcloud import WordCloud

# Build the cloud from the review text (max_words=70, yellow background)
wordcloud = WordCloud(background_color='yellow', max_words=70).generate(sentence)

# Show the rendered cloud as an image without axes
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# create subplot of the different data frames and draw graphs

# Three stacked panels: top 30 words, top 30 pairs, top 30 trigrams
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 20))
word_ax, pair_ax, trigram_ax = axes
sns.barplot(data=word_freq.head(30), x='frequency', y='word', ax=word_ax)
sns.barplot(data=word_pairs.head(30), x='frequency', y='pairs', ax=pair_ax)
sns.barplot(data=trigrams.head(30), x='frequency', y='trigrams', ax=trigram_ax)

# Sentiment Analysis

In [None]:
!pip install TextBlob
from textblob import TextBlob

# Example sentence for trying out TextBlob's sentiment scores.
# NOTE: this rebinds `sentence`, which earlier held the joined review text.
sentence = 'I am  happy with product, Please throw it'
demo_blob = TextBlob(sentence)

# Polarity ranges from -1 (negative) through 0 (neutral) to +1 (positive)
analysisPol = demo_blob.polarity
# Subjectivity ranges from 0 (objective) to 1 (subjective)
analysisSub = demo_blob.subjectivity

print(analysisPol)
print(analysisSub)

In [None]:
# Create a function to calculate the subjectivity and polarity

def getSubjectivity(text):
    """Return the TextBlob subjectivity score of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    """Return the TextBlob polarity score of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
  
 

In [None]:
# Create two new columns, 'TextBlob_Subjectivity' and 'TextBlob_Polarity',
# by applying getSubjectivity and getPolarity to every review

df['TextBlob_Subjectivity']=df['Reviews'].apply(getSubjectivity)
df['TextBlob_Polarity']=df['Reviews'].apply(getPolarity)
df

In [None]:
# Create a function to decide the negative and positive sentiment based on polarity score

def getAnalysis(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score equal
    to zero and 'Positive' for every other value.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

    

In [None]:
# Label every review by passing its polarity score through getAnalysis
df ['Sentiment'] = df['TextBlob_Polarity'].apply(getAnalysis )

df

In [None]:
# Number of reviews per sentiment label
df['Sentiment'] .value_counts()

In [None]:
# Bar plot of the number of reviews per sentiment label, followed by the
# share of each label in the total
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts.plot(kind='bar')
sentiment_counts / sentiment_counts.sum()