In [1]:
# for Importing the Dataset
import pandas as pd

In [2]:
# lets read the dataset
data = pd.read_csv('amazon_alexa.tsv', delimiter = '\t')

# lets check the shape of the dataset
data.shape

(3150, 5)

In [3]:
# lets check the head of the dataset
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
# lets check if the dataset has any Missing Values
data.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

In [5]:
# lets check the Descriptive Summary of the Dataset
data.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [6]:
# lets check the summary of Date, Variation and Reviews
data.describe(include = 'object')

Unnamed: 0,date,variation,verified_reviews
count,3150,3150,3150.0
unique,77,16,2301.0
top,30-Jul-18,Black Dot,
freq,1603,516,79.0


In [7]:
# lets check the Value Counts for Variation 
# data['variation'].value_counts()bb

In [8]:
# Calculate the length of review
data['length'] = data['verified_reviews'].apply(len)
# data['length']bb

In [9]:
from textblob import TextBlob
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bidhya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Calculate the Polarity of the review
def get_polarity(text):
    textblob = TextBlob(str(text.encode('utf-8')))
    pol = textblob.sentiment.polarity
    return pol

In [11]:
# Let's apply the function
data['polarity'] = data['verified_reviews'].apply(get_polarity)
data['polarity']

0       0.625000
1       0.875000
2      -0.100000
3       0.350000
4       0.000000
          ...   
3145    1.000000
3146    0.333333
3147    0.237662
3148    0.316667
3149    0.700000
Name: polarity, Length: 3150, dtype: float64

In [12]:
# Lets calculate the subjectivity of the reviews
def get_subjectivity(text):
    textblob = TextBlob(str(text.encode('utf-8')))
    subj = textblob.sentiment.subjectivity
    return subj

data['subjectivity'] = data['verified_reviews'].apply(get_subjectivity)

In [13]:
data[['length','polarity','subjectivity']].describe()

Unnamed: 0,length,polarity,subjectivity
count,3150.0,3150.0,3150.0
mean,132.049524,0.349792,0.528922
std,182.099952,0.303362,0.256324
min,1.0,-1.0,0.0
25%,30.0,0.123852,0.419196
50%,74.0,0.35,0.585
75%,165.0,0.533333,0.695486
max,2851.0,1.0,1.0


In [14]:
# Calculating char-count
data['char-count'] = data['verified_reviews'].apply(len)

In [15]:
# Calculating word count
data['word-count'] = data['verified_reviews'].apply(lambda x: len(x.split()))

In [16]:
# Calculating word density
data['word-density'] = data['char-count'] / (data['word-count']+1)
data['word-density']


0       3.250000
1       3.000000
2       5.000000
3       4.914286
4       2.500000
          ...   
3145    5.555556
3146    5.625000
3147    5.250000
3148    4.935065
3149    2.000000
Name: word-density, Length: 3150, dtype: float64

In [17]:
# importing the list of punctuation 
import string
punctuation = string.punctuation

# calculating punctuation count
data['punctuation_count'] = data['verified_reviews'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation)))

In [18]:
# Summarize the newly created feature
data[['char-count','word-count','word-density','punctuation_count']].describe()

Unnamed: 0,char-count,word-count,word-density,punctuation_count
count,3150.0,3150.0,3150.0,3150.0
mean,132.049524,25.293016,4.605345,3.887937
std,182.099952,34.584971,1.134737,5.762348
min,1.0,0.0,0.5,0.0
25%,30.0,6.0,4.269231,1.0
50%,74.0,14.0,4.805665,2.0
75%,165.0,32.0,5.207974,5.0
max,2851.0,526.0,32.5,121.0


In [19]:
# Lets create a part of speech dictionary
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron': ['PRP','PRP$','WP','WP$'],
    'verb': ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj': ['JJ','JJR','JJS'],
    'adv': ['RB','RBR','RBS','WRB']
}

In [20]:
# function to check and get the part of speech tag count of a words in a given sentence\
def pos_check(x,flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_dic[flag]:
                cnt += 1
                
    except:
        pass
    return cnt

In [24]:
# Lets calculate the count of Nouns in text
data['noun_count'] = data['verified_reviews'].apply(lambda x: pos_check(x,'noun'))
data['noun_count']

0        1
1        0
2        5
3       11
4        0
        ..
3145     4
3146     5
3147    22
3148    16
3149     0
Name: noun_count, Length: 3150, dtype: int64

In [22]:
# Lets calculate the count of Nouns in text
data['verb_count'] = data['verified_reviews'].apply(lambda x: pos_check(x,'verb'))

In [25]:
data[['noun_count','verb_count']].describe()

Unnamed: 0,noun_count,verb_count
count,3150.0,3150.0
mean,5.945397,5.155873
std,8.222776,7.223565
min,0.0,0.0
25%,1.0,1.0
50%,3.0,3.0
75%,7.0,7.0
max,137.0,102.0
