### This is a test exercise to practice and evaluate capabilities of Spacy and Sentiment Analysis

#### Load Libraries

In [9]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

In [10]:
nlp=spacy.load("en_core_web_sm")

In [11]:
# Sample dummy text
sample_text = "Microsoft is better than Apple.It's perfect weather in Dallas today. However, it has been a really long and hot summer 2022. I wish we have milder winter now. Apple is delicious."

In [12]:
doc = nlp(sample_text)

In [13]:
# Breakdown into individual sentences.
for sentence in doc.sents:
    print(sentence)

Microsoft is better than Apple.
It's perfect weather in Dallas today.
However, it has been a really long and hot summer 2022.
I wish we have milder winter now.
Apple is delicious.


In [14]:
# Breaksown sentences to words
for sentence in doc.sents:
    for word in sentence:
        print(word)

Microsoft
is
better
than
Apple
.
It
's
perfect
weather
in
Dallas
today
.
However
,
it
has
been
a
really
long
and
hot
summer
2022
.
I
wish
we
have
milder
winter
now
.
Apple
is
delicious
.


#### Identify Stop Words

#### Spacy comes with a list of stop words. Its important to learn what those are as blindly removing the stop words can actually lead to meaningless results.

In [15]:
sw=list(STOP_WORDS)

In [16]:
print(sw)

['some', '‘re', 'may', 'only', 'keep', 'this', 'must', 'more', 'becoming', 'him', 'even', 'unless', 'his', 'not', 'how', 'everyone', 'or', 'become', '‘ll', 'nevertheless', 'take', 'up', '‘s', 'above', 'he', 'back', 'few', 'does', 'four', 'whether', 'quite', 'former', 'seems', 'beside', '’ll', 'however', 'now', 'they', 'three', 'moreover', 'in', 'hereafter', 'than', 'indeed', 'put', 'becomes', 'regarding', 'hereupon', 'me', 'such', 'us', 'whereby', 'here', 'per', 'is', 'wherein', 'has', 'six', 'call', 'get', 'anywhere', 'eleven', 'your', 'but', 'was', 'thereby', 'whereafter', 'down', 'noone', 'had', 'i', 'all', 'seem', '’d', 'you', 'do', 'everything', 'each', 'during', 'ourselves', 'neither', 'meanwhile', '’s', 'throughout', 'except', 'somehow', 'any', 'latterly', 'say', 'as', 'no', 'yours', 'their', 'her', '’m', 'until', 'so', 'herein', 'them', 'namely', 'using', 'therein', 'nothing', 'often', 'whereas', 'around', 'most', 'elsewhere', 'nobody', 'enough', 'although', 'it', 'via', 'about

In [17]:
len(sw)

326

In [18]:
# Words after removal of stop words
for word in doc:
    if word.is_stop == False:
        print (word)
    

Microsoft
better
Apple
.
perfect
weather
Dallas
today
.
,
long
hot
summer
2022
.
wish
milder
winter
.
Apple
delicious
.


In [19]:
# Identify part of speech of words
for word in doc:
    print(word.text, word.pos_)

Microsoft PROPN
is AUX
better ADJ
than ADP
Apple PROPN
. PUNCT
It PRON
's AUX
perfect ADJ
weather NOUN
in ADP
Dallas PROPN
today NOUN
. PUNCT
However ADV
, PUNCT
it PRON
has AUX
been AUX
a DET
really ADV
long ADJ
and CCONJ
hot ADJ
summer NOUN
2022 NUM
. PUNCT
I PRON
wish VERB
we PRON
have VERB
milder NOUN
winter NOUN
now ADV
. PUNCT
Apple PROPN
is AUX
delicious ADJ
. PUNCT


#### Spacy has great capabilities to show the dependency and lineage of words in a sentence.

In [21]:

displacy.render(doc, style = 'dep')

#### Named Entity Recognition

In [22]:
displacy.render(doc, style = 'ent')

#### Doesn't do a great job in identifying Apple as Organization vs Fruit. In both sentences it assumes it as Org.

## Sentiment Analysis

### Approach # 1

In [23]:
# Load Libraries
import os
import random
import spacy
from spacy.util import minibatch, compounding
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.base import TransformerMixin 
from sklearn.svm import LinearSVC 
from sklearn.pipeline import Pipeline 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# Load popular review dataset from YELP, IMDB and AMAZON

df_yelp = pd.read_csv('datasets/yelp_labelled.txt', sep='\t', header = None)
df_imdb = pd.read_csv('datasets/imdb_labelled.txt', sep='\t', header = None)
df_amz = pd.read_csv('datasets/amazon_cells_labelled.txt', sep='\t', header = None)

In [25]:
# Assign Column names
columns_name = ['Text', 'Sentiment']
df_yelp.columns = columns_name
df_imdb.columns = columns_name
df_amz.columns = columns_name

In [26]:
df_yelp

Unnamed: 0,Text,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


In [27]:
frames = [df_yelp,df_imdb,df_amz]

In [28]:
keys = ['Yelp','IMDB','Amazon']

In [29]:
frames

[                                                  Text  Sentiment
 0    So there is no way for me to plug it in here i...          0
 1                          Good case, Excellent value.          1
 2                               Great for the jawbone.          1
 3    Tied to charger for conversations lasting more...          0
 4                                    The mic is great.          1
 ..                                                 ...        ...
 995  The screen does get smudged easily because it ...          0
 996  What a piece of junk.. I lose more calls on th...          0
 997                       Item Does Not Match Picture.          0
 998  The only thing that disappoint me is the infra...          0
 999  You can not answer calls with the unit, never ...          0
 
 [1000 rows x 2 columns],
                                                   Text  Sentiment
 0    A very, very, very slow-moving, aimless movie ...          0
 1    Not sure who was more lost -

In [30]:
df = pd.concat(frames,keys=keys)

In [31]:
df

Unnamed: 0,Unnamed: 1,Text,Sentiment
Yelp,0,So there is no way for me to plug it in here i...,0
Yelp,1,"Good case, Excellent value.",1
Yelp,2,Great for the jawbone.,1
Yelp,3,Tied to charger for conversations lasting more...,0
Yelp,4,The mic is great.,1
...,...,...,...
Amazon,995,The screen does get smudged easily because it ...,0
Amazon,996,What a piece of junk.. I lose more calls on th...,0
Amazon,997,Item Does Not Match Picture.,0
Amazon,998,The only thing that disappoint me is the infra...,0


In [32]:
nlp=spacy.load("en_core_web_sm")

In [33]:
stopwords = list(STOP_WORDS)

In [34]:
# all tokens without filter

token_list = [token for token in doc]

In [35]:
import string
punct = string.punctuation

In [36]:
# List of punctuations
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [37]:
# Removing stop words, punctuations and lemmatization
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [38]:
# Calculate TF-IDF
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
classifier = LinearSVC()

In [39]:
X = df['Text']
y = df['Sentiment']

In [41]:
# Test and Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

In [42]:
X_train.shape, X_test.shape

((2198,), (550,))

In [43]:
# Building Pipeline
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [44]:
clf

In [45]:
clf.fit(X_train, y_train)

In [46]:
# Calculate Y prediction
y_pred = clf.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92       266
           1       0.91      0.95      0.93       284

    accuracy                           0.93       550
   macro avg       0.93      0.93      0.93       550
weighted avg       0.93      0.93      0.93       550



In [48]:
# Predict a simple text
clf.predict(['It was not good'])

array([1])

#### `1` is a positive sentiment. `not` is not identified as negative sentiment here. Algorithm is predicting solely on `good`.

In [49]:
clf.predict(['The movie was bad'])

array([0])

#### Its probably because we removed stop words and `not` is a stop word. Using stop word removal can change the meaning of a sentence and make incorrect predictions.

In [40]:
i=0
for stopword in stopwords:
    i+=1
    if stopword == "not":
        print(stopword,i)

not 270


In [41]:
i=0
for stopword in stopwords:
    i+=1
    print(stopword,i)

former 1
may 2
keep 3
him 4
two 5
they 6
seem 7
whereupon 8
except 9
these 10
empty 11
at 12
say 13
amount 14
's 15
beyond 16
become 17
what 18
enough 19
latter 20
had 21
elsewhere 22
her 23
rather 24
n't 25
show 26
’ll 27
afterwards 28
all 29
otherwise 30
until 31
thence 32
name 33
thru 34
somewhere 35
hence 36
throughout 37
others 38
’m 39
front 40
down 41
further 42
such 43
'm 44
about 45
namely 46
whose 47
them 48
whom 49
while 50
though 51
too 52
already 53
done 54
did 55
me 56
she 57
their 58
one 59
hereby 60
than 61
mostly 62
are 63
i 64
his 65
out 66
nothing 67
from 68
where 69
‘ll 70
how 71
first 72
mine 73
could 74
seeming 75
have 76
ca 77
six 78
eleven 79
since 80
if 81
hereupon 82
our 83
thereafter 84
would 85
be 86
into 87
each 88
using 89
of 90
against 91
everything 92
least 93
might 94
nine 95
fifteen 96
twenty 97
whoever 98
thereby 99
before 100
but 101
noone 102
else 103
thereupon 104
been 105
without 106
‘d 107
n‘t 108
're 109
used 110
neither 111
fifty 112
onto 113
m

### Approach # 2

In [50]:
# Lemmatize the words

lemmas = [f"Token: {token}, lemma: {token.lemma_}" for token in token_list]
lemmas

['Token: Microsoft, lemma: Microsoft',
 'Token: is, lemma: be',
 'Token: better, lemma: well',
 'Token: than, lemma: than',
 'Token: Apple, lemma: Apple',
 'Token: ., lemma: .',
 'Token: It, lemma: it',
 "Token: 's, lemma: be",
 'Token: perfect, lemma: perfect',
 'Token: weather, lemma: weather',
 'Token: in, lemma: in',
 'Token: Dallas, lemma: Dallas',
 'Token: today, lemma: today',
 'Token: ., lemma: .',
 'Token: However, lemma: however',
 'Token: ,, lemma: ,',
 'Token: it, lemma: it',
 'Token: has, lemma: have',
 'Token: been, lemma: be',
 'Token: a, lemma: a',
 'Token: really, lemma: really',
 'Token: long, lemma: long',
 'Token: and, lemma: and',
 'Token: hot, lemma: hot',
 'Token: summer, lemma: summer',
 'Token: 2022, lemma: 2022',
 'Token: ., lemma: .',
 'Token: I, lemma: I',
 'Token: wish, lemma: wish',
 'Token: we, lemma: we',
 'Token: have, lemma: have',
 'Token: milder, lemma: milder',
 'Token: winter, lemma: winter',
 'Token: now, lemma: now',
 'Token: ., lemma: .',
 'Toke

##### https://spacytextblob.netlify.app/

### Text Blob can help get the sentiment and subjectivity polarity.

In [66]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

The `._.blob` attribute contains all of the methods and attributes that belong to the textblob.TextBlob class Some of the common methods and attributes include:

`._.blob.polarity:` a float within the range `[-1.0, 1.0]`.

`._.blob.subjectivity:` a float within the range `[0.0, 1.0]` where 0.0 is very objective and 1.0 is very subjective.

`._.blob.sentiment_assessments.assessments:` a list of polarity and subjectivity scores for the assessed tokens.

In [67]:

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')


<spacytextblob.spacytextblob.SpacyTextBlob at 0x7fa14b045460>

In [68]:
text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'


In [69]:
doc = nlp(text)
doc

I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.

In [70]:
doc._.blob.polarity                           

-0.125

In [71]:
doc._.blob.subjectivity                       

0.9

### Above sentence has high Subjectivity and a negative sentiment

In [72]:
doc._.blob.sentiment_assessments.assessments   

[(['really', 'horrible'], -1.0, 1.0, None),
 (['worst', '!'], -1.0, 1.0, None),
 (['really', 'good'], 0.7, 0.6000000000000001, None),
 (['happy'], 0.8, 1.0, None)]

In [73]:
#ngram 
doc._.blob.ngrams()                            

[WordList(['I', 'had', 'a']),
 WordList(['had', 'a', 'really']),
 WordList(['a', 'really', 'horrible']),
 WordList(['really', 'horrible', 'day']),
 WordList(['horrible', 'day', 'It']),
 WordList(['day', 'It', 'was']),
 WordList(['It', 'was', 'the']),
 WordList(['was', 'the', 'worst']),
 WordList(['the', 'worst', 'day']),
 WordList(['worst', 'day', 'ever']),
 WordList(['day', 'ever', 'But']),
 WordList(['ever', 'But', 'every']),
 WordList(['But', 'every', 'now']),
 WordList(['every', 'now', 'and']),
 WordList(['now', 'and', 'then']),
 WordList(['and', 'then', 'I']),
 WordList(['then', 'I', 'have']),
 WordList(['I', 'have', 'a']),
 WordList(['have', 'a', 'really']),
 WordList(['a', 'really', 'good']),
 WordList(['really', 'good', 'day']),
 WordList(['good', 'day', 'that']),
 WordList(['day', 'that', 'makes']),
 WordList(['that', 'makes', 'me']),
 WordList(['makes', 'me', 'happy'])]

In [76]:
# Test sentence

test_text = "It was not good."

In [77]:
doc = nlp(test_text)
doc

It was not good.

In [78]:
doc._.blob.polarity  

-0.35

In [79]:
doc._.blob.subjectivity 

0.6000000000000001

### We accurately get negative polarity for the above test statement.

### Approach # 3

In [80]:
sentences = [
  'The food we had yesterday was delicious',
  'My time in Italy was very enjoyable',
  'I found the meal to be tasty',
  'The internet was slow.',
  'Our experience was suboptimal'
]

In [81]:
for sentence in sentences:
  doc = nlp(sentence)
  for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
      token.pos_,[child for child in token.children])

The det food NOUN DET []
food nsubj was AUX NOUN [The, had]
we nsubj had VERB PRON []
had relcl food NOUN VERB [we, yesterday]
yesterday npadvmod had VERB NOUN []
was ROOT was AUX AUX [food, delicious]
delicious acomp was AUX ADJ []
My poss time NOUN PRON []
time nsubj was AUX NOUN [My, in]
in prep time NOUN ADP [Italy]
Italy pobj in ADP PROPN []
was ROOT was AUX AUX [time, enjoyable]
very advmod enjoyable ADJ ADV []
enjoyable acomp was AUX ADJ [very]
I nsubj found VERB PRON []
found ROOT found VERB VERB [I, be]
the det meal NOUN DET []
meal nsubj be AUX NOUN [the]
to aux be AUX PART []
be ccomp found VERB AUX [meal, to, tasty]
tasty acomp be AUX ADJ []
The det internet NOUN DET []
internet nsubj was AUX NOUN [The]
was ROOT was AUX AUX [internet, slow, .]
slow acomp was AUX ADJ []
. punct was AUX PUNCT []
Our poss experience NOUN PRON []
experience nsubj was AUX NOUN [Our]
was ROOT was AUX AUX [experience, suboptimal]
suboptimal acomp was AUX ADJ []


In [83]:
# Identify the adjective which are the descriptive term
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  for token in doc:
    if token.pos_ == 'ADJ':
      descriptive_term = token
  print(sentence)
  print(descriptive_term)

The food we had yesterday was delicious
delicious
My time in Italy was very enjoyable
enjoyable
I found the meal to be tasty
tasty
The internet was slow.
slow
Our experience was suboptimal
suboptimal


In [84]:
# One step further to identify the adverbs to qualify the sentences sentiments even further
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  for token in doc:
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  print(sentence)
  print(descriptive_term)

The food we had yesterday was delicious
delicious
My time in Italy was very enjoyable
very enjoyable
I found the meal to be tasty
tasty
The internet was slow.
slow
Our experience was suboptimal
suboptimal


In [85]:
aspects = []
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  target = ''
  for token in doc:
    if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
      target = token.text
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  aspects.append({'aspect': target,
    'description': descriptive_term})
print(aspects)

[{'aspect': 'food', 'description': 'delicious'}, {'aspect': 'time', 'description': 'very enjoyable'}, {'aspect': 'meal', 'description': 'tasty'}, {'aspect': 'internet', 'description': 'slow'}, {'aspect': 'experience', 'description': 'suboptimal'}]


In [86]:
from textblob import TextBlob
for aspect in aspects:
  aspect['sentiment'] = TextBlob(aspect['description']).sentiment
print(aspects)

[{'aspect': 'food', 'description': 'delicious', 'sentiment': Sentiment(polarity=1.0, subjectivity=1.0)}, {'aspect': 'time', 'description': 'very enjoyable', 'sentiment': Sentiment(polarity=0.65, subjectivity=0.78)}, {'aspect': 'meal', 'description': 'tasty', 'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)}, {'aspect': 'internet', 'description': 'slow', 'sentiment': Sentiment(polarity=-0.30000000000000004, subjectivity=0.39999999999999997)}, {'aspect': 'experience', 'description': 'suboptimal', 'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)}]


### In above example, algorithm couldn't classify the sentiments of words like `Suboptimal`. we can train it to be better predictive algorithm.

In [96]:
from textblob.classifiers import NaiveBayesClassifier
# We train the NaivesBayesClassifier
train = [
  ('Slow internet.', 'negative'),
  ('Delicious food', 'positive'),
  ('Suboptimal experience', 'negative'),
  ('Very enjoyable time', 'positive'),
  ('delicious food.', 'negative'), 
  ('loved', 'positive'),
  ('not good', 'negative') 
]
cl = NaiveBayesClassifier(train)
# And then we try to classify some sample sentences.
blob = TextBlob("Delicious food. Very Slow internet. Suboptimal experience. Enjoyable food.", classifier=cl)
for s in blob.sentences:
  print(s)
  print(s.classify())

Delicious food.
positive
Very Slow internet.
negative
Suboptimal experience.
negative
Enjoyable food.
negative


In [97]:
blob = TextBlob("It was not good. I loved it.", classifier=cl)
for s in blob.sentences:
  print(s)
  print(s.classify())

It was not good.
negative
I loved it.
positive
