# Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [2]:
#load dataset
df = pd.read_csv(r"D:\AI\Natural_language_Processing\sentimentdata.csv")
df.shape

(20, 2)

In [3]:
df.head()

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative


In [4]:
df

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative
5,The food was delivered late and I hate late de...,Negative
6,My wife love pizza and burger more than me,Positive
7,the table was bad and dirty and i hate this,Negative
8,Food was delicious and I love it,Positive
9,It great to have good food at good time,Positive


In [5]:
x = df.document
y = df.label

## Text Cleaning

In [6]:
import spacy
# python -m spacy download en_core_web_sm

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
cleanx = []
for doc in x:
    doc = nlp(doc)
    cleanx.append(" ".join([w.lemma_ for w in doc]))

In [9]:
cleanx

['pizza be great and I love pizza .',
 'I hate burger and its bad to eat burger .',
 'I hate dirty table .',
 'Burger be amazing and I love it more than anything',
 'my boss be a monster and I hate he',
 'the food be deliver late and I hate late delivery',
 'my wife love pizza and burger more than I',
 'the table be bad and dirty and I hate this',
 'food be delicious and I love it',
 'it great to have good food at good time',
 'icecream be not good .',
 'icecream be not bad .',
 'Pizza be not bad .',
 'Burger be bad .',
 'juice be not good .',
 'juice be good .',
 'ketchup be bad .',
 'ketchup be good .',
 'ketchup with pizza be great and good I love it',
 'I hate ketchup with pizza and it be bad   and it be not good']

In [10]:
y

0     Positive
1     Negative
2     Negative
3     Positive
4     Negative
5     Negative
6     Positive
7     Negative
8     Positive
9     Positive
10    Negative
11    Positive
12    Positive
13    Negative
14    Negative
15    Positive
16    Negative
17    Positive
18    Positive
19    Negative
Name: label, dtype: object

## Count Vectorization

In [11]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
dic = list(ENGLISH_STOP_WORDS)
dic.remove('not')

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(lowercase=True,stop_words=dic,ngram_range=(1,2),min_df=2)
vec.fit(cleanx)
print(len(vec.get_feature_names()))
print(vec.get_feature_names())

22
['bad', 'burger', 'burger bad', 'dirty', 'food', 'good', 'great', 'great good', 'hate', 'icecream', 'icecream not', 'juice', 'ketchup', 'ketchup pizza', 'love', 'love pizza', 'not', 'not bad', 'not good', 'pizza', 'pizza great', 'table']


In [13]:
x2 = vec.transform(cleanx).toarray()
x2.shape

(20, 22)

In [14]:
pd.DataFrame(x2,columns=vec.get_feature_names()).head()

Unnamed: 0,bad,burger,burger bad,dirty,food,good,great,great good,hate,icecream,...,ketchup,ketchup pizza,love,love pizza,not,not bad,not good,pizza,pizza great,table
0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,2,1,0
1,1,2,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
x2

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 0],
       [1, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0],
       [1, 1, 1, 0, 0, 0,

## Apply ML to developed classification model

In [16]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(15,7),max_iter=1000,tol=0.0000001)
model.fit(x2,y)



MLPClassifier(hidden_layer_sizes=(15, 7), max_iter=1000, tol=1e-07)

In [17]:
newdoc = ["I love pasta and Pasta is amazing"]
newdoc = vec.transform(newdoc)
model.predict(newdoc)

array(['Positive'], dtype='<U8')

In [18]:
newdoc = ["I hate pasta and Pasta is bad"]
newdoc = vec.transform(newdoc)
model.predict(newdoc)

array(['Negative'], dtype='<U8')

In [19]:
newdoc = ["hi siri"]
newdoc = vec.transform(newdoc)
model.predict(newdoc)

array(['Positive'], dtype='<U8')

In [20]:
newdoc.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [21]:
newdoc = ["i love pasta and pasta is amazing"]
newdoc = vec.transform(newdoc)
model.predict_proba(newdoc)

array([[0.03166768, 0.96833232]])

In [22]:
newdoc = ["i hate pasta and pasta is amazing"]
newdoc = vec.transform(newdoc)
model.predict_proba(newdoc)

array([[0.99813265, 0.00186735]])

In [23]:
newdoc = ["i hate pasta and pasta is bad"]
newdoc = vec.transform(newdoc)
model.predict_proba(newdoc)

array([[9.99868777e-01, 1.31223018e-04]])

In [24]:
new_doc = ["I am good good bad fellow"]
new_doc = vec.transform(new_doc)
model.predict_proba(new_doc)

array([[0.19106088, 0.80893912]])

In [25]:
new_doc.toarray()

array([[1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [26]:
new_doc = ["Pasta is good"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Positive'], dtype='<U8')

In [27]:
new_doc = ["Pasta is not good"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Negative'], dtype='<U8')

In [28]:
new_doc = ["Pasta is bad"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Negative'], dtype='<U8')

In [29]:
new_doc = ["Pasta is not bad"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Negative'], dtype='<U8')

In [30]:
new_doc = ["My name is Anshu and Pasta is good"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Positive'], dtype='<U8')

In [31]:
new_doc = ["My name is Anshu and Pasta is not good"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Negative'], dtype='<U8')

In [32]:
new_doc = ["My name is not Anshu and Pasta is good"]
new_doc = vec.transform(new_doc)
model.predict(new_doc)

array(['Negative'], dtype='<U8')

In [33]:
new_doc = ["I like pasta but this pasta is not great"]
new_doc = vec.transform(new_doc)
model.predict_proba(new_doc)

array([[0.61916001, 0.38083999]])

In [34]:
newdoc = ["I like the pasta, but it is not great"]
newdoc = vec.transform(newdoc)
print(f"probabilty:{model.predict_proba(newdoc)}")
model.predict(newdoc)

probabilty:[[0.61916001 0.38083999]]


array(['Negative'], dtype='<U8')

In [35]:
import joblib
joblib.dump(vec,'webapp/vectorizer.pkl')
joblib.dump(model,'webapp/sentiment_model.pkl')

['webapp/sentiment_model.pkl']