# Text Classification - Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Remote CSV holding the sentiment dataset; later cells read its `document`
# and `label` columns.
url = "https://raw.githubusercontent.com/anshupandey/Machine_Learning_Training/master/datasets/sentimentdata.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Quick sanity check on the number of rows and columns that were loaded.
df.shape

(20, 2)

In [33]:
# Preview only the first rows instead of rendering the entire frame, so the
# cell output stays bounded no matter how large the dataset is.
df.head(10)

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative
5,The food was delivered late and I hate late de...,Negative
6,My wife love pizza and burger more than me,Positive
7,the table was bad and dirty and i hate this,Negative
8,Food was delicious and I love it,Positive
9,It great to have good food at good time,Positive


In [4]:
# Features are the raw review texts; targets are their sentiment labels.
x = df["document"]
y = df["label"]

## Vectorization

In [21]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Start from scikit-learn's English stop-word set but keep "not" out of it,
# so negations remain available to the vectorizer as a token.
stop_dic = [word for word in ENGLISH_STOP_WORDS if word != 'not']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: lowercases the text and drops every word listed in
# `stop_dic` (the custom stop-word list, from which "not" has been removed).
vec = CountVectorizer(lowercase=True, stop_words=stop_dic)
vec.fit(x)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the printed output a plain Python list of strings.
feature_names = vec.get_feature_names_out().tolist()
print(len(feature_names))
print(feature_names)

25
['amazing', 'bad', 'boss', 'burger', 'delicious', 'delivered', 'delivery', 'dirty', 'eat', 'food', 'good', 'great', 'hate', 'icecream', 'juice', 'ketchup', 'late', 'love', 'monster', 'not', 'pizza', 'table', 'tables', 'time', 'wife']


In [24]:
# Encode every document as a row of term counts, then densify the sparse
# result so it can be inspected as a regular NumPy array.
doc_term_sparse = vec.transform(x)
x2 = doc_term_sparse.toarray()
print(x2.shape)

(20, 25)


In [25]:
# Label the count matrix with the fitted vocabulary so each column is readable.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` instead.
pd.DataFrame(x2, columns=vec.get_feature_names_out()).head(5)

Unnamed: 0,amazing,bad,boss,burger,delicious,delivered,delivery,dirty,eat,food,...,ketchup,late,love,monster,not,pizza,table,tables,time,wife
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,2,0,0,0,0
1,0,1,0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Apply Machine Learning

In [26]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree random forest; the fixed random_state makes repeated fits on the
# same data give the same model.
model = RandomForestClassifier(
    random_state=5,
    n_estimators=200,
)

# Train on the dense term-count matrix and the sentiment labels.
model.fit(x2, y)

RandomForestClassifier(n_estimators=200, random_state=5)

In [27]:
# Two positive cue words ("good", "love"); "pasta" itself is not part of the
# fitted vocabulary, so it contributes no counts.
newdoc = ['pasta is good and i love pasta']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Positive'], dtype=object)

In [28]:
# Same sentence shape, now with the negative cue words "bad" and "hate".
newdoc = ['pasta is bad and i hate pasta']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Negative'], dtype=object)

In [29]:
# A single positive cue word.
newdoc = ['pasta is good']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Positive'], dtype=object)

In [30]:
# Negation check: "not" was removed from the stop-word list, so it is counted
# as a feature alongside "good".
newdoc = ['pasta is not good']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Negative'], dtype=object)

In [31]:
# A single negative cue word.
newdoc = ['pasta is bad']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Negative'], dtype=object)

In [32]:
# Negated negative: "not" together with "bad".
newdoc = ['pasta is not bad']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Positive'], dtype=object)

In [34]:
# Two sentences passed as ONE document: they are vectorized together into a
# single row, so the model returns a single label for the whole text.
newdoc = ['pasta is not bad. I like pizza. ']
newdoc_counts = vec.transform(newdoc)
model.predict(newdoc_counts)

array(['Positive'], dtype=object)

In [35]:
import nltk

In [38]:
newdoc = 'pasta is bad. I like burget and love it.'

# Split the text into sentences and classify each one on its own, so a
# mixed-sentiment document yields one label per sentence.
for sentence in nltk.sent_tokenize(newdoc):
    sentence_label = model.predict(vec.transform([sentence]))
    print(sentence, sentence_label)

pasta is bad. ['Negative']
I like burget and love it. ['Positive']


In [40]:
# Class probabilities instead of a hard label; columns follow the order of
# `model.classes_`.
newdoc = ['pasta is good. I like pizza. ']
newdoc_features = vec.transform(newdoc)
model.predict_proba(newdoc_features)

array([[0.135, 0.865]])

In [41]:
# Same text with "bad" in place of "good" in the first sentence, to see how
# the class probabilities move.
newdoc = ['pasta is bad. I like pizza. ']
newdoc_features = vec.transform(newdoc)
model.predict_proba(newdoc_features)

array([[0.41, 0.59]])

In [42]:
# Both sentences now carry a negative cue word ("bad", "hate").
newdoc = ['pasta is bad. I hate pizza. ']
newdoc_features = vec.transform(newdoc)
model.predict_proba(newdoc_features)

array([[0.76, 0.24]])