# classification

#### load data, categorize the overall column in sentiment

In [2]:
import pandas as pd
df = pd.read_json('Books_small.json', lines=True)
df['sentiment'] = 'NEGATIVE'
df.loc[(df.overall == 3), ['sentiment']] = 'NEUTRAL'
df.loc[(df.overall >  3), ['sentiment']] = 'POSITIVE'
# find the number of positive and negative reviews
print(df.groupby(['sentiment']).count()['asin'])
df

sentiment
NEGATIVE     62
NEUTRAL     103
POSITIVE    835
Name: asin, dtype: int64


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A1E5ZR1Z4OQJG,1495329321,"Pure Jonel ""Pure Jonel""","[0, 0]",Da Silva takes the divine by storm with this u...,4,An amazing first novel,1396137600,"03 30, 2014",POSITIVE
1,A30PZPI6FPH0A7,0399157565,Jackmollie,"[0, 0]",For me personally it's the most disappointing ...,2,disappointed,1400112000,"05 15, 2014",NEGATIVE
2,A1GQ2UI5BKCCRD,0984528105,Gail Hodges,"[0, 0]","Very simple book, but leaves you feeling good....",4,Good book,1401235200,"05 28, 2014",POSITIVE
3,A2DF4LQQI6KSQ2,0804139024,Olga,"[0, 0]",I read a library copy of this exceptionally we...,5,Science Fiction at its best!,1396483200,"04 3, 2014",POSITIVE
4,A1UAMAWY966P2,0765317583,Nadyne M Ichimura,"[0, 0]",With the government knowing this could happen ...,5,Excellent story,1397001600,"04 9, 2014",POSITIVE
...,...,...,...,...,...,...,...,...,...,...
995,A1Q3P5W409XK8I,0385346824,Wright,"[0, 0]",I thoroughly enjoyed this book. I've read the ...,5,A very fun read.,1404691200,"07 7, 2014",POSITIVE
996,AOFXJE92VZHJA,B00J9312KQ,"LaVonMoffett ""heavy reader""","[0, 0]",I was impressed with not only the characters o...,5,Loved the story,1400371200,"05 18, 2014",POSITIVE
997,A1OH43Q6Q05ULQ,0553808036,Jaci,"[0, 0]",I like the characters. I had read the short s...,5,Angels among us?,1396915200,"04 8, 2014",POSITIVE
998,A34QXHSIS1IFNS,B00BPTTFOW,H.Smith,"[8, 9]","She got way she wanted,but can she pick the ri...",5,Which one is the real one,1401408000,"05 30, 2014",POSITIVE


# split dataframe into training and test data

In [2]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(df, test_size = 0.33, random_state=42)
training

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
703,A28VCFPVVSFEYB,1463561520,Amazon Customer,"[0, 0]",Vivid characters and descriptions. The author ...,5,Great Read,1393372800,"02 26, 2014",POSITIVE
311,A1WZU6M69O5R8V,0060774592,"Zhi ""zhi""","[0, 0]",Oh this is an awful bookA delightful awful boo...,5,This is a terrible book. We love it.,1393977600,"03 5, 2014",POSITIVE
722,A3MISPG4YIPYU1,1848191162,"Matthew Russell ""Matthew Russell""","[4, 4]",Mitchell has really gifted us with a gem. He p...,5,A wonderful collection of equally accessible a...,1393113600,"02 23, 2014",POSITIVE
629,A127GQTJI4FP6Q,0671207148,alwayslearning,"[0, 0]",Felt like I was floating down the river...Soli...,4,Wow,1388707200,"01 3, 2014",POSITIVE
0,A1E5ZR1Z4OQJG,1495329321,"Pure Jonel ""Pure Jonel""","[0, 0]",Da Silva takes the divine by storm with this u...,4,An amazing first novel,1396137600,"03 30, 2014",POSITIVE
...,...,...,...,...,...,...,...,...,...,...
106,A3V5KBIS9TWUVY,039916393X,"Tina Says ""Tina Says""","[1, 1]",Tracy Holczer's The Secret Hum of a Daisy is t...,5,The Secret Hum of a Daisy: An Absolute Must Read,1402272000,"06 9, 2014",POSITIVE
270,A2DSNXBJ9B2HAD,1442365528,Professor Bob,"[0, 0]","Red Sparrow is a grim, realistic, violent and ...",4,"Grim, realistic and suspenseful!",1402876800,"06 16, 2014",POSITIVE
860,A30DZJBG8ZEGHT,1844137864,Tab,"[0, 10]","This was a gift, but some of the art inside is...",1,Don't agree with all of the art.,1393977600,"03 5, 2014",NEGATIVE
435,A11ZIYONP0ENPG,1939416019,Francois Naude,"[1, 2]",A fast flowing story of heroes and villains. I...,3,A quest of heroes,1391299200,"02 2, 2014",NEUTRAL


# vectorize text using bag of words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
trainingText = vectorizer.fit_transform(training.reviewText)
testText = vectorizer.transform(test.reviewText)
testText

<330x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 18336 stored elements in Compressed Sparse Row format>

# classify text

### linear SVM

In [4]:
from sklearn.svm import SVC
clf_svm = SVC(kernel='linear')
clf_svm.fit(trainingText, training.sentiment)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

#### predict some text

In [5]:
badReview = ['This stuff is awful! Do not buy! Awful!', 'I hate it', 'What a waste of money']
badReviewText = vectorizer.transform(badReview)
clf_svm.predict(badReviewText)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype=object)

#### decision tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
clf_dectree = DecisionTreeClassifier()
clf_dectree.fit(trainingText, training.sentiment)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

#### predict from dec tree

In [7]:
clf_dectree.predict(badReviewText)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype=object)

#### naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB()
clf_nb.fit(trainingText.toarray(), training.sentiment)

GaussianNB(priors=None, var_smoothing=1e-09)

#### predict from nb

In [9]:
clf_nb.predict(badReviewText.toarray())

array(['POSITIVE', 'NEUTRAL', 'NEGATIVE'], dtype='<U8')

#### logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
clf_lr.fit(trainingText, training.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### predict from lr

In [11]:
clf_lr.predict(badReviewText)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype=object)

# evaluation

In [12]:
print('SVM score:', clf_svm.score(testText, test.sentiment))
print('Dec tree score:', clf_dectree.score(testText, test.sentiment))
print('NB score:', clf_nb.score(testText.toarray(), test.sentiment))
print('LR score:', clf_lr.score(testText, test.sentiment))

SVM score: 0.8242424242424242
Dec tree score: 0.7393939393939394
NB score: 0.8121212121212121
LR score: 0.8303030303030303


In [13]:
from sklearn.metrics import f1_score
print ('SVM:', f1_score(test.sentiment, clf_svm.predict(testText), average=None, labels=['POSITIVE', 'NEUTRAL', 'NEGATIVE']))
print ('DT:', f1_score(test.sentiment, clf_dectree.predict(testText), average=None, labels=['POSITIVE', 'NEUTRAL', 'NEGATIVE']))
print ('NB:', f1_score(test.sentiment, clf_nb.predict(testText.toarray()), average=None, labels=['POSITIVE', 'NEUTRAL', 'NEGATIVE']))
print ('LR:', f1_score(test.sentiment, clf_lr.predict(testText), average=None, labels=['POSITIVE', 'NEUTRAL', 'NEGATIVE']))

SVM: [0.91319444 0.21052632 0.22222222]
DT: [0.86021505 0.11940299 0.        ]
NB: [0.89678511 0.08510638 0.09090909]
LR: [0.91370558 0.12244898 0.1       ]


# balance number of positive and negative reviews

In [14]:
negative = df[(df.sentiment == 'NEGATIVE')]
#print (negative)
positive = df[(df.sentiment == 'POSITIVE')][:len(negative)]
#print (positive)
neutral = df[(df.sentiment == 'NEUTRAL')][:len(negative)]
balanced = pd.concat([negative, positive, neutral])
#print (balanced)
import sklearn.utils
balanced = sklearn.utils.shuffle(balanced)
balanced

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
471,A3N9LU1Y0FWOGU,B00IYYK2JS,"Read4ever ""G""","[0, 0]","I've only read a few pages so far, so I can't ...",2,Truthfully,1395100800,"03 18, 2014",NEGATIVE
468,A1WVO00PIPF8RA,0385536518,Maggot Lady,"[1, 2]",Seriously... I do not know why I even read it....,2,Dexter Downhill,1388966400,"01 6, 2014",NEGATIVE
55,A1098Z3D7ENJ2F,B00GVQOJZ4,veronica mostel,"[0, 0]",WOW. THESE BOOKS ARE GREAT. CANNOT WAIT FOR ...,5,LOVED IT,1398038400,"04 21, 2014",POSITIVE
28,A3F1T8R9CVNPSN,0316098329,Eileen M Brisbane,"[0, 0]",What a beautifully written book from a child's...,5,Sad yet moving,1400112000,"05 15, 2014",POSITIVE
7,ATT15IFF1UBAQ,0060781939,"A. Nuhanovic ""AlNuhano""","[0, 0]",It was good....there is a lot going on with mu...,4,Very fast paced,1397606400,"04 16, 2014",POSITIVE
...,...,...,...,...,...,...,...,...,...,...
555,ANJ8YJQP5GRRF,0528853392,Lisa H.,"[0, 1]",Got rid of this obsolete crap as soon as the c...,1,Who knows?,1398470400,"04 26, 2014",NEGATIVE
283,A1JMXGA0C0LR2C,0800734297,Ronnilu,"[0, 0]","As a Christian, I enjoy a well-written, exciti...",3,So-So,1391385600,"02 3, 2014",NEUTRAL
313,A3GBNOO6OAY0B1,1628548614,Gina M.,"[1, 1]",Definitely a man&#8217;s book. A narrative of...,3,A quick Read,1394064000,"03 6, 2014",NEUTRAL
13,AL3Y69LODZTP8,0449908585,"Dropletform ""Drop""","[0, 0]","I liked the interwoven personal drama, set off...",4,More good stuff from Theroux,1397606400,"04 16, 2014",POSITIVE


# make a bag of words for balanced dataframe

In [15]:
balancedText = vectorizer.fit_transform(balanced.reviewText)
badReviewText = vectorizer.transform(badReview)

# classify balanced data

In [21]:
clf_svm.fit(balancedText, balanced.sentiment)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [22]:
clf_svm.predict(badReviewText)

array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype=object)

# OMG! now try with decision tree!

In [23]:
clf_dectree.fit(balancedText, balanced.sentiment)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
clf_dectree.predict(badReviewText)

array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype=object)

# Wow - now with naive bayes

In [28]:
clf_nb.fit(balancedText.toarray(), balanced.sentiment)

GaussianNB(priors=None, var_smoothing=1e-09)

In [31]:
clf_nb.predict(badReviewText.toarray())

array(['NEGATIVE', 'NEUTRAL', 'NEGATIVE'], dtype='<U8')

# finally, with logistic regression

In [33]:
clf_lr.fit(balancedText.toarray(), balanced.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
clf_lr.predict(badReviewText)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype=object)