# CountVectorizer is a Transformer that is used to Transform text sentences into numbers that can later be fed in Logistic regression or any other Machine learning model

In [1]:
import pandas as pd
import re #re here is the regex module of python
import numpy as np

In [2]:
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t') # define seprator as \t

In [3]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [4]:
df.shape

(1000, 2)

In [5]:
df.Liked.value_counts() ## Cheking the number of liked and disliked reviews in the dataset

0    500
1    500
Name: Liked, dtype: int64

In [116]:
def clean(x):
    s = re.sub('<*?>',' ', x) #removes all html tags
    
#     s = re.sub("wasn't", 'was not',s) #removes wasn't to was not
#     s = re.sub("don't", 'do not',s)
#     s = re.sub("din't", 'did not',s)
#     s = re.sub("wouldn't", 'would not',s)
#     s = re.sub("won't", 'will not',s)
#     s = re.sub("isn't", 'is not',s)
#     s = re.sub("needn't", 'need not',s)
#     s = re.sub("hasn't", 'has not',s)
    
    s = re.sub('[^A-Za-z]',' ',s) #removes everything else except for a to z and A to Z
    s = re.sub('\s+',' ',s) #removes more than one spaces
    s = s.strip() #removes spaces from the beginning and ending of the sentences
    return s.lower()

In [117]:
clean("The Food wasn't bad")

'the food wasn t bad'

In [118]:
df['Review'] = df.Review.apply(clean)

In [119]:
df

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
...,...,...
995,i think food should have flavor and texture an...,0
996,appetite instantly gone,0
997,overall i was not impressed and would not go back,0
998,the whole experience was underwhelming and i t...,0


In [120]:
x = df.Review.values

In [121]:
x.shape

(1000,)

In [122]:
x.ndim

1

In [123]:
y = df.Liked.values

In [124]:
y.shape

(1000,)

In [125]:
y.ndim

1

## Split data

In [126]:
from sklearn.model_selection import train_test_split

In [127]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.20,random_state=12)

## Now Apply CountVectorizer

In [128]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords') #Run this command to download and install Stopwords from natural language processor
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/Machine_learning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [129]:
words = stopwords.words('english')

In [130]:
# getting stopwords from nltk 
for i in list(words):
    print(i,end=',')

i,me,my,myself,we,our,ours,ourselves,you,you're,you've,you'll,you'd,your,yours,yourself,yourselves,he,him,his,himself,she,she's,her,hers,herself,it,it's,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,that'll,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,don't,should,should've,now,d,ll,m,o,re,ve,y,ain,aren,aren't,couldn,couldn't,didn,didn't,doesn,doesn't,hadn,hadn't,hasn,hasn't,haven,haven't,isn,isn't,ma,mightn,mightn't,mustn,mustn't,needn,needn't,shan,shan't,shouldn,shouldn't,wasn,wasn't,weren,weren't,won,won't,wouldn,wouldn't,

In [131]:
#checking if now, no, don't , do not , not are present or not
if 'no' in words:
    print("no is present")
if 'not' in words:
    print("not is present")
if 'never' in words:
    print("never is present")
if "don't" in words:
    print("don't is present")
if 'do not' in words:
    print("do not is present")
if "din't" in words:
    print("din't is present")
if 'did not' in words:
    print("did not is present")
if "wouldn't" in words:
    print("wouldn't is present")
if 'would not' in words:
    print("would not is present")
if 'will not' in words:
    print("will not is present")
if "won't" in words:
    print("won't is present")
if "wasn't" in words:
    print("wasn't is present")
if "isn't" in words:
    print("isn't is present")
if "needn't" in words:
    print("needn't is present")
if "hasn't" in words:
    print("hasn't is present")

no is present
not is present
don't is present
wouldn't is present
won't is present
wasn't is present
isn't is present
needn't is present
hasn't is present


## now we have to make sure that all the negative words are removed from 'words = stopwords.words('english')' before we feed it to 'count_vectorizer = CountVectorizer(stop_words=words)'

In [132]:
#Removinf all the negative words from the "words" variable
negative_words = ['no','not','never', "don't", "do not", "din't", "did not", "wouldn't", "would not"]
if 'no' in words:
    words.remove("no")
if 'not' in words:
    words.remove("not")
if 'never' in words:
    words.remove("never")
if "don't" in words:
    words.remove("don't")
if 'do not' in words:
    words.remove("do not")
if "din't" in words:
    words.remove("din't")
if 'did not' in words:
    words.remove("did not")
if "wouldn't" in words:
    words.remove("wouldn't")
if 'would not' in words:
    words.remove("would not")
if 'will not' in words:
    words.remove("will not")
if "won't" in words:
    words.remove("won't")
if "wasn't" in words:
    words.remove("wasn't")
if "isn't" in words:
    words.remove("isn't")
if "needn't" in words:
    words.remove("needn't")
if "hasn't" in words:
    words.remove("hasn't")

In [133]:
#again rechecking if negative words are present or not 
negative_words = ['no','not','never', "don't", "do not", "din't", "did not", "wouldn't", "would not"]
if 'no' in words:
    print("no is present")
if 'not' in words:
    print("not is present")
if 'never' in words:
    print("never is present")
if "don't" in words:
    print("don't is present")
if 'do not' in words:
    print("do not is present")
if "din't" in words:
    print("din't is present")
if 'did not' in words:
    print("did not is present")
if "wouldn't" in words:
    print("wouldn't is present")
if 'would not' in words:
    print("would not is present")
if 'will not' in words:
    print("will not is present")
if "won't" in words:
    print("won't is present")
if "wasn't" in words:
    print("wasn't is present")
if "isn't" in words:
    print("isn't is present")
if "needn't" in words:
    print("needn't is present")
if "hasn't" in words:
    print("hasn't is present")
else:
    print("no negative words are present")

no negative words are present


In [134]:
count_vectorizer = CountVectorizer(stop_words=words) #initialization

In [135]:
cv_train = count_vectorizer.fit_transform(xtrain)
cv_test = count_vectorizer.transform(xtest)

In [136]:
cv_train.shape

(800, 1653)

In [137]:
cv_train.ndim

2

In [138]:
cv_test.shape

(200, 1653)

In [139]:
cv_test.ndim

2

## Now we can apply Logistic Regression Machine Learning model

In [140]:
from sklearn.linear_model import LogisticRegression

In [141]:
log = LogisticRegression() #initializing Model

In [142]:
log.fit(cv_train,ytrain) #training model

LogisticRegression()

In [143]:
score = log.score(cv_test,ytest) # scoring model

In [144]:
score

0.825

In [145]:
pred = log.predict(cv_test) #predicting 
pred

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1])

## Printing number of misspredicted data

In [146]:
print(np.where(ytest!=pred))

(array([  4,   6,  27,  32,  35,  36,  37,  60,  63,  65,  67,  72,  76,
        83,  86,  92, 100, 101, 115, 121, 122, 125, 134, 135, 137, 138,
       145, 155, 157, 163, 167, 169, 173, 176, 180]),)


## Testing the trained model on a custom input from the user

In [185]:
test = ["I wouldn't have eaten the food if I had known", 
        "I won't eat here ever again", 
        "The food was good", 
        "It was a good meal",
       "The food was not upto the mark like it used to in previous years", 
        "I should not have ordered these. It's bad...It's really bad",
       "Awesome Food !!!...Good service", 
        "I am satisfied with the food",
       "They needn't have to make the order process so complicated",
       "They din't have to go above and beyond but they did it was cool.....I loved it",
       "The meal wasn't satisfactory",
       "The food was bad"]
clean_data = []
for k in test:
    clean_data.append(clean(k))

In [186]:
test1 = count_vectorizer.transform(clean_data)

In [187]:
test1.shape

(12, 1653)

In [188]:
test1.ndim

2

In [189]:
log.predict(test1) # 0 means negative review and 1 means positive review

array([0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0])

In [169]:
# "I am satisfied with the food " should be a positive review but it's classifying it as a negative one [pos = 7]
# how to solve this issue