In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset=pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t", quoting=3)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [3]:
dataset["Liked"].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

## First step to remove number,cheracter etc

In [5]:
review=re.sub('[^a-zA-Z]',' ', dataset['Review'][0])
review

'Wow    Loved this place '

## Second step is to convert all the word into lowercase

In [6]:
review=review.lower()
review

'wow    loved this place '

In [7]:
# nltk.download('stopwords')
# nltk.download('wordnet')

## Third step is to convert str into list

In [8]:
review=review.split()
review

['wow', 'loved', 'this', 'place']

## Fourth step is to remove punctuation like word e.g the, this, i , am, when etc

In [9]:
review=[word for word in review if not word in set(stopwords.words('english'))]

In [10]:
review

['wow', 'loved', 'place']

In [11]:
## REmember One Thing if we have paragraph contaning sentences 
#then we first convert the paragrap into sentence using 

# sentences=nltk.sent_tokenize(paragraph)

## Stemming Or Lemmatization

## Fifth Step is Stemming , e.g Loved to love, Jumped to jump

In [12]:
ps=PorterStemmer()

In [13]:
review=[ps.stem(word) for word in review]
# review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]

In [14]:
review

['wow', 'love', 'place']

## OR

## Fifth Step is (Lemmatization) , e.g Loved to love, Jumped to jump

In [15]:
wordnet=WordNetLemmatizer()

In [16]:
review=[wordnet.lemmatize(word) for word in review]

In [17]:
review

['wow', 'love', 'place']

## Sixth step to convert list to str/sentence

In [18]:
review= " ".join(review )

In [19]:
review

'wow love place'

## Apply this to all rows of review column

In [20]:
# my_review=[]
# def preprossing(row):
#     my_review = re.sub('[^a-zA-Z]',' ', row)
#     my_review = my_review.lower()
#     my_review = my_review.split()
#     my_review = [ps.stem(word) for word in my_review if not word in set(stopwords.words('english'))]
#     my_review =  " ".join(my_review)
#     return my_review


my_review=[]
def preprossing(row):
    my_review = re.sub('[^a-zA-Z]',' ', row)
    my_review = my_review.lower()
    my_review = my_review.split()
    my_review = [wordnet.lemmatize(word) for word in my_review if not word in set(stopwords.words('english'))]
    my_review =  " ".join(my_review)
    return my_review

In [21]:
print(preprossing('Wow... Loved this place.'))

wow loved place


In [22]:
corpus=[]
for i in range(0,1000):
    corpus.append(preprossing(dataset['Review'][i]))
#     dataset['Review'][i] = preprossing(dataset['Review'][i])

In [23]:
corpus

['wow loved place',
 'crust good',
 'tasty texture nasty',
 'stopped late may bank holiday rick steve recommendation loved',
 'selection menu great price',
 'getting angry want damn pho',
 'honeslty taste fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fry great',
 'great touch',
 'service prompt',
 'would go back',
 'cashier care ever say still ended wayyy overpriced',
 'tried cape cod ravoli chicken cranberry mmmm',
 'disgusted pretty sure human hair',
 'shocked sign indicate cash',
 'highly recommended',
 'waitress little slow service',
 'place worth time let alone vega',
 'like',
 'burrittos blah',
 'food amazing',
 'service also cute',
 'could care le interior beautiful',
 'performed',
 'right red velvet cake ohhh stuff good',
 'never brought salad asked',
 'hole wall great mexican street taco friendly staff',
 'took hour get food table restaurant food luke warm sever running around like totally overwhelmed',
 'worst salmon sashimi',
 'also combo like burge

## Bag of words OR               TF IDF                  OR    Word Embedding

## Beg Of Words

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv=CountVectorizer(max_features=1500)
X=cv.fit_transform(corpus).toarray()

In [26]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
X.shape

(1000, 1500)

## TF IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
cv=TfidfVectorizer()

In [30]:
X=cv.fit_transform(corpus).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
X.shape

(1000, 1766)

## Word Embedding

Word embeddings are in fact a class of techniques where individual words are represented as real-valued vectors in a predefined vector space. Each word is mapped to one vector and the vector values are learned in a way that resembles a neural network, and hence the technique is often lumped into the field of deep learning

In [32]:
from tensorflow.keras.preprocessing.text import one_hot

input_dim: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.

In [33]:
voc_size=10000

<b>One hot representation</b>

In [34]:
onehot_rep=[one_hot(words, voc_size)for words in corpus]
print(onehot_rep)

[[4257, 8318, 881], [6838, 9189], [3601, 4368, 1736], [8535, 7520, 9668, 2449, 893, 645, 9540, 5261, 8318], [6120, 6529, 7186, 5174], [1636, 3980, 9310, 7939, 1346], [9194, 9930, 5052], [7440, 3550, 1738, 3129, 2219, 2390, 6253, 4408, 6641, 2424], [3867, 7186], [7186, 8757], [4779, 5510], [3645, 6976, 5755], [8728, 1635, 2336, 3453, 6142, 3565, 2518, 2676], [3133, 790, 6896, 247, 4983, 696, 2673], [8837, 7358, 6277, 9255, 9760], [3014, 3697, 4933, 1078], [8511, 3067], [5786, 1962, 9184, 4779], [881, 8204, 4408, 3797, 5860, 1345], [3550], [7899, 1645], [3394, 1255], [4779, 3077, 7392], [3129, 1635, 6142, 1378, 4992], [7100], [3230, 4906, 421, 3116, 1452, 6964, 9189], [7293, 6409, 1714, 6085], [9182, 9661, 7186, 8856, 6822, 8321, 8003, 7320], [8704, 2670, 1238, 3394, 9218, 673, 3394, 3058, 7432, 3119, 8073, 3452, 3550, 7979, 9428], [8938, 9648, 7660], [3077, 739, 3550, 9279, 3867, 3416, 4321, 8968], [3550, 4915, 6987], [6224, 881, 62, 3129, 8065], [8113, 3550, 9189, 1544, 881, 222, 2571,

<b>Word Embedding Representation</b>

In [35]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [79]:
a=[]
for i in range(1000):
    a.append(len(corpus[i].split()))
sent_length=max(a)

In [80]:
embedded_docs=pad_sequences(onehot_rep,padding='pre', maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 6237 9688 8545]
 [   0    0    0 ...    0 6204 9502]
 [   0    0    0 ... 6105 1173 6357]
 ...
 [   0    0    0 ... 6169 2323 2087]
 [   0    0    0 ... 1961 6412 7382]
 [   0    0    0 ... 1795 3122 9114]]


In [61]:
dim=10

In [63]:
model=Sequential()
model.add(Embedding(voc_size, 10, input_length=sent_length))
model.compile('adam','mse')

In [64]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 10)            100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [69]:
result=model.predict(embedded_docs)
print(result)

[[[ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  [ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  [ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  ...
  [ 0.01412339  0.04075604 -0.0042117  ... -0.04114478 -0.01811695
    0.04412165]
  [-0.02252216  0.04499967 -0.04849536 ...  0.0262407   0.04340738
    0.0242669 ]
  [ 0.03841965 -0.04359896 -0.02197182 ... -0.04893493 -0.01107935
   -0.00908631]]

 [[ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  [ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  [ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  ...
  [ 0.00322932 -0.01077297  0.03735543 ...  0.02063513  0.03089878
    0.01727397]
  [-0.01825241  0.01928217  0.03869425 ...  0.03181591  0.03247637
    0.04597796]
  [ 0.03979358  0.03821596  0.02738556 ...  0.01026771  0.00104434
    0.

In [72]:
result[0][0]

array([ 0.00322932, -0.01077297,  0.03735543, -0.0116779 , -0.04939819,
       -0.02824204, -0.04573393,  0.02063513,  0.03089878,  0.01727397],
      dtype=float32)

In [67]:
result.shape

(1000, 20, 10)

## Dependent Veriable

In [67]:
y=dataset.iloc[:,1].values
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [70]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [71]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [72]:
classifier=Sequential()
classifier.add(Dense(output_dim=750, init='uniform', activation='relu', input_dim=1500))
classifier.add(Dense(output_dim=750, init='uniform', activation='relu'))
classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [73]:
classifier.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [74]:
hist = classifier.fit(X_train, y_train, batch_size=10, epochs=20) 
#batch size after no of iteration you want to update the weight

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [75]:
y_pred=classifier.predict(X_test)

In [76]:
y_pred

array([[3.61770391e-04],
       [1.13911033e-02],
       [1.28534466e-01],
       [0.00000000e+00],
       [1.29938126e-05],
       [1.13063246e-01],
       [1.00000000e+00],
       [1.61647797e-04],
       [5.73694706e-04],
       [9.99999762e-01],
       [5.71319163e-01],
       [1.00000000e+00],
       [9.71911669e-01],
       [9.99999225e-01],
       [9.99977410e-01],
       [1.00000000e+00],
       [9.42132354e-01],
       [2.55984068e-03],
       [0.00000000e+00],
       [9.99999881e-01],
       [3.56703997e-04],
       [0.00000000e+00],
       [9.99808252e-01],
       [9.99017119e-01],
       [9.99759614e-01],
       [9.77364898e-01],
       [3.37603688e-02],
       [1.00000000e+00],
       [9.99998569e-01],
       [3.92567456e-01],
       [9.99994040e-01],
       [9.99996960e-01],
       [7.31537879e-01],
       [9.99879062e-01],
       [9.99979377e-01],
       [1.00997090e-03],
       [6.18514419e-03],
       [5.14286757e-03],
       [3.51548195e-04],
       [1.00000000e+00],


In [77]:
rounded=[round(x[0]) for x in y_pred]
rounded

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0

In [78]:
df1 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': rounded})
df1.head(25)

Unnamed: 0,Actual,Predicted
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
5,0,0.0
6,1,1.0
7,0,0.0
8,0,0.0
9,1,1.0


In [79]:
y_test

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1], dtype=int64)

In [80]:
rounded

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0

In [81]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, rounded)

In [82]:
cm

array([[73, 24],
       [23, 80]], dtype=int64)

In [83]:
accuracy=(76+71)/(76+21+32+71)
accuracy

0.735

In [84]:
# Precision = TP / (TP+FP)

In [85]:
precision=(71)/(32+71)
precision

0.6893203883495146

In [52]:
# Recall = TPR = TP / P = TP / (TP+FN)

In [60]:
# text= re.sub('\[[^]]*\]', '', text)
#     # remove stock market tickers like $GE
#     text = re.sub(r'\$\w*', '', text)
#     #removal of html tags
#     review =re.sub(r'<.*?>',' ',text) 
#     # remove old style retweet text "RT"
#     text = re.sub(r'^RT[\s]+', '', text)
#     # remove hyperlinks
#     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
#     # remove hashtags
#     # only removing the hash # sign from the word
#     text = re.sub(r'#', '', text)
#     text = re.sub("["
#                            u"\U0001F600-\U0001F64F"  # removal of emoticons
#                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
#                            "]+",' ',text)
#     text = re.sub('[^a-zA-Z]',' ',text) 