# Bag of words / Naive Bayes

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the tab-separated review file. quoting = 3 corresponds to csv.QUOTE_NONE,
# so quote characters inside the reviews are read as ordinary text.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')  # the stop-word corpus must be downloaded before it can be imported
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces inflected forms to a stem, e.g. loved -> love

# Everything below is identical for every review, so build it once instead of
# once per review (and, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')    # keep negations: they carry the sentiment
all_stopwords.remove("isn't")
stopword_set = set(all_stopwords)

# Build the cleaned corpus: one space-joined string of stemmed, non-stop-word
# tokens per review. Iterating over the column itself (rather than a fixed
# range of 1000) keeps corpus the same length as the dataset whatever its size.
corpus = []
for raw_review in dataset['Review']:
    # '^' inside the brackets negates the class: every non-letter becomes a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    # Drop stop words and stem the remaining words.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(' '.join(review))

In [None]:
# Inspect the cleaned corpus (one preprocessed string per review).
print(corpus)

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer
# Keep only the 1500 most frequent words as features; this drops rare words
# (e.g. names such as "steve").
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary from the corpus and build the dense count matrix:
# one row per review, one column per retained word.
X = cv.fit_transform(corpus).toarray()
# The target is the last column of the dataset.
y = dataset.iloc[:, -1].values

In [None]:
# Number of columns in the feature matrix, i.e. how many words were kept as features.
X.shape[1]

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes classifier on the training split only.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
# Predict labels for the held-out reviews.
y_pred = classifier.predict(X_test)
# Print predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, y_test)))

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix with the true labels passed first and the predictions second.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Not printed explicitly: the value appears only as the cell's output when
# this is the last expression of a notebook cell.
accuracy_score(y_test, y_pred)

# Predicting if a single review is positive or negative

## Positive review

Use our model to predict if the following review:

"I love this restaurant so much"

is positive or negative.

Solution: We just repeat the same text preprocessing process we did before, but this time with a single review.

In [None]:
# Classify a single new review: apply the same cleaning steps used for the
# training corpus, vectorize with the already-fitted CountVectorizer, predict.
new_review = 'I love this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)  # every non-letter becomes a space
new_review = new_review.lower().split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')    # keep negations: they carry the sentiment
# Mirrors the training-time stop-word list. Apostrophes were already replaced
# by spaces above, so this does not change the resulting tokens.
all_stopwords.remove("isn't")
stopword_set = set(all_stopwords)  # built once, not once per word
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary learned from the training corpus.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

The review was correctly predicted as positive by our model.

## Negative review

Use our model to predict if the following review:

"I hate this restaurant so much"

is positive or negative.





Solution: We just repeat the same text preprocessing process we did before, but this time with a single review.

In [None]:
# Classify a single new review: apply the same cleaning steps used for the
# training corpus, vectorize with the already-fitted CountVectorizer, predict.
new_review = 'I hate this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)  # every non-letter becomes a space
new_review = new_review.lower().split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')    # keep negations: they carry the sentiment
# Mirrors the training-time stop-word list. Apostrophes were already replaced
# by spaces above, so this does not change the resulting tokens.
all_stopwords.remove("isn't")
stopword_set = set(all_stopwords)  # built once, not once per word
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary learned from the training corpus.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

The review was correctly predicted as negative by our model.

2. Evaluate the performance of each of these models. Try to beat the accuracy obtained in the tutorial. But remember, accuracy alone is not enough, so you should also look at other performance metrics such as Precision (measuring exactness), Recall (measuring completeness) and the F1 Score (a compromise between Precision and Recall). The formulas for these metrics are given below (TP = # True Positives, TN = # True Negatives, FP = # False Positives, FN = # False Negatives):

Accuracy = (TP + TN) / (TP + TN + FP + FN)

Precision = TP / (TP + FP)

Recall = TP / (TP + FN)

F1 Score = 2 * Precision * Recall / (Precision + Recall)