# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep = '\t', quoting = 3)

## Cleaning the text

In [3]:
import re
import nltk
t = nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# This is line is moved here to prevent creating of many new instances of PorterStreamer object
ps = PorterStemmer()

filtered_stopwords = stopwords.words('english')
filtered_stopwords.remove('not')

corpus = []
for i in range(0, 1000):
    # review = re.sub('[^a-zA-Z]', ' ', dataset.iloc[i][0])
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    # ps = PorterStemmer() - this was moved outside of the loop
    
    # Mine approach
    stemmed_review = []
    for word in review:
        if not word in set(filtered_stopwords):
            word = ps.stem(word)
            stemmed_review.append(word)
    review = ' '.join(stemmed_review)
    
    corpus.append(review)
    
    # Lector's approach - Very bad readability
    # review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    # review = ' '.join(review)
    # print(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oggy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating a Bag of words model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
print(X.shape)

# y = dataset.iloc[:, 1] - I will use the other option
y = dataset['Liked']

(1000, 1500)


## Splitting the dataset

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

## Training a Naive Bayes model on the training set

In [10]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

## Applying the model on the testing set

In [11]:
y_test_pred = classifier.predict(X_test)

## Displaying the confusuion matrix and model statistics

In [12]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, matthews_corrcoef, cohen_kappa_score, roc_auc_score

cm_test = confusion_matrix(y_test, y_test_pred)
cr_test = classification_report(y_test, y_test_pred)
print(cm_test)
print(cr_test)

acc_test = accuracy_score(y_test, y_test_pred)
prec_test = precision_score(y_test, y_test_pred)
ck_test = cohen_kappa_score(y_test, y_test_pred)
mcc_test = matthews_corrcoef(y_test, y_test_pred)
roc_auc_test = roc_auc_score(y_test, y_test_pred)
print(f'Accuracy = {acc_test:.2f}; Precision = {prec_test:.2f}; Kappa = {ck_test:.2f}; MCC = {mcc_test:.2f}; ROC AUC = {roc_auc_test:.2f}')

[[55 42]
 [12 91]]
              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200

Accuracy = 0.73; Precision = 0.68; Kappa = 0.45; MCC = 0.48; ROC AUC = 0.73
