# Natural Language Processing

## Importing the libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset

In [2]:
# Load the tab-separated reviews; quoting=3 (csv.QUOTE_NONE) treats quote characters as plain text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(1000, 2)

## Cleaning the texts

In [4]:
import re # regular expressions
import nltk # natural language toolkit
# Make sure the stop-word corpus used by the cleaning step is available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/MacBook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string per review.
ps = PorterStemmer()  # created once and kept at module level; a later cell reuses `ps`
# Build the stop-word set once instead of once per word.
stop_words = set(stopwords.words('english'))
# NOTE(review): stop-word removal appears to drop negations too (review 1,
# "Crust is not good.", is recorded as 'crust good'), which discards sentiment
# cues — consider keeping words such as "not".

corpus = []
# Iterate over the reviews actually present instead of a hard-coded 1000 rows.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lower-case, then tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [6]:
# Cleaned form of the first review.
corpus[0]

'wow love place'

In [7]:
# Cleaned form of the second review.
corpus[1]

'crust good'

## Creating the Bag of Words model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: term counts per review, with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
counts_sparse = cv.fit_transform(corpus)
X = counts_sparse.toarray()
# Labels come from the column at position 1 of the frame.
y = dataset.iloc[:, 1].values

In [9]:
# (number of reviews, number of vocabulary terms) of the count matrix.
X.shape

(1000, 1500)

## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Classifiers and predictions

In [11]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.3-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-macosx_12_0_arm64.whl (1.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [12]:
from xgboost import XGBClassifier

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [14]:
# Decision tree on the bag-of-words features.
# The seed is fixed because scikit-learn permutes candidate features at each split;
# without it the reported scores can change from run to run.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(X_train, y_train)
yDT = DT.predict(X_test)

In [15]:
# Per-class precision / recall / F1 of the decision tree on the held-out split.
print(classification_report(y_true=y_test, y_pred=yDT))

              precision    recall  f1-score   support

           0       0.64      0.76      0.70        97
           1       0.73      0.60      0.66       103

    accuracy                           0.68       200
   macro avg       0.69      0.68      0.68       200
weighted avg       0.69      0.68      0.68       200


In [16]:
# k-nearest-neighbours with default settings on the bag-of-words features.
KNN = KNeighborsClassifier().fit(X_train, y_train)  # fit() returns the fitted estimator
yKNN = KNN.predict(X_test)

In [17]:
# Per-class precision / recall / F1 of k-nearest-neighbours on the held-out split.
print(classification_report(y_true=y_test, y_pred=yKNN))

              precision    recall  f1-score   support

           0       0.56      0.72      0.63        97
           1       0.64      0.46      0.53       103

    accuracy                           0.58       200
   macro avg       0.60      0.59      0.58       200
weighted avg       0.60      0.58      0.58       200


In [18]:
# Gaussian naive Bayes on the bag-of-words features.
NB = GaussianNB().fit(X_train, y_train)  # fit() returns the fitted estimator
yNB = NB.predict(X_test)

In [19]:
# Per-class precision / recall / F1 of naive Bayes on the held-out split.
print(classification_report(y_true=y_test, y_pred=yNB))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200


In [20]:
# Gradient-boosted trees (XGBoost defaults) on the bag-of-words features.
XGB = XGBClassifier().fit(X_train, y_train)  # fit() returns the fitted estimator
yXGB = XGB.predict(X_test)

In [21]:
# Per-class precision / recall / F1 of XGBoost on the held-out split.
print(classification_report(y_true=y_test, y_pred=yXGB))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75        97
           1       0.81      0.59      0.69       103

    accuracy                           0.72       200
   macro avg       0.74      0.72      0.72       200
weighted avg       0.74      0.72      0.72       200


## TF-IDF vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features over the same cleaned corpus, vocabulary capped at 1500 terms.
tfidf = TfidfVectorizer(max_features=1500)
tfidf_sparse = tfidf.fit_transform(corpus)
X2 = tfidf_sparse.toarray()
# Same labels as before: the column at position 1 of the frame.
y2 = dataset.iloc[:, 1].values

In [23]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as the bag-of-words experiment, applied to the TF-IDF features.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=0,
)

In [24]:
# Decision tree on the TF-IDF features.
# The seed is fixed because scikit-learn permutes candidate features at each split;
# without it the reported scores can change from run to run.
DT2 = DecisionTreeClassifier(random_state=0)
DT2.fit(X2_train, y2_train)
y2DT = DT2.predict(X2_test)

In [25]:
# Per-class precision / recall / F1 of the TF-IDF decision tree on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2DT))

              precision    recall  f1-score   support

           0       0.66      0.75      0.71        97
           1       0.73      0.64      0.68       103

    accuracy                           0.69       200
   macro avg       0.70      0.70      0.69       200
weighted avg       0.70      0.69      0.69       200


In [26]:
# k-nearest-neighbours with default settings on the TF-IDF features.
# NOTE(review): the recorded run of this cell ended in
# "AttributeError: 'NoneType' object has no attribute 'split'". Nothing in this
# cell calls .split, so the failure presumably originates in the installed
# libraries (an outdated threadpoolctl is a known source of this message) —
# TODO confirm, upgrade the environment and re-run from a fresh kernel.
KNN2 = KNeighborsClassifier().fit(X2_train, y2_train)  # fit() returns the fitted estimator
y2KNN = KNN2.predict(X2_test)

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# Per-class precision / recall / F1 of TF-IDF k-nearest-neighbours on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2KNN))

In [None]:
# Gaussian naive Bayes on the TF-IDF features.
NB2 = GaussianNB().fit(X2_train, y2_train)  # fit() returns the fitted estimator
y2NB = NB2.predict(X2_test)

In [None]:
# Per-class precision / recall / F1 of TF-IDF naive Bayes on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2NB))

In [None]:
# Gradient-boosted trees (XGBoost defaults) on the TF-IDF features.
XGB2 = XGBClassifier().fit(X2_train, y2_train)  # fit() returns the fitted estimator
y2XGB = XGB2.predict(X2_test)

In [None]:
# Per-class precision / recall / F1 of TF-IDF XGBoost on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2XGB))

## Example

In [None]:
# Clean one new review with the same steps used to build `corpus`:
# strip non-letters, lower-case, tokenise, drop stop words, stem, re-join.
review = 'I didn\'t like the pizzas'
# Build the stop-word set once rather than once per word.
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', review).lower().split()
# `ps` is the PorterStemmer created in the corpus-cleaning cell.
review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
review

In [None]:
# Reuse the vectorizer already fitted on the corpus: transform() maps the review onto
# the learned vocabulary, so `example` has the same columns the TF-IDF classifiers
# were trained on. Fitting a fresh TfidfVectorizer on this single review would
# overwrite `tfidf` and yield a feature space the trained models cannot consume.
example = tfidf.transform([review]).toarray()

In [None]:
# Show the feature vector computed for the example review.
example

**Exercise**