# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score,KFold
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

## Importing the dataset

In [26]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

## Cleaning the texts

In [4]:
!pip install nltk

Collecting nltk

You should consider upgrading via the 'c:\users\test\appdata\local\programs\python\python38-32\python.exe -m pip install --upgrade pip' command.



  Downloading nltk-3.5.zip (1.4 MB)
Collecting click
  Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting regex
  Downloading regex-2020.7.14-cp38-cp38-win32.whl (248 kB)
Collecting tqdm
  Downloading tqdm-4.48.2-py2.py3-none-any.whl (68 kB)
Using legacy setup.py install for nltk, since package 'wheel' is not installed.
Installing collected packages: click, regex, tqdm, nltk
    Running setup.py install for nltk: started
    Running setup.py install for nltk: finished with status 'done'
Successfully installed click-7.1.2 nltk-3.5 regex-2020.7.14 tqdm-4.48.2


In [28]:
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(len(df)):
    #Text Normalisation
    review = re.sub('[^A-Za-z]',' ',df['Review'][i])
    
    #Text Tokenisation
    review = review.lower().split()
    
    #stopwords
    allstopwords = stopwords.words('english')
    allstopwords.remove('not')
    
    #Stemming
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(allstopwords)]
    review = ' '.join(review)
    
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TEST\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

## Splitting the dataset into the Training set and Test set

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the model on the Training set

In [31]:
models = []
models.append(('KNN',KNeighborsClassifier(n_neighbors=9)))
models.append(('Lreg',LogisticRegression()))
models.append(('SVM',SVC(kernel='linear')))
models.append(('KVM',SVC(kernel='rbf')))
models.append(('GNB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier(criterion='entropy')))
models.append(('RFC',RandomForestClassifier(n_estimators=11,criterion='entropy')))

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(f'{name}\n Accuracy: {accuracy_score(y_test,y_pred)}\n\n Confusion Matrix:-\n{confusion_matrix(y_test,y_pred)}\nClassification Report:-\n{classification_report(y_test,y_pred)}\n')
    

KNN
 Accuracy: 0.64

 Confusion Matrix:-
[[73 24]
 [48 55]]
Classification Report:-
              precision    recall  f1-score   support

           0       0.60      0.75      0.67        97
           1       0.70      0.53      0.60       103

    accuracy                           0.64       200
   macro avg       0.65      0.64      0.64       200
weighted avg       0.65      0.64      0.64       200


Lreg
 Accuracy: 0.78

 Confusion Matrix:-
[[80 17]
 [27 76]]
Classification Report:-
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        97
           1       0.82      0.74      0.78       103

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.78      0.78      0.78       200


SVM
 Accuracy: 0.79

 Confusion Matrix:-
[[79 18]
 [24 79]]
Classification Report:-
              precision    recall  f1-score   support

           0       0.77      0.81   

In [32]:
classifier = SVC(kernel='linear')
classifier.fit(X_train,y_train)

SVC(kernel='linear')

In [39]:
def sentiment_analysis(sentence):
    new_review = sentence
    new_review = re.sub('[^a-zA-Z]', ' ', new_review)
    new_review = new_review.lower().split()
    
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    
    ps = PorterStemmer()
    new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
    new_review = ' '.join(new_review)
    new_corpus = [new_review]
    
    new_X_test = cv.transform(new_corpus).toarray()
    new_y_pred = classifier.predict(new_X_test)
    
    if new_y_pred in [1]:
        return 'Positive'
    elif new_y_pred in [0]:
        return 'Negative'
    else:
        return 'Neutral'

In [40]:
sentiment_analysis('i love you')

'Positive'

In [41]:
valid = df.sample(30)
valid

Unnamed: 0,Review,Liked
806,I could eat their bruschetta all day it is dev...,1
684,Damn good steak.,1
334,I love this place.,1
759,The service was poor and thats being nice.,0
31,This was like the final blow!,0
974,"Your servers suck, wait, correction, our serve...",0
623,a drive thru means you do not want to wait aro...,0
597,It'll be a regular stop on my trips to Phoenix!,1
887,"After 20 minutes wait, I got a table.",0
550,"I really do recommend this place, you can go w...",1


In [42]:
valid = valid.reset_index()
valid = valid.drop('index',axis=1)
valid

Unnamed: 0,Review,Liked
0,I could eat their bruschetta all day it is dev...,1
1,Damn good steak.,1
2,I love this place.,1
3,The service was poor and thats being nice.,0
4,This was like the final blow!,0
5,"Your servers suck, wait, correction, our serve...",0
6,a drive thru means you do not want to wait aro...,0
7,It'll be a regular stop on my trips to Phoenix!,1
8,"After 20 minutes wait, I got a table.",0
9,"I really do recommend this place, you can go w...",1


In [43]:
data = []

for i in range(len(valid)):
    sentence = valid.Review[i]
    result = sentiment_analysis(sentence)
    label = valid.Liked[i]
    report = [sentence,label,result]
    data.append(report)
    
validation = pd.DataFrame(data,columns=['Text','Label','Predicted Label'])

In [44]:
validation

Unnamed: 0,Text,Label,Predicted Label
0,I could eat their bruschetta all day it is dev...,1,Positive
1,Damn good steak.,1,Positive
2,I love this place.,1,Positive
3,The service was poor and thats being nice.,0,Negative
4,This was like the final blow!,0,Positive
5,"Your servers suck, wait, correction, our serve...",0,Negative
6,a drive thru means you do not want to wait aro...,0,Negative
7,It'll be a regular stop on my trips to Phoenix!,1,Positive
8,"After 20 minutes wait, I got a table.",0,Negative
9,"I really do recommend this place, you can go w...",1,Positive
