# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the tab-separated dataset. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the text are read as ordinary characters.
dataset = pd.read_csv('dataset.tsv', sep='\t', quoting=3)

## Cleaning the texts

In [22]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once, outside the loop: they do not
# depend on the individual review, and membership tests against a set are
# constant-time.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is removed from the stop-word list so it survives the filter below
# (presumably because negation carries signal for the classifier).
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the column values directly instead of label-indexing with
# range(len(...)), which only works when the frame has a default 0..n-1 index.
for raw_review in dataset['Review']:
  # Replace every non-letter with a space, lowercase, and tokenise on whitespace.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower().split()
  # Drop stop words and reduce the remaining words to their Porter stems.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /home/aac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Number of cleaned texts; one entry is appended to corpus per row of dataset['Review'].
len(corpus)

1494

In [24]:
# Show the first ten cleaned texts to sanity-check the preprocessing.
print(corpus[:10])

['girl mani local u r virgin r readi fil ur everi sexual need u fil text cute p', 'amaz lj smith fantast author inspir other creat stori poetri show true brillianc daughter dark special qualiti induc feel love adventur time would not surpris peopl respond well novel', 'guess mysteri word lack style write left unabl appreci underli stori result found rather vacuou inauthent much felt compel come write review book other might avoid make mistak offenc patricia', 'custom loyalti offer new nokia mobil txtauction txt word start get ctxt tc p mtmsg', 'big brother alert comput select u k cash voucher call ntt po box cro bt landlin cost ppm mobil vari', 'free game get rayman golf free game arcad st get ur game set repli post save activ press key arcad termsappli', 'urgent costa del sol holiday await collect call toclaim sae tc pobox stockport sk xh cost pm max min', 'best yet daughter dark best yet happen pick mistak librari got hook right away must read time love way ash tri protect jermey sav

## Creating the Bag of Words model

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()  # convert the vectorizer output to a plain array

# Labels are taken from the last column of the dataset.
label_column = dataset.iloc[:, -1]
y = label_column.values

In [26]:
# Sanity check: shape of the feature matrix, then of the label vector.
print(X.shape, y.shape, sep='\n')

(1494, 1500)
(1494,)


## Splitting the dataset into the Training set and Test set

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. random_state pins the shuffle so
# the split (and every metric computed from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [28]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
classifier = GaussianNB().fit(X_train, y_train)
classifier  # display the fitted estimator as the cell output

GaussianNB()

## Predicting the Test set results

In [29]:
# Predict the held-out set and print the predictions (left column) next to
# the true labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]

## Making the Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted labels on the test set.
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)

# Fraction of test samples predicted correctly (shown as the cell output).
accuracy_score(y_true = y_test, y_pred = y_pred)

[[152   1]
 [  5 141]]


0.979933110367893

## Predicting a single new text

In [56]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re

review = "RT-KIng Pro Video Club>> Need help? info@ringtoneking.co.uk or call 08701237397 You must be 16+ Club credits redeemable at www.ringtoneking.co.uk! Enjoy!"

# Make sure the stop-word list is available before it is read below.
nltk.download('stopwords')

# Stemmer and stop-word set are built once; 'not' is removed from the list so
# it is kept in the text, matching the cleaning applied to the training corpus.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Same cleaning steps as for the training corpus: letters only, lowercase,
# whitespace tokenisation, stop-word removal, Porter stemming.
review = re.sub('[^a-zA-Z]', ' ', review)
review = review.lower().split()
review = [ps.stem(word) for word in review if word not in stopword_set]
review = ' '.join(review)

# Encode with the already-fitted vectorizer (transform, not fit) so the
# features line up with the vocabulary the classifier was trained on.
vectors = cv.transform([review]).toarray()

classifier.predict(vectors)

[nltk_data] Downloading package stopwords to /home/aac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array([1])

## Saving the model

In [61]:
import pickle

# Persist the fitted vectorizer together with the classifier: both are needed
# to turn a new text into a prediction.
artifacts = {"cv": cv, "classifier": classifier}
with open('model.pkl', 'w+b') as f:
    pickle.dump(artifacts, f)

## Loading the model and predicting

In [3]:
import pickle
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re

# Make sure the stop-word list is available before it is read below.
nltk.download('stopwords')

# SECURITY: pickle.load can execute arbitrary code from the file. Only load
# a model.pkl that you created yourself or that comes from a trusted source.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)


review = "RT-KIng Pro Video Club>> Need help? info@ringtoneking.co.uk or call 08701237397 You must be 16+ Club credits redeemable at www.ringtoneking.co.uk! Enjoy!"


# Stemmer and stop-word set are built once; 'not' is removed from the list so
# it is kept in the text.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Clean the text: letters only, lowercase, whitespace tokenisation,
# stop-word removal, Porter stemming.
review = re.sub('[^a-zA-Z]', ' ', review)
review = review.lower().split()
review = [ps.stem(word) for word in review if word not in stopword_set]
review = ' '.join(review)

# Encode with the vectorizer stored in the pickle, then classify with the
# classifier stored alongside it.
vectors = model['cv'].transform([review]).toarray()

model['classifier'].predict(vectors)


[nltk_data] Downloading package stopwords to /home/aac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


array([1])