# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [6]:
dataset = pd.read_csv(r"C:\Users\hp\Downloads\archive\IMDB Dataset.csv")

In [8]:
print("Number of rows: ", dataset.shape[0])
print("Number of columns: ", dataset.shape[1])

Number of rows:  50000
Number of columns:  2


## Cleaning the Text

In [10]:
from bs4 import BeautifulSoup
cleantext = BeautifulSoup(dataset["review"][1], 'lxml').text
cleantext

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [11]:
import re
cleantext = re.sub(r'[^\w\s]', '', cleantext)
cleantext

'A wonderful little production The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece A masterful production about one of the great masters of comedy and his life The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets particularly of their flat with Halliwells murals decorating every surface are terribly well done'

In [21]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords.words('english')
token = cleantext.lower().split()
stopword = set(stopwords.words('english'))
token_list = [ word for word in token if word.lower() not in stopword ]
" ".join(token_list)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer.lemmatize(" ".join(token_list))
dataset.keys()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


Index(['review', 'sentiment'], dtype='object')

In [22]:
from tqdm import tqdm
def data_cleaner(dataset):
    clean_data = []
    for review in tqdm(dataset):
        cleantext = BeautifulSoup(review, "lxml").text
        cleantext = re.sub(r'[^\w\s]', '', cleantext)
        cleantext = [ token for token in cleantext.lower().split() if token not in stopword ]
        cleantext = lemmatizer.lemmatize(" ".join(cleantext))
        clean_data.append(cleantext.strip())
    return clean_data

In [24]:
clean_data = data_cleaner(dataset.review.values)
clean_data[0]

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:12<00:00, 4092.58it/s]


'one reviewers mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle 

## Splitting the dataset into the Training set and Test set

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset, dataset.sentiment, test_size = 0.20, random_state = 0, stratify=dataset.sentiment)

In [30]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
le_test = LabelEncoder()
y_test = le_test.fit_transform(y_test)

In [32]:
clean_data_train_data = data_cleaner(X_train.review.values)
X_train['cleaned_text'] = clean_data_train_data
clean_data_test_data = data_cleaner(X_test.review.values)
X_test['cleaned_text'] = clean_data_test_data

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:09<00:00, 4214.47it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['cleaned_text'] = clean_data_train_data
100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 3918.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['cleaned_text'] = clean_data_test_data


## Creating a bag of words model

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec = vec.fit(X_train.cleaned_text)
train_x_bow = vec.transform(X_train.cleaned_text)
test_x_bow = vec.transform(X_test.cleaned_text)

## Training the Naive Bayes model on the Training set

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
classifier = MultinomialNB()
alpha_ranges = {
    "alpha": [0.001, 0.01, 0.1, 1, 10.0, 100]
}
grid_search = GridSearchCV(classifier, param_grid=alpha_ranges, scoring='accuracy', cv=3, return_train_score=True)
grid_search.fit(train_x_bow, y_train)
grid_search.best_estimator_
classifier = MultinomialNB(alpha=1)
classifier.fit(train_x_bow, y_train)

MultinomialNB(alpha=1)

## Predicting the Test set results

In [40]:
y_pred = classifier.predict(test_x_bow)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [1 1]
 [1 1]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[4387  613]
 [ 774 4226]]


0.8613

In [42]:
from sklearn.metrics import classification_report
print("Accuracy is ", classification_report(y_test, y_pred))

Accuracy is                precision    recall  f1-score   support

           0       0.85      0.88      0.86      5000
           1       0.87      0.85      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



## Testing on Random Data

In [45]:
text = ["I liked the movie. It was great"]
text_vec = vec.transform(text)
classifier.predict(text_vec)

array([1])

In [46]:
le.inverse_transform([1])

array(['positive'], dtype=object)

In [47]:
text = ["Movie was worse"]
text_vec = vec.transform(text)
classifier.predict(text_vec)

array([0])

In [48]:
le.inverse_transform([0])

array(['negative'], dtype=object)