In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.metrics import confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [30]:
# Load dataset
dataset = pd.read_csv("datasets/IMDB-Dataset.csv", encoding='latin1')

In [31]:
# Dropping the duplicates in dataset
dataset = dataset.drop_duplicates(subset={"review","sentiment"}, keep='first', inplace=False)

In [32]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
def clean_text(review):
    # Convert to lowercase
    review = review.lower()

    # Remove HTML tags
    review = re.sub('<.*?>', '', review)

    # Remove punctuation
    review = re.sub(r'[^\w\s]', '', review)

    # Remove numbers
    review = re.sub(r'\d+', '', review)

    # Tokenization
    tokens = word_tokenize(review)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove extra whitespace
    text = ' '.join(tokens)

    return review

In [34]:
#stopwords.words('english')

In [35]:
# creating the document corpus
from tqdm import tqdm
corpus = []
for index, row in tqdm(dataset.iterrows()):
    review = clean_text(row['sentiment'])
    corpus.append(review)

100it [00:00, 1356.49it/s]


In [36]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
#Creating a tranform
cv = CountVectorizer(ngram_range=(1,3), max_features = 5000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1].values

In [38]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [39]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [40]:
# Predict the sentiment for new review
def predictNewReview():
    newReview = input("Type the Review: ")

    if newReview =='':
        print('Invalid Review')
    else:
        newReview = clean_text(newReview)
        print(newReview)
        new_review1 = cv.transform([newReview]).toarray()
        print(new_review1)
        prediction =  classifier.predict(new_review1)
        print(prediction)
        if prediction[0] == 1:
            print( "Positive Review" )
        else:
            print( "Negative Review")

In [41]:
predictNewReview()

Type the Review:  yes


yes
[[0 0]]
['negative']
Negative Review
