# Downloading the required NLTK data resources

In [None]:
import nltk
# Fetch the NLTK data packages requested by this notebook. Each resource
# name appears exactly once, so no package is requested twice.
for resource in (
    'punkt',
    'stopwords',
    'state_union',
    'averaged_perceptron_tagger',
    'wordnet',
    'movie_reviews',
    'maxent_treebank_pos_tagger',
):
    nltk.download(resource)

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer

In [None]:
import pandas as pd

# Importing Dataset into a pandas dataframe

In [None]:
# Load the reviews CSV; later cells read column 0 as the review text and
# column 1 as the sentiment label. The absolute /content path is presumably
# a Colab upload location — TODO confirm before running elsewhere.
train_document = pd.read_csv("/content/movie.csv")

# Creating a list of stop words

In [None]:
import string
# Tokens to discard during cleaning: NLTK's English stop words followed by
# every character in string.punctuation (each as a one-character string).
punct = list(string.punctuation)
stop_words = stopwords.words('english')
stop_words.extend(punct)

# Making a list of (review, sentiment) tuples

In [None]:
# Pair the first two columns of every row as a plain (review, sentiment)
# tuple. itertuples walks the frame once instead of performing two
# positional .iloc lookups per row inside an index loop.
documents = list(
    train_document.iloc[:, [0, 1]].itertuples(index=False, name=None)
)
documents[0]

('I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.',
 0)

# Converting each review into a list of words

In [None]:
# Tokenize every review text, keeping its sentiment label alongside.
doc = [
    (word_tokenize(review_text), label)
    for review_text, label in documents
]

In [None]:
doc[:5]

In [None]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Map a part-of-speech tag string to a WordNet POS constant.

    The tag is matched by its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    (Presumably the tags are the ones produced by nltk.pos_tag — confirm
    against the caller.)

    Args:
        tag: POS tag string.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

# Importing the lemmatizer and cleaning each list of words by lemmatizing and removing stop words

In [None]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance, shared by every clean_review call.
lemmatizer=WordNetLemmatizer()

In [None]:
from nltk import pos_tag
def clean_review(words):
    """Return the lower-cased lemmas of a review's non-stop-word tokens.

    Args:
        words: List of token strings for one review.

    Returns:
        A list of lower-case lemma strings in the original token order,
        omitting every token whose lower-case form is in ``stop_words``.
    """
    # Build a set once per call so each membership test is O(1).
    excluded = set(stop_words)
    output_words = []
    # Tag the whole token sequence in a single call so the tagger can use
    # the neighbouring tokens; tagging one-word lists discards that context
    # and costs one tagger invocation per token.
    for token, tag in pos_tag(words):
        lowered = token.lower()
        if lowered in excluded:
            continue
        # Lemmatize the lower-cased form: the WordNet lemmatizer matches
        # case-sensitively, so capitalised tokens (e.g. sentence-initial
        # words) would otherwise pass through unlemmatized.
        output_words.append(
            lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag))
        )
    return output_words

In [None]:
# Clean the token list of every review, keeping its label alongside.
docum = [
    (clean_review(tokens), label)
    for tokens, label in doc
]

# Splitting the dataset for training and testing

In [None]:
import random
# Shuffle in place with a dedicated fixed-seed generator so the train/test
# split (and therefore the reported accuracy) is reproducible across runs,
# without touching the global ``random`` module state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(docum)

In [None]:
# Number of cleaned reviews available for the train/test split.
n=len(docum)

In [None]:
n

40000

In [None]:
# Size of the training split: 75% of all cleaned reviews. Derived from
# len(docum) rather than from the previous value of n, so re-running this
# cell does not shrink n a second time.
n = int(.75 * len(docum))

In [None]:
# The first n shuffled reviews train the model; the remainder is held out.
training_documents, testing_documents = docum[:n], docum[n:]

# Collecting all words from the reviews in the training dataset and picking the 5000 most frequently occurring words

In [None]:
# Flatten the token lists of all training reviews into one list
# (duplicates are kept so that frequencies can be counted afterwards).
all_words = []
for tokens, _label in training_documents:
    all_words.extend(tokens)

In [None]:
len(all_words)

3915364

In [None]:
# Count token occurrences over the training reviews and keep the 5000 most
# frequent tokens as the feature vocabulary.
freq = nltk.FreqDist(all_words)
common = freq.most_common(5000)
features = [token for token, _count in common]
features[:10]


['br', "'s", 'movie', 'film', "''", "n't", '``', 'one', 'like', 'make']

In [None]:
len(freq)

117739

# Making a feature dictionary for each review

In [None]:
def get_feature_dict(words, feature_words=None):
    """Build the boolean bag-of-words feature mapping for one review.

    Args:
        words: Iterable of hashable tokens belonging to the review.
        feature_words: Optional iterable of vocabulary entries to test for.
            When omitted, the module-level ``features`` list is used.

    Returns:
        A dict mapping each vocabulary entry to ``True`` when it occurs in
        ``words`` and ``False`` otherwise, in vocabulary order.
    """
    if feature_words is None:
        feature_words = features
    # Build the set once so each vocabulary lookup is O(1).
    present = set(words)
    return {feature: feature in present for feature in feature_words}

In [None]:
# Turn each training review into a (feature dict, label) pair.
training_data = [
    (get_feature_dict(tokens), label)
    for tokens, label in training_documents
]
training_data[:5]

In [None]:
# Turn each held-out review into a (feature dict, label) pair.
testing_data = [
    (get_feature_dict(tokens), label)
    for tokens, label in testing_documents
]

# Model Training and Testing

In [None]:
from nltk import NaiveBayesClassifier
# Fit a Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(training_data)

In [None]:
# Score the trained classifier on the held-out testing_data pairs.
nltk.classify.accuracy(classifier, testing_data)

0.8519