In [2]:
from nltk.corpus import movie_reviews

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids('pos')

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
# movie_reviews.fileids()[5]

# Data Cleaning

In [7]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]
        

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [8]:
import random
random.shuffle(documents)
documents[0:5]

[(['"', 'we', 'are', 'grateful', 'that', 'we', 'have', ...], 'neg'),
 (['this', 'has', 'been', 'an', 'extraordinary', 'year', ...], 'pos'),
 (['pulp', 'fiction', ',', 'quentin', 'tarantino', "'", ...], 'pos'),
 (['ironically', ',', 'one', 'of', 'the', 'themes', 'of', ...], 'neg'),
 (['sometimes', 'i', 'find', '19th', 'century', ...], 'pos')]

## Removing StopWords and lemmatizing

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
from nltk import pos_tag


In [12]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [13]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
documents = [(clean_review(document),category) for document,category in documents]

In [15]:
documents[0]

(['grateful',
  'song',
  'grace',
  'chan',
  'comfort',
  'u',
  'star',
  'lee',
  'kang',
  'sheng',
  'yang',
  'kuei',
  'mei',
  'direct',
  'tsai',
  'ming',
  'liang',
  'write',
  'tsai',
  'yang',
  'ping',
  'ying',
  'cinematography',
  'liao',
  'peng',
  'jung',
  'taiwan',
  '24',
  '12',
  '99',
  'millenium',
  'approach',
  'incessant',
  'downpour',
  'batter',
  'unnamed',
  'city',
  'sector',
  'city',
  'seal',
  'quarantine',
  'due',
  'onset',
  'mysterious',
  'virus',
  'virus',
  'cause',
  'people',
  'act',
  'like',
  'insect',
  'crawl',
  'four',
  'hide',
  'bright',
  'light',
  'huddle',
  'damp',
  'corner',
  'water',
  'supply',
  'quarantine',
  'zone',
  'cut',
  'week',
  'time',
  'resident',
  'advise',
  'evacuate',
  'area',
  'asap',
  'premise',
  '--',
  'kafka',
  'way',
  'cronenberg',
  '--',
  'background',
  'hole',
  'could',
  'great',
  'movie',
  'sadly',
  'end',
  'waste',
  'good',
  'idea',
  'quick',
  'sketch',
  'sugges

In [16]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]


In [17]:
all_words=[]
for doc in training_documents:
    all_words+=doc[0]

In [18]:
import nltk

In [19]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]


In [20]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'even',
 'scene',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'come',
 'bad',
 'give',
 'life',
 '--',
 'two',
 'look',
 'end',
 'seem',
 'know',
 'way',
 'year',
 'first',
 'work',
 'thing',
 'plot',
 'play',
 'say',
 'really',
 'show',
 'little',
 'people',
 'could',
 'man',
 'star',
 'love',
 'never',
 'director',
 'best',
 'new',
 'performance',
 'big',
 'great',
 'try',
 'actor',
 'action',
 'many',
 'want',
 'watch',
 'u',
 'role',
 'find',
 'think',
 'another',
 'act',
 'world',
 'something',
 'audience',
 'turn',
 'still',
 'back',
 'day',
 'however',
 'old',
 'set',
 'begin',
 'use',
 'feel',
 'guy',
 'comedy',
 'though',
 'enough',
 'every',
 'cast',
 'real',
 'part',
 'last',
 'interest',
 'around',
 'point',
 'write',
 'fact',
 'may',
 'run',
 'script',
 'lot',
 'place',
 'name',
 'young',
 'long',
 'funny',
 'minute',
 'woman',
 'almost',
 'nothing',
 'actually

In [21]:
def get_feature_dict(words):
    current_features={}
    words_set = set(words)
    
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [22]:
training_data = [(get_feature_dict(doc),category) for doc,category in training_documents]
testing_data = [(get_feature_dict(doc),category) for doc,category in testing_documents]

In [23]:
# get_feature_dict(training_documents[0][0])

In [24]:
training_data[0]



({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': False,
  'see': True,
  'go': False,
  'time': True,
  'well': True,
  'even': True,
  'scene': True,
  'good': True,
  'story': False,
  'take': True,
  'would': False,
  'much': True,
  'also': False,
  'come': False,
  'bad': False,
  'give': False,
  'life': True,
  '--': True,
  'two': False,
  'look': True,
  'end': True,
  'seem': True,
  'know': False,
  'way': True,
  'year': False,
  'first': False,
  'work': False,
  'thing': False,
  'plot': False,
  'play': False,
  'say': True,
  'really': False,
  'show': False,
  'little': False,
  'people': True,
  'could': True,
  'man': True,
  'star': True,
  'love': False,
  'never': False,
  'director': True,
  'best': True,
  'new': False,
  'performance': False,
  'big': False,
  'great': True,
  'try': True,
  'actor': False,
  'action': False,
  'many': False,
  'want': True,
  'watch': True,
  'u': True,
  'role': Fal

In [25]:
from nltk import NaiveBayesClassifier

In [26]:
clf = NaiveBayesClassifier.train(training_data)

In [27]:
nltk.classify.accuracy(clf,testing_data)

0.802

In [28]:
clf.show_most_informative_features(15)

Most Informative Features
               stupidity = True              neg : pos    =     11.3 : 1.0
             outstanding = True              pos : neg    =     10.6 : 1.0
               ludicrous = True              neg : pos    =      9.2 : 1.0
                religion = True              pos : neg    =      8.7 : 1.0
              henstridge = True              neg : pos    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      8.1 : 1.0
                  castle = True              pos : neg    =      7.5 : 1.0
                   jolie = True              neg : pos    =      6.9 : 1.0
              schumacher = True              neg : pos    =      6.5 : 1.0
                 luckily = True              pos : neg    =      6.2 : 1.0
             wonderfully = True              pos : neg    =      6.1 : 1.0
                    lame = True              neg : pos    =      6.0 : 1.0
                   ideal = True              pos : neg    =      6.0 : 1.0

# Using SK Learn Classifier Within NLTK

## SVM Within NLTK

In [66]:
from sklearn.svm import SVC

In [67]:
from nltk.classify.scikitlearn import SklearnClassifier

In [68]:
svc = SVC()
clf_sklearn = SklearnClassifier(svc)

In [69]:
clf_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [70]:
nltk.classify.accuracy(clf_sklearn,testing_data)

0.842

## RandomForest Within NLTK

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [73]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [74]:
nltk.classify.accuracy(classifier_sklearn1,testing_data)

0.804

## Logisitic Regression Within NLTK

In [50]:
from sklearn.linear_model import LogisticRegression

In [75]:
lgc = LogisticRegression()

In [76]:
classifier_nltkLg = SklearnClassifier(lgc) 

In [77]:
classifier_nltkLg.train(training_data)

<SklearnClassifier(LogisticRegression())>

In [79]:
nltk.classify.accuracy(classifier_nltkLg,testing_data)

0.836

# Count Vectorizer 

## Converting the data in format required by Sklearn and using nltk for just data cleaning

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
train_set = {"the sky is blue","the sun is burning"}
count_vec = CountVectorizer(max_features=3)
a = count_vec.fit_transform(train_set)
print(a)     # compressed form
a.todense()  # original form

  (0, 2)	1
  (0, 1)	1
  (0, 0)	1
  (1, 2)	1
  (1, 1)	1


matrix([[1, 1, 1],
        [0, 1, 1]])

In [97]:
count_vec.get_feature_names_out()

array(['blue', 'is', 'the'], dtype=object)

In [111]:
categories  = [category for document,category in documents ]

In [112]:
categories

['neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',


In [113]:
text_documents = [" ".join(document) for document, category in documents]

In [119]:
# text_documents[0]
from sklearn.model_selection import train_test_split


In [120]:
x_train,x_test,y_trin,y_test = train_test_split(text_documents,categories)

In [127]:
count_vec = CountVectorizer(max_features=2000)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 1, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0]])

In [131]:
count_vec.get_feature_names_out()

array(['000', '10', '100', ..., 'york', 'young', 'zero'], dtype=object)

In [132]:
x_test_features = count_vec.transform(x_test)
x_test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 2, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0]])