# 1) NLTK

In [1]:
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
sample_text = "Does This thing really work? Lets see."

### Step - 1 tokenize everything

In [3]:
sent_tokenize(sample_text)

['Does This thing really work?', 'Lets see.']

In [4]:
print(sample_text)
words = word_tokenize(sample_text.lower())
## tokenize word and convert all of the word into lower case 
## because "this" is a stop word but "This" is not a stop word
## but it might or might not be a good idea coz let say,
## we have a name "Abhishek" then the algorithm only detemine this 
## word as a name if the first character is capital in "Abhishek"
words

Does This thing really work? Lets see.


['does', 'this', 'thing', 'really', 'work', '?', 'lets', 'see', '.']

### Step - 2 Get rid of stop words

In [5]:
stop = stopwords.words("english")
## add punctuations in stop list to get rid off, from them as well
punctuations = list(string.punctuation)
stop += punctuations
## all the stop are in lower case
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
clean_words = [w for w in words if not w in stop]
clean_words

['thing', 'really', 'work', 'lets', 'see']

# 2) Stemming

In [7]:
from nltk.stem import PorterStemmer

In [8]:
stem_words = ["briefly", "play", "playing", "player", "played", "happier", "happying"]
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in stem_words ]
## it is not very smart but do a descent job
## it has some set of rule which it follows to convert the word into root words
## even not always predict a valid english dictionary word
stemmed_words

['briefli', 'play', 'play', 'player', 'play', 'happier', 'happi']

# 3) Part of Speech

In [9]:
from nltk import pos_tag
from nltk.corpus import state_union

In [10]:
text = state_union.raw("2006-GWBush.txt")
text

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King. (Applause.)\n\nPresident George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I\'m invited to this rostrum, I\'m humbled by the privilege, and mindful of the history we\'ve seen together. We have gathered under this Capitol dome in moments of national mourning and national achievement. We have serv

In [11]:
pos = pos_tag(word_tokenize(text))
print(pos)

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.'), ('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nati

In [12]:
## Some More Examples
pos = pos_tag(word_tokenize( "raj went for a walk."))
print(pos)
pos = pos_tag(word_tokenize("I have been painting since morning."))
print("\n",pos)

[('raj', 'NN'), ('went', 'VBD'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'), ('.', '.')]

 [('I', 'PRP'), ('have', 'VBP'), ('been', 'VBN'), ('painting', 'VBG'), ('since', 'IN'), ('morning', 'NN'), ('.', '.')]


# 4) Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer

In [14]:
lemmatizer = WordNetLemmatizer()

In [15]:
lemmatizer.lemmatize("better" , pos = 'a')

'good'

In [16]:
lemmatizer.lemmatize("painting" , pos = 'n')

'painting'

In [17]:
lemmatizer.lemmatize("painting" , pos = 'v')

'paint'

In [18]:
lemmatizer.lemmatize("excellent" , pos = 'n')

'excellent'

In [19]:
w = "better"
## it takes w as an array and find tag for each individual character
pos_tag(w)

[('b', 'NN'), ('e', 'NN'), ('t', 'NN'), ('t', 'NN'), ('e', 'NN'), ('r', 'NN')]

In [20]:
pos_tag([w])

[('better', 'RBR')]

In [21]:
## to find the pos tag that what we required for the lemmatization
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

# 5) Movie reviews Datasets

In [22]:
from nltk.corpus import movie_reviews
import random
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import nltk
from nltk import NaiveBayesClassifier

### 5-1) Loading.....

In [23]:
movie_reviews.categories()

['neg', 'pos']

In [24]:
print(len(movie_reviews.fileids()))
movie_reviews.fileids()

2000


['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [25]:
movie_reviews.fileids("pos")

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [26]:
print(movie_reviews.fileids()[5])
movie_reviews.words(movie_reviews.fileids()[5])

neg/cv005_29357.txt


['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

### 5-2) Cleaning

In [27]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))


print(type(documents[0]))
print("withoutshuffle\n\n" , documents[0 : 5] , "\n")
random.shuffle(documents)
print("after_shuffle\n\n" , documents[0 : 5])


<class 'tuple'>
withoutshuffle

 [(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'), (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'), (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'), (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'), (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')] 

after_shuffle

 [(['director', 'jan', 'de', 'bont', 'certainly', 'knows', ...], 'pos'), (['this', 'well', '-', 'conceived', 'but', 'ultra', ...], 'neg'), (['if', 'there', "'", 's', 'one', 'thing', 'i', 'just', ...], 'neg'), (['all', 'right', ',', 'all', 'right', ',', 'we', 'get', ...], 'neg'), (['the', 'happy', 'bastard', "'", 's', '30', '-', ...], 'pos')]


In [28]:
lemmatizer = WordNetLemmatizer()

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADV
    else :
        return wordnet.NOUN
## set is to just sort the list and to take a word as feature only a single time
stops = set(stopwords.words('english'))
print(string.punctuation)
punctuations = list(string.punctuation)
stops.update(punctuations)
stops 

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [29]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [30]:
documents = [(clean_review(document), category) for document, category in documents]

In [31]:
documents[0]

(['director',
  'jan',
  'de',
  'bont',
  'certainly',
  'knows',
  'make',
  'top',
  'quality',
  'action',
  'films',
  'need',
  'look',
  'die',
  'hard',
  'director',
  'photography',
  'speed',
  'last',
  'years',
  'twister',
  'examples',
  'hi',
  'octane',
  'edge',
  'seat',
  'thrill',
  'prowess',
  'speed',
  '2',
  'exception',
  'time',
  'action',
  'take',
  'place',
  'huge',
  'cruise',
  'liner',
  'sandra',
  'bullock',
  'jason',
  'patrick',
  'try',
  'develop',
  'relationship',
  'yep',
  'right',
  'keanu',
  'reeves',
  'time',
  'fear',
  'jason',
  'patrick',
  'job',
  'speak',
  'job',
  'hand',
  'stop',
  'completely',
  'bonkers',
  'terrorist',
  'willem',
  'dafoe',
  'crash',
  'liner',
  'huge',
  'island',
  'steal',
  'diamond',
  'collection',
  'reason',
  'act',
  'destruction',
  'revenge',
  'years',
  'work',
  'company',
  'builds',
  'cruise',
  'liner',
  'computers',
  'contracts',
  'copper',
  'poison',
  'retire',
  'naturally'

### 5-3) Split the data

In [32]:
training_documents = documents[0 : 1500]
testing_documents = documents[1500 : ]

### 5-4) Building Features Set

In [33]:
all_words = []
for doc in training_documents:
    all_words += doc[0]
freq = nltk.FreqDist(all_words)
print(type(freq)) ## Frequency distribution objects
common = freq.most_common(3000)
features = [i[0] for i in common]
features

<class 'nltk.probability.FreqDist'>


['film',
 'one',
 'movie',
 'make',
 'like',
 'get',
 'go',
 'see',
 'even',
 'time',
 'good',
 'take',
 'would',
 'story',
 'much',
 'character',
 'bad',
 'come',
 'well',
 'also',
 'give',
 'characters',
 'two',
 'first',
 'seem',
 '--',
 'way',
 'plot',
 'say',
 'end',
 'know',
 'life',
 'really',
 'films',
 'little',
 'look',
 'could',
 'people',
 'man',
 'scene',
 'never',
 'work',
 'love',
 'great',
 'best',
 'new',
 'big',
 'scenes',
 'many',
 'u',
 'watch',
 'director',
 'want',
 'movies',
 'action',
 'another',
 'act',
 'show',
 'something',
 'back',
 'think',
 'world',
 'however',
 'still',
 'try',
 'use',
 'old',
 'though',
 'star',
 'every',
 'better',
 'real',
 'audience',
 'year',
 'enough',
 'last',
 'around',
 'young',
 'interest',
 'cast',
 'write',
 'performance',
 'role',
 'find',
 'years',
 'things',
 'actually',
 'john',
 'may',
 'funny',
 'script',
 'long',
 'play',
 'almost',
 'comedy',
 'thing',
 'played',
 'fact',
 'ever',
 'set',
 'although',
 'right',
 'scree

In [34]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [35]:
output = get_feature_dict(training_documents[0][0])
output

{'film': True,
 'one': True,
 'movie': True,
 'make': True,
 'like': False,
 'get': False,
 'go': True,
 'see': False,
 'even': False,
 'time': True,
 'good': True,
 'take': True,
 'would': False,
 'story': False,
 'much': False,
 'character': False,
 'bad': False,
 'come': False,
 'well': False,
 'also': True,
 'give': False,
 'characters': False,
 'two': False,
 'first': False,
 'seem': False,
 '--': False,
 'way': True,
 'plot': False,
 'say': False,
 'end': False,
 'know': False,
 'life': False,
 'really': False,
 'films': True,
 'little': False,
 'look': True,
 'could': False,
 'people': False,
 'man': False,
 'scene': False,
 'never': False,
 'work': True,
 'love': False,
 'great': True,
 'best': True,
 'new': False,
 'big': False,
 'scenes': False,
 'many': True,
 'u': False,
 'watch': False,
 'director': True,
 'want': False,
 'movies': False,
 'action': True,
 'another': False,
 'act': True,
 'show': False,
 'something': False,
 'back': True,
 'think': True,
 'world': False,
 

In [36]:
training_data = 
    [(get_feature_dict(doc), category) for doc, category in training_documents]
testing_data = 
    [(get_feature_dict(doc), category) for doc, category in training_documents]

In [37]:
training_data[0]

({'film': True,
  'one': True,
  'movie': True,
  'make': True,
  'like': False,
  'get': False,
  'go': True,
  'see': False,
  'even': False,
  'time': True,
  'good': True,
  'take': True,
  'would': False,
  'story': False,
  'much': False,
  'character': False,
  'bad': False,
  'come': False,
  'well': False,
  'also': True,
  'give': False,
  'characters': False,
  'two': False,
  'first': False,
  'seem': False,
  '--': False,
  'way': True,
  'plot': False,
  'say': False,
  'end': False,
  'know': False,
  'life': False,
  'really': False,
  'films': True,
  'little': False,
  'look': True,
  'could': False,
  'people': False,
  'man': False,
  'scene': False,
  'never': False,
  'work': True,
  'love': False,
  'great': True,
  'best': True,
  'new': False,
  'big': False,
  'scenes': False,
  'many': True,
  'u': False,
  'watch': False,
  'director': True,
  'want': False,
  'movies': False,
  'action': True,
  'another': False,
  'act': True,
  'show': False,
  'something

### 5-5) Classification using NLTK Naive Bayes

In [38]:
clf = NaiveBayesClassifier.train(training_data)

In [39]:
nltk.classify.accuracy(clf, testing_data)

0.888

In [40]:
clf.show_most_informative_features(15)

Most Informative Features
                  seagal = True              neg : pos    =     10.9 : 1.0
              schumacher = True              neg : pos    =      9.6 : 1.0
               ludicrous = True              neg : pos    =      8.9 : 1.0
                  dillon = True              pos : neg    =      8.4 : 1.0
                   jolie = True              neg : pos    =      8.2 : 1.0
             outstanding = True              pos : neg    =      8.0 : 1.0
                  turkey = True              neg : pos    =      7.2 : 1.0
              delightful = True              pos : neg    =      7.1 : 1.0
                   inept = True              neg : pos    =      6.9 : 1.0
                 refresh = True              pos : neg    =      6.4 : 1.0
                  prinze = True              neg : pos    =      6.1 : 1.0
                 freddie = True              neg : pos    =      6.1 : 1.0
                 idiotic = True              neg : pos    =      6.0 : 1.0

### 5-6) Using Sklearn Classifier within NLTK

In [41]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [42]:
## SVC
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)
classifier_sklearn.train(training_data)
nltk.classify.accuracy(classifier_sklearn , testing_data)

0.9946666666666667

In [43]:
## Random Forest
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)
classifier_sklearn1.train(training_data)
nltk.classify.accuracy(classifier_sklearn1 , testing_data)

1.0

### 5-7) Count Vectorizer ( convert the data into the format that sklearn required)

#### a) On Raw data

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
## There is an option in count vectorizer(stop_words) 
## which takes list of stop words and can do the work for us.

In [45]:
train_set = {"the sky sky is blue" , "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
print(a.todense())
a

[[1 0 1]
 [1 2 1]]


<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [46]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [47]:
a = ["pagal" , "and", "darpokk"]
" ".join(a)

'pagal and darpokk'

#### b) on movie datasets

In [48]:
documents[0]

(['director',
  'jan',
  'de',
  'bont',
  'certainly',
  'knows',
  'make',
  'top',
  'quality',
  'action',
  'films',
  'need',
  'look',
  'die',
  'hard',
  'director',
  'photography',
  'speed',
  'last',
  'years',
  'twister',
  'examples',
  'hi',
  'octane',
  'edge',
  'seat',
  'thrill',
  'prowess',
  'speed',
  '2',
  'exception',
  'time',
  'action',
  'take',
  'place',
  'huge',
  'cruise',
  'liner',
  'sandra',
  'bullock',
  'jason',
  'patrick',
  'try',
  'develop',
  'relationship',
  'yep',
  'right',
  'keanu',
  'reeves',
  'time',
  'fear',
  'jason',
  'patrick',
  'job',
  'speak',
  'job',
  'hand',
  'stop',
  'completely',
  'bonkers',
  'terrorist',
  'willem',
  'dafoe',
  'crash',
  'liner',
  'huge',
  'island',
  'steal',
  'diamond',
  'collection',
  'reason',
  'act',
  'destruction',
  'revenge',
  'years',
  'work',
  'company',
  'builds',
  'cruise',
  'liner',
  'computers',
  'contracts',
  'copper',
  'poison',
  'retire',
  'naturally'

In [49]:
categories = [category for document , category in documents]
categories

['pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',


In [50]:
text_documents = [" ".join(document) for document , category in documents]
text_documents

['director jan de bont certainly knows make top quality action films need look die hard director photography speed last years twister examples hi octane edge seat thrill prowess speed 2 exception time action take place huge cruise liner sandra bullock jason patrick try develop relationship yep right keanu reeves time fear jason patrick job speak job hand stop completely bonkers terrorist willem dafoe crash liner huge island steal diamond collection reason act destruction revenge years work company builds cruise liner computers contracts copper poison retire naturally entails next lot run jumping general high quality action jason patrick level best save day think back film help admire structure way willem dafoe initiates plan throws red herrings crew passengers race time save liner crash course finale chase scence although ruin top one many stunt explosion simple construction effective execution result clear concise highly enjoyable action film act adequate although darn obvious sandra 

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [53]:
count_vec = CountVectorizer(max_features = 2000 , ngram_range = (1 , 2))
x_train_feature = count_vec.fit_transform(x_train)
x_train_feature.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [2, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [5, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [54]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '54',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scenes',
 'action sequences',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adams',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adults',
 'adventure',
 'affair',
 'affleck',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aim',
 'air',
 'al',
 'alan',
 'alex',
 'alice',
 'alien',
 'aliens',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angels',
 'angry',
 'animal',
 'animals',
 'animate',
 'animation',
 'annie',
 'annoy',
 'another',
 '

In [55]:
x_test_feature = count_vec.transform(x_test)
x_test_feature

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 83377 stored elements in Compressed Sparse Row format>

### 5-8) Sklearn Classifier

In [56]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_feature , y_train)
svc.score(x_test_feature , y_test)

0.808