# 1) NLTK

In [23]:
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.corpus import stopwords
import string

In [24]:
sample_text = "Does This thing really work? Lets see."

### Step - 1 tokenize everything

In [25]:
sent_tokenize(sample_text)

['Does This thing really work?', 'Lets see.']

In [26]:
print(sample_text)
words = word_tokenize(sample_text.lower())
## tokenize word and convert all of the word into lower case 
## coz this is a stop word but This is not a stop word
## but it might or might not be a good idea coz let's 
## we have a name Abhishek then the algorithm only detemine this 
## word as a name if the first character is capital in Abhishek
words

Does This thing really work? Lets see.


['does', 'this', 'thing', 'really', 'work', '?', 'lets', 'see', '.']

### Step - 2 Get rid of stop words

In [5]:
stop = stopwords.words("english")
## add punctuations in stop list to get rid from them as well
punctuations = list(string.punctuation)
stop += punctuations
## all the stop are in lower case
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
clean_words = [w for w in words if not w in stop]
clean_words

['thing', 'really', 'work', 'lets', 'see']

# 2) Stemming

In [7]:
from nltk.stem import PorterStemmer

In [8]:
stem_words = ["briefly" , "play" , "playing" , "player" , "played" , "happier" , "happying"]
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in stem_words ]
## it is not very smart but do a descent job
## it has some set of rule which it follows to convert the word into root words
## even not always predict a valid english dictionary word
stemmed_words

['briefli', 'play', 'play', 'player', 'play', 'happier', 'happi']

# 3) Part of Speech

In [9]:
from nltk import pos_tag
from nltk.corpus import state_union

In [10]:
text = state_union.raw("2006-GWBush.txt")
text

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King. (Applause.)\n\nPresident George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I\'m invited to this rostrum, I\'m humbled by the privilege, and mindful of the history we\'ve seen together. We have gathered under this Capitol dome in moments of national mourning and national achievement. We have serv

In [11]:
pos = pos_tag(word_tokenize(text))
pos = pos_tag(word_tokenize( "raj went for a walk."))
print(pos)
pos = pos_tag(word_tokenize("I have been painting since morning."))
print("\n",pos)

[('raj', 'NN'), ('went', 'VBD'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'), ('.', '.')]

 [('I', 'PRP'), ('have', 'VBP'), ('been', 'VBN'), ('painting', 'VBG'), ('since', 'IN'), ('morning', 'NN'), ('.', '.')]


# 4) Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer

In [13]:
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatizer.lemmatize("better" , pos = 'a')

'good'

In [15]:
lemmatizer.lemmatize("painting" , pos = 'n')

'painting'

In [16]:
lemmatizer.lemmatize("painting" , pos = 'v')

'paint'

In [17]:
lemmatizer.lemmatize("excellent" , pos = 'n')

'excellent'

In [18]:
w = "better"
# pos_tag(w) ## it took w as an array and find tag for each individual character
pos_tag([w])

[('better', 'RBR')]

In [19]:
## to find the pos tag that what we required for the lemmatization
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

# 5) Movie reviews Datasets

In [20]:
from nltk.corpus import movie_reviews
import random
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
import nltk
from nltk import NaiveBayesClassifier

### 5-1) Loading.....

In [21]:
movie_reviews.categories()

['neg', 'pos']

In [22]:
print(len(movie_reviews.fileids()))
movie_reviews.fileids()

2000


['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [23]:
movie_reviews.fileids("pos")

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [24]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

### 5-2) Cleaning

In [25]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))


print("withoutshuffle\n\n" , documents[0 : 5] , "\n")
random.shuffle(documents)
print("after_shuffle\n\n" , documents[0 : 5])

withoutshuffle

 [(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'), (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'), (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'), (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'), (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')] 

after_shuffle

 [(['capsule', ':', 'one', 'of', 'the', 'ten', 'worst', ...], 'neg'), (['usually', 'a', 'movie', 'is', 'about', 'something', ...], 'pos'), (['the', 'soldiers', 'of', 'three', 'kings', 'have', ...], 'pos'), (['"', 'through', 'a', 'spyglass', ',', 'i', 'could', ...], 'pos'), (['synopsis', ':', 'lifelong', 'friends', 'rafe', '(', ...], 'neg')]


In [26]:
lemmatizer = WordNetLemmatizer()

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops ## set is to just sort the list and to take a word as feature only a single time

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [27]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words
# documents

In [28]:
documents = [(clean_review(document), category) for document, category in documents]

In [29]:
documents[0]

(['capsule',
  'one',
  'ten',
  'bad',
  'movies',
  'ever',
  'make',
  'christopher',
  'lambert',
  'vs',
  'evil',
  'ninjas',
  'modern',
  'day',
  'japan',
  'nobody',
  'wins',
  'hunt',
  'bad',
  'movie',
  'completely',
  'inept',
  'totally',
  'brain',
  'damage',
  'could',
  'almost',
  'feel',
  'affection',
  'could',
  'see',
  'show',
  'movie',
  'friends',
  'get',
  'good',
  'jolly',
  'guffaw',
  'also',
  'insanely',
  'xenophobic',
  'insult',
  'christopher',
  'lambert',
  'plays',
  'computer',
  'parts',
  'salesman',
  'business',
  'japan',
  'meets',
  'slinky',
  'young',
  'woman',
  'joan',
  'chen',
  'torrid',
  'night',
  'lovemaking',
  '--',
  'manages',
  'witness',
  'death',
  'hands',
  'evil',
  'ninja',
  'clan',
  'leader',
  'john',
  'lone',
  'apparently',
  'unfinished',
  'business',
  'could',
  'conclude',
  'get',
  'slaughter',
  'since',
  'lambert',
  'witness',
  'course',
  'next',
  'one',
  'die',
  'let',
  'stop',
  'thi

### 5-3) Split the data

In [30]:
training_documents = documents[0 : 1500]
testing_documents = documents[1500 : ]

### 5-4) Building Features Set

In [31]:
all_words = []
for doc in training_documents:
    all_words += doc[0]
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]
features

['film',
 'one',
 'movie',
 'make',
 'like',
 'get',
 'go',
 'see',
 'even',
 'time',
 'good',
 'take',
 'story',
 'would',
 'much',
 'also',
 'bad',
 'come',
 'character',
 'two',
 'give',
 'well',
 'seem',
 'characters',
 'first',
 '--',
 'way',
 'end',
 'say',
 'really',
 'plot',
 'know',
 'life',
 'films',
 'little',
 'look',
 'people',
 'could',
 'man',
 'scene',
 'great',
 'work',
 'new',
 'love',
 'best',
 'scenes',
 'never',
 'u',
 'many',
 'director',
 'big',
 'movies',
 'want',
 'action',
 'watch',
 'another',
 'show',
 'something',
 'still',
 'world',
 'act',
 'back',
 'use',
 'try',
 'star',
 'old',
 'think',
 'however',
 'every',
 'real',
 'better',
 'though',
 'cast',
 'around',
 'audience',
 'may',
 'performance',
 'last',
 'role',
 'enough',
 'things',
 'years',
 'year',
 'interest',
 'long',
 'actually',
 'find',
 'write',
 'comedy',
 'young',
 'thing',
 'script',
 'play',
 'funny',
 'nothing',
 'john',
 'played',
 'almost',
 'fact',
 'right',
 'plays',
 'screen',
 'al

In [32]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [33]:
output = get_feature_dict(training_documents[0][0])
output

{'film': False,
 'one': True,
 'movie': True,
 'make': True,
 'like': True,
 'get': True,
 'go': False,
 'see': True,
 'even': True,
 'time': True,
 'good': True,
 'take': False,
 'story': False,
 'would': False,
 'much': False,
 'also': True,
 'bad': True,
 'come': False,
 'character': False,
 'two': True,
 'give': True,
 'well': False,
 'seem': False,
 'characters': True,
 'first': False,
 '--': True,
 'way': True,
 'end': True,
 'say': True,
 'really': False,
 'plot': True,
 'know': True,
 'life': True,
 'films': False,
 'little': True,
 'look': True,
 'people': False,
 'could': True,
 'man': False,
 'scene': True,
 'great': False,
 'work': False,
 'new': False,
 'love': False,
 'best': True,
 'scenes': False,
 'never': False,
 'u': False,
 'many': True,
 'director': False,
 'big': False,
 'movies': True,
 'want': False,
 'action': False,
 'watch': True,
 'another': False,
 'show': True,
 'something': False,
 'still': False,
 'world': False,
 'act': False,
 'back': False,
 'use': Fa

In [34]:
training_data = [(get_feature_dict(doc) , category) for doc , category in training_documents]
testing_data = [(get_feature_dict(doc) , category) for doc , category in training_documents]

In [35]:
training_data[0]

({'film': False,
  'one': True,
  'movie': True,
  'make': True,
  'like': True,
  'get': True,
  'go': False,
  'see': True,
  'even': True,
  'time': True,
  'good': True,
  'take': False,
  'story': False,
  'would': False,
  'much': False,
  'also': True,
  'bad': True,
  'come': False,
  'character': False,
  'two': True,
  'give': True,
  'well': False,
  'seem': False,
  'characters': True,
  'first': False,
  '--': True,
  'way': True,
  'end': True,
  'say': True,
  'really': False,
  'plot': True,
  'know': True,
  'life': True,
  'films': False,
  'little': True,
  'look': True,
  'people': False,
  'could': True,
  'man': False,
  'scene': True,
  'great': False,
  'work': False,
  'new': False,
  'love': False,
  'best': True,
  'scenes': False,
  'never': False,
  'u': False,
  'many': True,
  'director': False,
  'big': False,
  'movies': True,
  'want': False,
  'action': False,
  'watch': True,
  'another': False,
  'show': True,
  'something': False,
  'still': False,

### 5-5) Classification using NLTK Naive Bayes

In [36]:
clf = NaiveBayesClassifier.train(training_data)

In [37]:
nltk.classify.accuracy(clf , testing_data)

0.8706666666666667

In [38]:
clf.show_most_informative_features(15)

Most Informative Features
                  seagal = True              neg : pos    =     11.8 : 1.0
            breathtaking = True              pos : neg    =     11.2 : 1.0
                   inept = True              neg : pos    =     11.1 : 1.0
              schumacher = True              neg : pos    =      9.8 : 1.0
             outstanding = True              pos : neg    =      9.2 : 1.0
                     sat = True              neg : pos    =      7.7 : 1.0
                   mulan = True              pos : neg    =      6.9 : 1.0
                  insult = True              neg : pos    =      6.3 : 1.0
                 idiotic = True              neg : pos    =      6.2 : 1.0
             wonderfully = True              pos : neg    =      6.1 : 1.0
                    coen = True              pos : neg    =      6.1 : 1.0
                   inane = True              neg : pos    =      5.7 : 1.0
                   homer = True              pos : neg    =      5.6 : 1.0

### 5-6) Using Sklearn Classifier within NLTK

In [44]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [51]:
## SVC
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)
classifier_sklearn.train(training_data)
nltk.classify.accuracy(classifier_sklearn , testing_data)



0.8346666666666667

In [50]:
## Random Forest
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)
classifier_sklearn1.train(training_data)
nltk.classify.accuracy(classifier_sklearn1 , testing_data)



0.9926666666666667

### 5-7) Count Vectorizer ( convert the data into the format that sklearn required)

#### a) On Raw data

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
####There is an option in count vectorizer(stop_words) which takes list of stop words and can do the work for us.

In [57]:
train_set = {"the sky sky is blue" , "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
print(a.todense())
a

[[1 2 1]
 [1 0 1]]


<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [65]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [66]:
a = ["abhishek" , "soni"]
" ".join(a)

'abhishek soni'

#### b) on movie datasets

In [63]:
documents[0]

(['capsule',
  'one',
  'ten',
  'bad',
  'movies',
  'ever',
  'make',
  'christopher',
  'lambert',
  'vs',
  'evil',
  'ninjas',
  'modern',
  'day',
  'japan',
  'nobody',
  'wins',
  'hunt',
  'bad',
  'movie',
  'completely',
  'inept',
  'totally',
  'brain',
  'damage',
  'could',
  'almost',
  'feel',
  'affection',
  'could',
  'see',
  'show',
  'movie',
  'friends',
  'get',
  'good',
  'jolly',
  'guffaw',
  'also',
  'insanely',
  'xenophobic',
  'insult',
  'christopher',
  'lambert',
  'plays',
  'computer',
  'parts',
  'salesman',
  'business',
  'japan',
  'meets',
  'slinky',
  'young',
  'woman',
  'joan',
  'chen',
  'torrid',
  'night',
  'lovemaking',
  '--',
  'manages',
  'witness',
  'death',
  'hands',
  'evil',
  'ninja',
  'clan',
  'leader',
  'john',
  'lone',
  'apparently',
  'unfinished',
  'business',
  'could',
  'conclude',
  'get',
  'slaughter',
  'since',
  'lambert',
  'witness',
  'course',
  'next',
  'one',
  'die',
  'let',
  'stop',
  'thi

In [73]:
categories = [category for document , category in documents]
categories

['neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',


In [74]:
text_documents = [" ".join(document) for document , category in documents]
text_documents

['capsule one ten bad movies ever make christopher lambert vs evil ninjas modern day japan nobody wins hunt bad movie completely inept totally brain damage could almost feel affection could see show movie friends get good jolly guffaw also insanely xenophobic insult christopher lambert plays computer parts salesman business japan meets slinky young woman joan chen torrid night lovemaking -- manages witness death hands evil ninja clan leader john lone apparently unfinished business could conclude get slaughter since lambert witness course next one die let stop think second lambert real life chase fanatical devotees ninja secret society lifespan could measure atomic clock movie ninja manage kill everyone except imagine japanese gods smile lambert provide goof field radiates ten feet body know goof field invisible zone anyone intent harm becomes klutz matter real dexterity course begin movie problems lambert eventually finds pseudo safety long haired modern day samurai yoshio harada partn

In [75]:
from sklearn.model_selection import train_test_split

In [87]:
x_train , x_test , y_train , y_test = train_test_split(text_documents , categories)

In [99]:
count_vec = CountVectorizer(max_features = 2000 , ngram_range = (1 , 2))
x_train_feature = count_vec.fit_transform(x_train)
x_train_feature.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 7, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [100]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '54',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action scenes',
 'action sequences',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adults',
 'adventure',
 'affair',
 'affleck',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'air',
 'al',
 'alan',
 'alex',
 'alice',
 'alien',
 'aliens',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'angel',
 'angels',
 'angry',
 'animal',
 'animals',
 'animate',
 'animation',
 'anne',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyo

In [101]:
x_test_feature = count_vec.transform(x_test)
x_test_feature

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 84910 stored elements in Compressed Sparse Row format>

### 5-8) Sklearn Classifier

In [103]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_feature , y_train)
svc.score(x_test_feature , y_test)



0.792