### BASICS OF WORD PROCESSING

In [2]:
sample_text = "The Faculty assigned to my branch for CAO subject is not well practiced. Help me!"

In [3]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [4]:
sent_tokenize(sample_text)

['The Faculty assigned to my branch for CAO subject is not well practiced.',
 'Help me!']

In [5]:
words = word_tokenize(sample_text.lower())
words

['the',
 'faculty',
 'assigned',
 'to',
 'my',
 'branch',
 'for',
 'cao',
 'subject',
 'is',
 'not',
 'well',
 'practiced',
 '.',
 'help',
 'me',
 '!']

In [1]:
from nltk.corpus import stopwords
import string 
stop = stopwords.words('english')
punct = list(string.punctuation)
stop = stop + punct

In [20]:
clean = [w for w in words if w not in stop]
clean

['faculty',
 'assigned',
 'branch',
 'cao',
 'subject',
 'well',
 'practiced',
 'help']

### STEMMING

In [21]:
from nltk.stem import PorterStemmer

In [25]:
stem_words = ["playing" , "player", "played" , "play" , "happying", "briefly"]
ps = PorterStemmer()
words = [ps.stem(w) for w in stem_words]
words

['play', 'player', 'play', 'play', 'happi', 'briefli']

### PART OF SPEECH

In [2]:
from nltk import pos_tag
from nltk.corpus import state_union

In [34]:
text = state_union.raw('2006-GWBush.txt')

In [36]:
pos = pos_tag(word_tokenize(text))
pos

[('PRESIDENT', 'NNP'),
 ('GEORGE', 'NNP'),
 ('W.', 'NNP'),
 ('BUSH', 'NNP'),
 ("'S", 'POS'),
 ('ADDRESS', 'NNP'),
 ('BEFORE', 'IN'),
 ('A', 'NNP'),
 ('JOINT', 'NNP'),
 ('SESSION', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('CONGRESS', 'NNP'),
 ('ON', 'NNP'),
 ('THE', 'NNP'),
 ('STATE', 'NNP'),
 ('OF', 'IN'),
 ('THE', 'NNP'),
 ('UNION', 'NNP'),
 ('January', 'NNP'),
 ('31', 'CD'),
 (',', ','),
 ('2006', 'CD'),
 ('THE', 'NNP'),
 ('PRESIDENT', 'NNP'),
 (':', ':'),
 ('Thank', 'NNP'),
 ('you', 'PRP'),
 ('all', 'DT'),
 ('.', '.'),
 ('Mr.', 'NNP'),
 ('Speaker', 'NNP'),
 (',', ','),
 ('Vice', 'NNP'),
 ('President', 'NNP'),
 ('Cheney', 'NNP'),
 (',', ','),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('Congress', 'NNP'),
 (',', ','),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Supreme', 'NNP'),
 ('Court', 'NNP'),
 ('and', 'CC'),
 ('diplomatic', 'JJ'),
 ('corps', 'NN'),
 (',', ','),
 ('distinguished', 'JJ'),
 ('guests', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('fellow', 'JJ'),
 ('citizens', 'NNS'

In [40]:
pos = pos_tag(word_tokenize("I have been painting since morning."))
pos

[('I', 'PRP'),
 ('have', 'VBP'),
 ('been', 'VBN'),
 ('painting', 'VBG'),
 ('since', 'IN'),
 ('morning', 'NN'),
 ('.', '.')]

### CC coordinating conjunction
### CD cardinal digit
### DT determiner
### EX existential there (like: “there is” … think of it like “there exists”)
### FW foreign word
### IN preposition/subordinating conjunction
### JJ adjective ‘big’
### JJR adjective, comparative ‘bigger’
### JJS adjective, superlative ‘biggest’
### LS list marker 1)
### MD modal could, will
### NN noun, singular ‘desk’
### NNS noun plural ‘desks’
### NNP proper noun, singular ‘Harrison’
### NNPS proper noun, plural ‘Americans’
### PDT predeterminer ‘all the kids’
### POS possessive ending parent’s
### PRP personal pronoun I, he, she
### PRPdollar possessive pronoun my, his, hers
### RB adverb very, silently,
### RBR adverb, comparative better
### RBS adverb, superlative best
### RP particle give up
### TO, to go ‘to’ the store.
### UH interjection, errrrrrrrm
### VB verb, base form take
### VBD verb, past tense took
### VBG verb, gerund/present participle taking
### VBN verb, past participle taken
### VBP verb, sing. present, non-3d take
### VBZ verb, 3rd person sing. present takes
### WDT wh-determiner which
### WP wh-pronoun who, what
### WPdollar possessive wh-pronoun whose
### WRB wh-abverb where, when

### LEMATIZATION

In [3]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [43]:
lemmatizer.lemmatize("good", pos='a')

'good'

In [46]:
lemmatizer.lemmatize("better", pos='a')

'good'

In [13]:
from nltk.corpus import wordnet
def get_simple_pos_tag(tag):
    if tag[0]=='J':
        return wordnet.ADJ
    elif tag[0]=='V':
        return wordnet.VERB
    elif tag[0]=='N':
        return wordnet.NOUN
    elif tag[0]=='R':
        return wordnet.ADV
    else:
        return wordnet.NOUN

### MOVIE REVIEW DATASET

In [6]:
from nltk.corpus import movie_reviews

In [7]:
from nltk.corpus import stopwords
from nltk import pos_tag
import string 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
punct = list(string.punctuation)
stop = stop + punct

In [8]:
print(movie_reviews.categories())
print(len(movie_reviews.fileids()))
print(len(movie_reviews.fileids('neg')))
print(movie_reviews.words(movie_reviews.fileids()[0]))

['neg', 'pos']
2000
1000
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]


In [9]:
documents = []
for category in movie_reviews.categories():
    for doc in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(doc),category))
documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [10]:
import random
random.shuffle(documents)
documents[:5]

[(['ever', 'watch', 'a', 'very', 'young', 'child', 'try', ...], 'neg'),
 (['the', 'scene', 'at', 'the', 'end', 'of', '1989', "'", ...], 'neg'),
 (['at', 'the', 'outset', 'of', 'swordfish', ',', 'john', ...], 'neg'),
 (['synopsis', ':', 'private', 'detective', 'tom', ...], 'pos'),
 (['although', 'i', 'had', 'not', 'been', 'a', 'viewer', ...], 'neg')]

In [11]:
def clean_reviews(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            ps = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos_tag(ps[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
documents = [(clean_reviews(doc),category) for doc,category in documents]

In [15]:
training_documents = documents[:1500] 
testing_documents = documents[1500:]

In [16]:
total_words = []
for doc_words,cat in training_documents:
    total_words+=doc_words

In [19]:
total_words

['ever',
 'watch',
 'young',
 'child',
 'try',
 'tell',
 'joke',
 'beyond',
 'sophistication',
 'full',
 'stop',
 'start',
 'usually',
 'punch',
 'line',
 'ruin',
 'felt',
 'way',
 'watch',
 'drown',
 'mona',
 'skip',
 'stone',
 'across',
 'water',
 'approximate',
 'depth',
 'attempt',
 'ensemble',
 'comedy',
 'slightly',
 'successful',
 'attempt',
 'humor',
 'serve',
 'window',
 'dress',
 'run',
 'gag',
 'town',
 'verplanck',
 'n',
 'test',
 'town',
 'new',
 'yugo',
 'everyone',
 'drive',
 'yugo',
 'differentiate',
 'car',
 'personalize',
 'license',
 'plate',
 'even',
 'police',
 'chief',
 'drive',
 'one',
 'certain',
 'sublimeness',
 'image',
 'light',
 'siren',
 'deck',
 'yugo',
 'skitter',
 'way',
 'town',
 'street',
 'also',
 'run',
 'gag',
 'one',
 'character',
 'jeff',
 'miss',
 'hand',
 'occur',
 'poke',
 'gentle',
 'fun',
 'notion',
 'urban',
 'legend',
 'turn',
 'truth',
 'horrify',
 'legend',
 'sight',
 'gag',
 'pop',
 'well',
 'mona',
 'tombstone',
 'read',
 'demote',
 'in

In [20]:
import nltk
freq = nltk.FreqDist(total_words)
top_feat = freq.most_common(3000)
feature_set = [ft for ft,fr in top_feat]

In [23]:
top_feat

[('film', 8365),
 ('movie', 5211),
 ('one', 4518),
 ('make', 3222),
 ('like', 3004),
 ('character', 2905),
 ('get', 2779),
 ('see', 2333),
 ('go', 2270),
 ('time', 2180),
 ('well', 2138),
 ('scene', 2011),
 ('even', 1951),
 ('good', 1854),
 ('story', 1730),
 ('take', 1635),
 ('would', 1588),
 ('much', 1537),
 ('come', 1474),
 ('two', 1469),
 ('also', 1462),
 ('give', 1456),
 ('life', 1439),
 ('bad', 1437),
 ('way', 1411),
 ('seem', 1411),
 ('look', 1400),
 ('first', 1396),
 ('--', 1385),
 ('end', 1381),
 ('know', 1374),
 ('year', 1315),
 ('work', 1272),
 ('thing', 1239),
 ('plot', 1193),
 ('say', 1186),
 ('really', 1174),
 ('play', 1153),
 ('little', 1138),
 ('show', 1100),
 ('people', 1095),
 ('could', 1094),
 ('man', 1090),
 ('star', 1050),
 ('never', 1037),
 ('try', 1021),
 ('great', 1015),
 ('new', 1014),
 ('performance', 996),
 ('director', 991),
 ('best', 990),
 ('love', 982),
 ('big', 968),
 ('action', 963),
 ('want', 952),
 ('many', 936),
 ('actor', 934),
 ('find', 918),
 ('wat

In [25]:
def get_freq_dict(words):
    word = set(words)
    features={}
    for w in feature_set:
        features[w] = w in word
    return features

In [26]:
training_data = [(get_freq_dict(doc),cat) for doc,cat in training_documents]
testing_data = [(get_freq_dict(doc),cat) for doc,cat in testing_documents]

In [30]:
training_data[:2]

[({'film': True,
   'movie': True,
   'one': True,
   'make': True,
   'like': True,
   'character': True,
   'get': False,
   'see': False,
   'go': False,
   'time': True,
   'well': True,
   'scene': True,
   'even': True,
   'good': False,
   'story': True,
   'take': False,
   'would': False,
   'much': False,
   'come': True,
   'two': False,
   'also': True,
   'give': True,
   'life': False,
   'bad': False,
   'way': True,
   'seem': False,
   'look': False,
   'first': False,
   '--': False,
   'end': False,
   'know': False,
   'year': False,
   'work': False,
   'thing': True,
   'plot': False,
   'say': False,
   'really': False,
   'play': False,
   'little': False,
   'show': False,
   'people': True,
   'could': True,
   'man': False,
   'star': False,
   'never': False,
   'try': True,
   'great': False,
   'new': True,
   'performance': False,
   'director': False,
   'best': False,
   'love': True,
   'big': False,
   'action': False,
   'want': False,
   'many': Fal

In [30]:
from nltk import NaiveBayesClassifier as nbc
clf = nbc.train(training_data)

In [31]:
nltk.classify.accuracy(clf,testing_data)

0.792

In [33]:
clf.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     13.2 : 1.0
               ludicrous = True              neg : pos    =     12.7 : 1.0
                 idiotic = True              neg : pos    =      9.9 : 1.0
                 balance = True              pos : neg    =      6.8 : 1.0
                   anger = True              pos : neg    =      6.3 : 1.0
             wonderfully = True              pos : neg    =      6.0 : 1.0
                   idiot = True              neg : pos    =      5.9 : 1.0
                     era = True              pos : neg    =      5.9 : 1.0
                  seagal = True              neg : pos    =      5.8 : 1.0
              schumacher = True              neg : pos    =      5.8 : 1.0
                   damon = True              pos : neg    =      5.8 : 1.0
                   inept = True              neg : pos    =      5.6 : 1.0
                  turkey = True              neg : pos    =      5.5 : 1.0

### USING SKLEARN CLASSIFIERS WITH NLTK

In [39]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [37]:
svm = SVC()
clf = SklearnClassifier(svm)

In [38]:
clf.train(training_data)
nltk.classify.accuracy(clf,testing_data)



0.756

In [40]:
rfc = RandomForestClassifier()
clf = SklearnClassifier(rfc)

In [41]:
clf.train(training_data)
nltk.classify.accuracy(clf,testing_data)



0.678

### COUNT VECTORIZER

In [31]:
 from sklearn.feature_extraction.text import CountVectorizer

In [32]:
categories = [cat for doc,cat in documents]
text_doc = [" ".join(doc) for doc,cat in documents]

In [34]:
text_doc[:2]

['ever watch young child try tell joke beyond sophistication full stop start usually punch line ruin felt way watch drown mona skip stone across water approximate depth attempt ensemble comedy slightly successful attempt humor serve window dress run gag town verplanck n test town new yugo everyone drive yugo differentiate car personalize license plate even police chief drive one certain sublimeness image light siren deck yugo skitter way town street also run gag one character jeff miss hand occur poke gentle fun notion urban legend turn truth horrify legend sight gag pop well mona tombstone read demote instead devote still core story maintains flatness still pond lynchpin film fall redundant sporadic scene involve mona midler scene scream someone hit someone scream uniqueness modulation explanation meanness anyone street could played role character walk cardboard cutout one define one trait attempt make deeper characterization burly female car mechanic course force lesbianism gag mona 

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
x_train, x_test, y_train, y_test = train_test_split(text_doc,categories)

In [72]:
count_vec = CountVectorizer(max_features = 2000,stop_words=stop,ngram_range=(1,2))
x_train_features = count_vec.fit_transform(x_train,y_train)

In [73]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',


In [74]:
x_test_features = count_vec.transform(x_test)
x_test_features.todense() # todense() is used for displaying the sparse matrix 

matrix([[0, 0, 1, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [75]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [76]:
svc.score(x_test_features,y_test)

0.822