## Using NLTK for text classification - Akash Gupta

This note demonstrates various Python functions and primarily the **NLTK** library to carry out text classification tasks.

In [1]:
import nltk
import re
import pprint

# inputs are converted into feature sets which are then fed to the ML algorithm 
# along with training labels. Then for testing, again inputs are converted into 
# extracted feature sets which are fed to the model to get the labels.

In [2]:
# modeling differences in gender names. Decide what features are important and encode them.

def gender_features(word):
    return {'last letter': word[-1]}
gender_features('shrek')     ## output of this is a feature set

{'last letter': 'k'}

In [3]:
from nltk.corpus import names
import random

names1 = ([(name,'male') for name in names.words('male.txt')] + 
         [(name,'female') for name in names.words('female.txt')])
random.shuffle(names1)

In [4]:
# training a naive bayes classifier

featuresets = [(gender_features(n),g) for (n,g) in names1]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
classifier.classify(gender_features('adam'))

'male'

In [6]:
classifier.classify(gender_features('eve'))

'female'

In [8]:
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [10]:
# compute likelihood ratios. Interpret this as:- if the last letter in a name is 'a',
# then is 32.9 times more likely for the name to be a female name than a male name.

classifier.show_most_informative_features(5)

Most Informative Features
             last letter = 'a'            female : male   =     32.9 : 1.0
             last letter = 'k'              male : female =     31.9 : 1.0
             last letter = 'f'              male : female =     15.4 : 1.0
             last letter = 'p'              male : female =     11.3 : 1.0
             last letter = 'v'              male : female =     11.3 : 1.0


In [11]:
# demonstrating overfitting with a very detailed feature extraction. training not generalizing 
# well to new data refers to the problem of overfitting. Here we are taking the first an last 
# letters of a name as features.

def gender_features2(name):
    features = {}
    features['first letter'] = name[0].lower()
    features['last letter'] = name[-1].lower()
    letters = 'abcdefghijklmnopqrstuvwxyz'
    for l in letters:
        features['count(%s)'% l] = name.lower().count(l)
        features['has(%s)'% l] = (l in name.lower())
    return features

featuresets = [(gender_features2(n),g) for (n,g) in names1]
train_set, test_set = featuresets[500:],featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))

0.77


In [12]:
# for properly evaluating a model we create a development set - divide it into training set
# and dev-test set. training trains the model and dev-test does error analysis. test set is the 
# final evalutation system

train_names = names1[1500:]
devtest_names = names1[500:1500]
test_names = names1[:500]

train_set = [(gender_features(n),g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
test_set = [(gender_features(n),g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,devtest_set))

0.766


In [13]:
# generate list of error classifier makes in devtest

errors = []

for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

# make changes to featureset based on the errors generated

for (tag, guess, name) in sorted(errors):
    print("correct=%-8s guess=%-8s name=%-30s"%(tag,guess,name))

len(errors)

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Aleen                         
correct=female   guess=male     name=Allis                         
correct=female   guess=male     name=Anett                         
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Betteann                      
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Blair                         
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Bridgett                      
correct=female   guess=male     name=Cal        

234

In [27]:
# we see that sometimes last two letters can be indicative of the gender. 'ch' ending tends to 
# be associated with male even though 'h' is associated more with female. Adjust featureset

def gender_features(word):
    return {'suffix 1': word[-1:], 'suffix 2': word[-2:]}

# rebuild the classifier

train_set = [(gender_features(n),g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.763


In [30]:
# categorizing movie reviews as positive or negative

from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)),category) for category
            in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [32]:
# define as a feature as whether a document contains a certain word or not
# we make a list of 2000 most frequent words and then define a feature extractor that
# check whether that document has the word or not

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]

def document_features(document):
    docwords = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)'%word] = (word in docwords)
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

In [34]:
# now use the defined feature extractor to train a classifier that labels movie reviews. Interpret this
# as:- if a review contains the word 'schumac'

featuresets = [(document_features(d),c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))
classifier.show_most_informative_features(5)

0.79
Most Informative Features
    contains(schumacher) = True              neg : pos    =      7.3 : 1.0
        contains(shoddy) = True              neg : pos    =      6.9 : 1.0
 contains(unimaginative) = True              neg : pos    =      6.9 : 1.0
     contains(atrocious) = True              neg : pos    =      6.5 : 1.0
        contains(turkey) = True              neg : pos    =      6.3 : 1.0
