In [1]:
from nltk.corpus import movie_reviews
from sklearn.model_selection import StratifiedShuffleSplit
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [2]:
def get_data():
    dataset = []
    y_labels = []
    for cat in movie_reviews.categories():
        for fileid in movie_reviews.fileids(cat):
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            y_labels.append(cat)
    return dataset, y_labels
            

In [4]:
def get_train_test(input_dataset, ylabels):
    train_size = 0.7
    test_size = 1 - train_size
    stratified_split = StratifiedShuffleSplit(test_size=test_size, n_splits=1, random_state=77)
    
    for train_indx, test_indx in stratified_split.split(input_dataset, ylabels):
        train = [input_dataset[i] for i in train_indx]
        train_y = [ylabels[i] for i in train_indx]
        
        test = [input_dataset[i] for i in test_indx]
        test_y = [ylabels[i] for i in test_indx]
    return train, test, train_y, test_y
        
        

In [6]:
def build_word_features(instance):
    feature_set = {}
    words = instance[0]
    for word in words:
        feature_set[word] = 1
        
    return (feature_set, instance[1])

In [7]:
def build_negate_features(instance):
    words = instance[0]
    final_words = []
    negate = False
    negate_words = ['no', 'not']
    for word in words:
        if negate:
            word = 'Not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])

In [8]:
def remove_stop_words(in_data):
    stopword_list = stopwords.words('english')
    negate_words = ['no', 'not']
    new_stopwords = [word for word in stopword_list if word not in negate_words]
    label = in_data[1]
    words = [word for word in in_data[0] if word not in new_stopwords]
    return (words, label)

In [9]:
def build_keyphrase_features(instance):
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set, instance[1])

In [10]:
def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model

In [11]:
def probe_model(model, features, dataset_type='Train'):
    accuracy = nltk.classify.accuracy(model, features)
    print('\n' + dataset_type + 'Accuracy = %.2f'%(accuracy*100) + '%')
    

In [12]:
def show_features(model, no_features=5):
    print('\nFeature Importance')
    print('='*80 + '\n')
    print(model.show_most_informative_features(no_features))

In [13]:
def build_model_cycle_1(train_data, dev_data):
    train_features = map(build_word_features, train_data)
    dev_features = map(build_word_features, dev_data)
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model

In [14]:
def build_model_cycle_2(train_data, dev_data):
    train_features = map(build_negate_features, train_data)
    dev_features = map(build_negate_features, dev_data)
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model

In [15]:
def build_model_cycle_3(train_data, dev_data):
    train_features = map(build_keyphrase_features, train_data)
    dev_features = map(build_keyphrase_features, dev_data)
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model

In [16]:
input_dataset, y_labels = get_data()
train_data, all_test_data, train_y, all_test_y = get_train_test(input_dataset, y_labels)
dev_data, test_data, dev_y, test_y = get_train_test(all_test_data, all_test_y)

print('\nOriginal Data Size = ', len(input_dataset))
print('\nTraining Data Size = ', len(train_data))
print('\nDev Data Size = ', len(dev_data))
print('\nTest Data Size = ', len(test_data))

model_cycle_1 = build_model_cycle_1(train_data, dev_data)
show_features(model_cycle_1)

model_cycle_2 = build_model_cycle_2(train_data, dev_data)
show_features(model_cycle_2)

model_cycle_3 = build_model_cycle_3(train_data, dev_data)
show_features(model_cycle_3)



Original Data Size =  2000

Training Data Size =  1399

Dev Data Size =  420

Test Data Size =  181

TrainAccuracy = 0.00%

DevAccuracy = 0.00%

Feature Importance

Most Informative Features
               stupidity = 1                 neg : pos    =     15.6 : 1.0
                  warned = 1                 neg : pos    =     11.7 : 1.0
             wonderfully = 1                 pos : neg    =     11.5 : 1.0
             outstanding = 1                 pos : neg    =     11.0 : 1.0
               ludicrous = 1                 neg : pos    =     11.0 : 1.0
None

TrainAccuracy = 0.00%

DevAccuracy = 0.00%

Feature Importance

Most Informative Features
               stupidity = 1                 neg : pos    =     15.6 : 1.0
             wonderfully = 1                 pos : neg    =     14.7 : 1.0
               Not_funny = 1                 neg : pos    =     13.0 : 1.0
                  warned = 1                 neg : pos    =     11.7 : 1.0
             outstanding = 1         