# Assignment 2: NLP

## Envirnment Setup

Importing the dataset (needs to be done only once per notebook)

In [0]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [35]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple

In [None]:
# download resources for nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [34]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '``', "''", '...','the','for',"'s","'m"])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Reading the dataset and preprocessing the text

In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def penn_to_wn(tag):
    return get_wordnet_pos(tag)

In [48]:
def read_data(data_set_path):
    data = []
    
    for file_name in os.listdir(data_set_path):
        file_path = os.path.join(data_set_path, file_name)
        file = open(file_path, 'r')

        # read raw text
        text = file.read()

        # remove html tags
        text = bs(text, "html.parser").get_text()

        # tokenize the text
        tokens = nltk.word_tokenize(text)

        # get the part of speech tags for the tokens
        # This gives each token a tag from [NOUN, VERB, ADJ, ADV] which are
        # used by the lemmatizer to correctly lemmatize a word
        tagged_text = nltk.pos_tag(tokens)

        # lowercase, remove stop words, and lemmatize
        tokens = [wnl.lemmatize(tok.lower(), penn_to_wn(tag)) for (tok, tag) in tagged_text if tok.lower() not in stop_words and not tok.isdigit()]
        data.append(tokens)
    return data

In [44]:
train_pos = []
train_neg = []

wnl = nltk.WordNetLemmatizer()

In [None]:
train_pos = read_data('aclImdb/train/pos')
print("read {} positive training reviews".format(len(train_pos)))

train_neg = read_data('aclImdb/train/neg')
print("read {} negative training reviews".format(len(train_neg)))

train_data = train_pos + train_neg
print(len(train_data))

In [50]:
test_pos = read_data('aclImdb/test/pos')
print("read {} positive test reviews".format(len(test_pos)))

test_neg = read_data('aclImdb/test/neg')
print("read {} negative test reviews".format(len(test_neg)))

test_data = test_pos + test_neg
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


In [74]:
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

reviews_pos = [analyzedDocument(review, [i]) for i,review in zip(range(0, 12500), train_pos)]
reviews_neg = [analyzedDocument(review, [i]) for i,review in zip(range(12500, 25000), train_neg)]

reviews = reviews_pos + reviews_neg
print(len(reviews))

25000


In [121]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=200, workers=4)
model.build_vocab(reviews)

In [122]:
%time model.train(reviews_pos, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 24.7 s, sys: 487 ms, total: 25.2 s
Wall time: 7.45 s


## Classification with logistic regression

In [123]:
#model.docvecs.vectors_docs

In [124]:
test_reviews = np.empty([25000, 300])

for idx, review in enumerate(test_data):
    inferred_vector = model.infer_vector(review)
    test_reviews[idx] = inferred_vector

print(test_reviews.shape)

(25000, 300)


In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
from sklearn import preprocessing

In [127]:
clf = LogisticRegression()

train_pos_vec = []
train_neg_vec = []

for x in range(0, 12500):
    train_pos_vec.append(model.docvecs[x])


for x in range(12500, 25000):
    train_neg_vec.append(model.docvecs[x])

    
clf.fit(train_pos_vec+train_neg_vec, [1]*12500 + [0]*12500)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [128]:
clf.score(preprocessing.scale(test_reviews), [1]*12500 + [0]*12500)

0.6562