# Assignment 2: NLP

## Envirnment Setup

Importing the dataset (needs to be done only once per notebook)

In [0]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [35]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple

In [None]:
# download resources for nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [34]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '``', "''", '...'])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Reading the dataset and preprocessing the text

In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def penn_to_wn(tag):
    return get_wordnet_pos(tag)

In [48]:
def read_data(data_set_path):
    data = []
    
    for file_name in os.listdir(data_set_path):
        file_path = os.path.join(data_set_path, file_name)
        file = open(file_path, 'r')

        # read raw text
        text = file.read()

        # remove html tags
        text = bs(text, "html.parser").get_text()

        # tokenize the text
        tokens = nltk.word_tokenize(text)

        # get the part of speech tags for the tokens
        # This gives each token a tag from [NOUN, VERB, ADJ, ADV] which are
        # used by the lemmatizer to correctly lemmatize a word
        tagged_text = nltk.pos_tag(tokens)

        # lowercase, remove stop words, and lemmatize
        tokens = [wnl.lemmatize(tok.lower(), penn_to_wn(tag)) for (tok, tag) in tagged_text if tok.lower() not in stop_words and not tok.isdigit()]
        data.append(tokens)
    return data

In [44]:
train_pos = []
train_neg = []

wnl = nltk.WordNetLemmatizer()

In [None]:
train_pos = read_data('aclImdb/train/pos')
print("read {} positive training reviews".format(len(train_pos)))

train_neg = read_data('aclImdb/train/neg')
print("read {} negative training reviews".format(len(train_neg)))

train_data = train_pos + train_neg
print(len(train_data))

In [None]:
test_pos = read_data('aclImdb/test/pos')
print("read {} positive test reviews".format(len(test_pos)))

test_neg = read_data('aclImdb/test/neg')
print("read {} negative test reviews".format(len(test_neg)))

test_data = test_pos + test_neg
print(len(test_data))

In [39]:
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

reviews_pos = [analyzedDocument(review, [1]) for review in train_pos]
reviews_neg = [analyzedDocument(review, [0]) for review in train_neg]

reviews = reviews_pos + reviews_neg
print(len(reviews))

12500
AnalyzedDocument(words=['bromwell', 'high', 'cartoon', 'comedy', 'run', 'time', 'program', 'school', 'life', 'teacher', 'year', 'teaching', 'profession', 'lead', 'believe', 'bromwell', 'high', "'s", 'satire', 'much', 'closer', 'reality', 'teacher', 'scramble', 'survive', 'financially', 'insightful', 'student', 'see', 'right', 'pathetic', 'teacher', 'pomp', 'pettiness', 'whole', 'situation', 'remind', 'school', 'know', 'student', 'saw', 'episode', 'student', 'repeatedly', 'try', 'burn', 'school', 'immediately', 'recall', 'high', 'classic', 'line', 'inspector', "'m", 'sack', 'one', 'teacher', 'student', 'welcome', 'bromwell', 'high', 'expect', 'many', 'adult', 'age', 'think', 'bromwell', 'high', 'far', 'fetch', 'pity', "n't"], tags=[1])


In [38]:

model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=20, workers=4)
model.build_vocab(reviews)

In [40]:
%time model.train(reviews_pos, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 15.3 s, sys: 574 ms, total: 15.8 s
Wall time: 6.39 s


In [42]:
test_review = open('aclImdb/test/pos/0_10.txt').read().split(" ")

In [43]:
model.infer_vector(test_review)

array([-0.14944917,  0.02543032,  0.01426453,  0.11886498,  0.01544726,
       -0.17427683, -0.21819073, -0.19616355, -0.12649302, -0.33937413,
        0.06300315, -0.26877198, -0.30317315, -0.16984855,  0.2297405 ,
       -0.07325692, -0.09865372,  0.1307081 , -0.19872205, -0.11785274,
        0.07041273, -0.09396325,  0.08320303, -0.27827483,  0.01516401,
        0.1107842 ,  0.222814  ,  0.35283846,  0.08384296, -0.47336835,
       -0.31745058,  0.14323752,  0.1522411 ,  0.03053654, -0.07243047,
       -0.2518567 ,  0.26350093,  0.13779737, -0.27833223, -0.29480627,
        0.19427712, -0.11107991, -0.1741766 , -0.03633292, -0.05126939,
       -0.21854027,  0.01630594,  0.07055961, -0.14734994, -0.12947616],
      dtype=float32)