# Assignment 2: NLP

## Envirnment Setup

Importing the dataset (needs to be done only once per notebook)

In [1]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [3]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple
from helpers import *
%load_ext autoreload
%autoreload 2

In [4]:
# download resources for nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/amr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/amr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/amr/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '``', "''", '...','the','for',"'s","'m"])

## Reading the dataset and preprocessing the text

In [6]:
train_pos = []
train_neg = []

wnl = nltk.WordNetLemmatizer()

In [7]:
train_pos = read_data('aclImdb/train/pos')
print("read {} positive training reviews".format(len(train_pos)))

train_neg = read_data('aclImdb/train/neg')
print("read {} negative training reviews".format(len(train_neg)))

train_unsup = read_data('aclImdb/train/unsup')
print("read {} unsupervised training reviews".format(len(train_unsup)))

train_data = train_pos + train_neg + train_unsup
print(len(train_data))

read 12500 positive training reviews
read 12500 negative training reviews
read 50000 unsupervised training reviews
75000


In [8]:
test_pos = read_data('aclImdb/test/pos')
print("read {} positive test reviews".format(len(test_pos)))

test_neg = read_data('aclImdb/test/neg')
print("read {} negative test reviews".format(len(test_neg)))

test_data = test_pos + test_neg
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


In [9]:
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')

reviews_pos = [analyzedDocument(review, [i]) for i,review in zip(range(0, 12500), train_pos)]
reviews_neg = [analyzedDocument(review, [i]) for i,review in zip(range(12500, 25000), train_neg)]
reviews_unsup = [analyzedDocument(review, [i]) for i,review in zip(range(25000, 75000), train_unsup)]

reviews = reviews_pos + reviews_neg + reviews_unsup
print(len(reviews))

75000


In [10]:
pos_model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=200, workers=4, negative=20)
neg_model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=200, workers=4, negative=20)
pos_model.build_vocab(reviews)
neg_model.build_vocab(reviews)

In [12]:
for epoch in range(10):
    pos_model.train(reviews, total_examples=pos_model.corpus_count, epochs=pos_model.epochs)
    pos_model.alpha -= 0.002  # decrease the learning rate
    pos_model.min_alpha = pos_model.alpha  # fix the learning rate, no decay
    neg_model.train(reviews, total_examples=neg_model.corpus_count, epochs=neg_model.epochs)
    neg_model.alpha -= 0.002  # decrease the learning rate
    neg_model.min_alpha = neg_model.alpha  # fix the learning rate, no decay

## Classification with logistic regression

In [None]:
#model.docvecs.vectors_docs

In [15]:
test_reviews = np.empty([25000, 600])

for idx, review in enumerate(test_data):
    inferred_vector = np.concatenate((pos_model.infer_vector(review), neg_model.infer_vector(review)), axis=0)
    test_reviews[idx] = inferred_vector

print(test_reviews.shape)

(25000, 600)


In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
from sklearn import preprocessing

In [31]:
clf = LogisticRegression()

train_pos_vec = []
train_neg_vec = []
train_unsup_vec = []

for x in range(0, 12500):
    train_pos_vec.append(np.concatenate((pos_model.docvecs[x], neg_model.docvecs[x]), axis=0))

for x in range(12500, 25000):
    train_neg_vec.append(np.concatenate((pos_model.docvecs[x], neg_model.docvecs[x]), axis=0))
        
clf.fit(train_pos_vec+train_neg_vec, [1]*12500 + [0]*12500)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
clf.score(preprocessing.scale(test_reviews), [1]*12500 + [0]*12500)

0.63051999999999997

In [33]:
from sklearn.svm import SVC

In [None]:
rbf_clf = SVC(kernel='rbf', C = 1)
rbf_clf.fit(np.concatenate((train_pos_vec, train_neg_vec), axis=0), [1]*12500 + [0]*12500)
rbf_clf.score(test_reviews, [1]*12500 + [0]*12500)