# Assignment 2: NLP

## Environment Setup

Downloading and extracting the dataset (needs to be done only once per notebook)

In [0]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [35]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple

In [None]:
# Download the NLTK resources used by the preprocessing below:
# tokenizer models ('punkt', for nltk.word_tokenize), WordNet (for the
# lemmatizer), the stopword lists, and the POS tagger model (for nltk.pos_tag).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Tokens dropped during preprocessing: NLTK's English stopwords plus
# punctuation tokens, quote marks as emitted by the tokenizer, and a few
# extra words / clitics.
stop_words = set(stopwords.words('english')) | {
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    '``', "''", '...',
    'the', 'for', "'s", "'m",
}

## Reading the dataset and preprocessing the text

In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The decision is made on the first letter of ``treebank_tag``:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unknown tag: use noun as the fallback.
    return wordnet.NOUN

def penn_to_wn(tag):
    """Alias for get_wordnet_pos(): map a POS tag to a WordNet POS constant."""
    wn_pos = get_wordnet_pos(tag)
    return wn_pos

In [48]:
def read_data(data_set_path):
    """Read and preprocess every review file in a directory.

    Each file is read as UTF-8 text, stripped of HTML markup, tokenized,
    POS-tagged, and then filtered and lemmatized.

    Parameters
    ----------
    data_set_path : str
        Directory whose entries are the review files (one review per file).

    Returns
    -------
    list of list of str
        One list of lowercased, lemmatized tokens per file, in the order
        returned by ``os.listdir`` (which is arbitrary).

    Notes
    -----
    Relies on the notebook globals ``wnl`` (the lemmatizer) and
    ``stop_words``; both must be defined before this function is called.
    """
    data = []

    for file_name in os.listdir(data_set_path):
        file_path = os.path.join(data_set_path, file_name)

        # Use a context manager so the handle is closed promptly (tens of
        # thousands of files are read), and pin the encoding so the result
        # does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # remove html tags
        text = bs(text, "html.parser").get_text()

        # tokenize the text
        tokens = nltk.word_tokenize(text)

        # Tag each token with its part of speech. penn_to_wn() maps these
        # tags onto the WordNet categories the lemmatizer needs to pick the
        # right lemma for a word.
        tagged_text = nltk.pos_tag(tokens)

        # lowercase, remove stop words and pure-digit tokens, and lemmatize
        tokens = []
        for tok, tag in tagged_text:
            lowered = tok.lower()
            if lowered in stop_words or tok.isdigit():
                continue
            tokens.append(wnl.lemmatize(lowered, penn_to_wn(tag)))
        data.append(tokens)
    return data

In [44]:
# Placeholders for the tokenized training reviews; they are replaced with the
# real data once read_data() is run on the training directories.
train_pos, train_neg = [], []

# Lemmatizer instance that read_data() looks up as a global.
wnl = nltk.WordNetLemmatizer()

In [None]:
# Preprocess the positive training reviews (one token list per review file).
train_pos = read_data('aclImdb/train/pos')
print("read {} positive training reviews".format(len(train_pos)))

# Preprocess the negative training reviews.
train_neg = read_data('aclImdb/train/neg')
print("read {} negative training reviews".format(len(train_neg)))

# Positives first, then negatives: this ordering is what the label lists
# built as [1]*12500 + [0]*12500 further down rely on.
train_data = train_pos + train_neg
print(len(train_data))

In [50]:
# Preprocess the positive test reviews (one token list per review file).
test_pos = read_data('aclImdb/test/pos')
print("read {} positive test reviews".format(len(test_pos)))

# Preprocess the negative test reviews.
test_neg = read_data('aclImdb/test/neg')
print("read {} negative test reviews".format(len(test_neg)))

# Positives first, then negatives: this ordering is what the label lists
# built as [1]*12500 + [0]*12500 further down rely on.
test_data = test_pos + test_neg
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


In [45]:
# Dump every preprocessed review to 'alldata.txt': one review per line,
# tokens separated by single spaces. The order is train_pos, train_neg,
# test_pos, test_neg.
# The `with` block guarantees the file is flushed and closed before any
# external tool reads it; the original left the handle open, so buffered
# data could still be missing from disk.
with open('alldata.txt', 'w', encoding='utf-8') as alldata:
    for reviews in (train_pos, train_neg, test_pos, test_neg):
        alldata.writelines(
            "%s\n" % " ".join(str(x) for x in item) for item in reviews
        )

## Read document vectors generated by Doc2VecC

In [50]:
# Load the document vectors: one vector per line, components separated by
# whitespace.
doc_vecs = []
with open('doc2vec/iclr2017/docvectors.txt') as doc_vecs_file:
    # Iterate the file lazily instead of materialising every line first.
    for line in doc_vecs_file:
        # split() with no argument drops the trailing newline and tolerates
        # repeated spaces or CRLF line endings, which split(" ") did not.
        doc_vecs.append([float(item) for item in line.split()])

# The first 25000 vectors are treated as the training set and the next 25000
# as the test set.
# NOTE(review): presumably the vector file keeps the line order of
# alldata.txt (train_pos, train_neg, test_pos, test_neg) - confirm.
train_doc_vecs = doc_vecs[0:25000]
test_doc_vecs = doc_vecs[25000:50000]

## Classification with logistic regression

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
from sklearn import preprocessing

In [52]:
# Logistic regression on the document vectors.
clf = LogisticRegression()

# The model must be fitted before it is scored in the next cell; with this
# call commented out, scoring only worked thanks to leftover kernel state and
# fails on a fresh "Restart & Run All".
# Labels: the first 12500 training vectors are positive (1), the remaining
# 12500 negative (0).
clf.fit(train_doc_vecs, [1]*12500 + [0]*12500)

In [138]:
# Score the classifier on the test vectors; the label list assumes the first
# 12500 test vectors are positive (1) and the remaining 12500 negative (0).
clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.89164

## Classification with RBF kernel SVM

In [139]:
from sklearn.svm import SVC

In [144]:
# RBF-kernel SVM with C=100, trained on the training vectors
# (first 12500 labelled 1, remaining 12500 labelled 0).
rbf_clf = SVC(kernel='rbf', C=100).fit(train_doc_vecs, [1]*12500 + [0]*12500)

# Score on the test vectors, which use the same label layout.
rbf_clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.8912

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [148]:
# Random forest with 500 trees. random_state is fixed so that the bootstrap
# samples and feature subsets - and therefore the reported score - are
# reproducible across runs.
RF_clf = RandomForestClassifier(n_estimators=500, random_state=0)

# Labels: first 12500 vectors positive (1), remaining 12500 negative (0).
RF_clf.fit(train_doc_vecs, [1]*12500 + [0]*12500)
RF_clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.85512

In [150]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators, trained on the training vectors
# (first 12500 labelled 1, remaining 12500 labelled 0).
ADA_clf = AdaBoostClassifier(n_estimators=100).fit(
    train_doc_vecs, [1]*12500 + [0]*12500
)

# Score on the test vectors, which use the same label layout.
ADA_clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.85804

In [151]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, trained on the training vectors
# (first 12500 labelled 1, remaining 12500 labelled 0).
gnb = GaussianNB().fit(train_doc_vecs, [1]*12500 + [0]*12500)

# Score on the test vectors, which use the same label layout.
gnb.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.80124

In [152]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k=7. The original argument read
# `7TfidfVectorizer` - a stray paste that made the cell a syntax error.
neigh = KNeighborsClassifier(n_neighbors=7)

# Labels: first 12500 vectors positive (1), remaining 12500 negative (0).
neigh.fit(train_doc_vecs, [1]*12500 + [0]*12500)
neigh.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.81304

In [None]:
# TfidfVectorizer was never imported anywhere in the notebook, so this cell
# raised NameError; import it here so the cell runs on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer()