# Assignment 2: NLP

## Environment Setup

Downloading and extracting the dataset (needs to be done only once per notebook)

In [0]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
# Sanity check on the extracted dataset: line count of its vocabulary file.
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [35]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple

In [None]:
# Download the NLTK resources used by the preprocessing cells below
# (tokenizer, WordNet lemmatizer data, stop-word list, POS tagger).
# Keeping the names in one tuple avoids copy-pasted calls and keeps the
# cell from echoing a stray `True` as its output.
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

In [34]:
# NLTK's English stop words, extended with punctuation / quote tokens and a
# few contraction fragments so they are filtered out of the reviews as well.
stop_words = set(stopwords.words('english')).union([
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    '``', "''", '...',
    'the', 'for', "'s", "'m",
])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Reading the dataset and preprocessing the text

In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def penn_to_wn(tag):
    """Return the WordNet POS for a Penn Treebank tag (delegates to get_wordnet_pos)."""
    wn_pos = get_wordnet_pos(tag)
    return wn_pos

In [48]:
def read_data(data_set_path):
    """Read every review in a directory and return its preprocessed tokens.

    Each file is read as UTF-8 text, stripped of HTML markup, tokenized and
    POS-tagged; tokens are then lowercased, filtered (stop words and purely
    numeric tokens are dropped) and lemmatized using their POS tag.

    Relies on notebook-level names that must exist when the function is
    called: `stop_words`, `wnl` (the lemmatizer) and `penn_to_wn`.

    Args:
        data_set_path: path of a directory whose entries are review text files.

    Returns:
        A list with one list of tokens per file, ordered by sorted file name.
    """
    data = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and everything derived from it) reproducible.
    for file_name in sorted(os.listdir(data_set_path)):
        file_path = os.path.join(data_set_path, file_name)

        # The context manager closes the handle; the previous version leaked
        # one open file per review. The encoding is explicit so the result
        # does not depend on the platform's default locale.
        with open(file_path, 'r', encoding='utf-8') as review_file:
            raw_text = review_file.read()

        # remove html tags
        text = bs(raw_text, "html.parser").get_text()

        # tokenize the text
        tokens = nltk.word_tokenize(text)

        # POS-tag the tokens; the tag is converted to a WordNet POS so the
        # lemmatizer can pick the right lemma for each word
        tagged_text = nltk.pos_tag(tokens)

        # lowercase, remove stop words and numeric tokens, and lemmatize
        data.append([
            wnl.lemmatize(tok.lower(), penn_to_wn(tag))
            for (tok, tag) in tagged_text
            if tok.lower() not in stop_words and not tok.isdigit()
        ])
    return data

In [44]:
# Empty placeholders; the loading cell below replaces both with real data.
train_pos, train_neg = [], []

# Lemmatizer instance that read_data() looks up by name when it is called.
wnl = nltk.WordNetLemmatizer()

In [None]:
# Positive training reviews.
train_pos = read_data('aclImdb/train/pos')
n_train_pos = len(train_pos)
print("read {} positive training reviews".format(n_train_pos))

# Negative training reviews.
train_neg = read_data('aclImdb/train/neg')
n_train_neg = len(train_neg)
print("read {} negative training reviews".format(n_train_neg))

# Full training corpus: positives first, then negatives.
train_data = train_pos + train_neg
print(len(train_data))

In [50]:
# Positive test reviews.
test_pos = read_data('aclImdb/test/pos')
n_test_pos = len(test_pos)
print("read {} positive test reviews".format(n_test_pos))

# Negative test reviews.
test_neg = read_data('aclImdb/test/neg')
n_test_neg = len(test_neg)
print("read {} negative test reviews".format(n_test_neg))

# Full test corpus: positives first, then negatives.
test_data = test_pos + test_neg
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


In [133]:
# Dump every preprocessed review to alldata.txt, one review per line with
# tokens separated by single spaces (presumably the corpus handed to the
# external Doc2VecC tool -- confirm).
# The corpus order (train_pos, train_neg, test_pos, test_neg) is significant:
# later cells slice the document vectors and build labels assuming it.
# The `with` block flushes and closes the file so it is complete on disk before
# anything else reads it; the encoding is explicit so non-ASCII tokens are
# written the same way on every platform.
with open('alldata.txt', 'w', encoding='utf-8') as alldata:
    for corpus in (train_pos, train_neg, test_pos, test_neg):
        alldata.writelines(
            "%s\n" % " ".join(str(tok) for tok in review) for review in corpus
        )

["bromwell high cartoon comedy run time program school life teacher year teaching profession lead believe bromwell high 's satire much closer reality teacher scramble survive financially insightful student see right pathetic teacher pomp pettiness whole situation remind school know student saw episode student repeatedly try burn school immediately recall high classic line inspector 'm sack one teacher student welcome bromwell high expect many adult age think bromwell high far fetch pity n't\n",
 "homelessness houselessness george carlin state issue year never plan help street consider human everything go school work vote matter people think homeless lose cause worry thing racism war iraq pressure kid succeed technology election inflation worry 'll next end streets.but give bet live street month without luxury home entertainment set bathroom picture wall computer everything treasure see 's like homeless goddard bolt 's lesson.mel brook direct star bolt play rich man everything world dec

## Read document vectors generated by Doc2VecC

In [136]:
doc_vecs = []

# `with` closes the file once it has been read (the previous version left the
# handle open), and iterating the handle avoids loading all lines at once.
with open('doc2vec/iclr2017/docvectors.txt') as doc_vecs_file:
    for line in doc_vecs_file:
        # split() without arguments splits on any run of whitespace and drops
        # the line terminator, so trailing spaces, doubled spaces and "\r\n"
        # endings no longer break float(). Blank lines still yield an empty
        # vector on purpose: skipping them would shift every later document
        # relative to its label.
        doc_vecs.append([float(item) for item in line.split()])

# Rows are expected in the order the corpus was written:
# the first 25000 are training documents, the next 25000 are test documents.
train_doc_vecs = doc_vecs[0:25000]
test_doc_vecs = doc_vecs[25000:50000]

[0.115428, -0.122775, 0.038163, -0.057631, -0.117168, 0.044834, -0.005763, 0.016905, -0.01164, 0.066232, 0.004723, 0.157959, -0.029911, 0.050959, 0.090796, -0.074736, 0.186286, -0.045554, 0.018389, -0.080962, -0.156569, -0.028663, -0.019741, -0.07807, 0.132337, -0.006993, -0.110796, 0.207874, 0.092605, 0.056252, -0.061822, -0.064092, 0.126144, -0.002254, -0.023866, 0.05431, 0.067138, -0.008669, 0.240269, -0.049211, 0.164307, -0.002597, -0.231665, -0.253742, 0.045039, 0.183935, -0.048376, 0.243383, 0.058783, -0.01685, 0.20343, 0.276828, 0.118796, 0.082972, 0.11543, 0.130396, -0.038576, 0.154531, 0.053379, -0.139731, -0.078148, 0.027786, -0.167061, 0.016635, 0.131716, -0.082826, 0.026124, -0.18556, 0.059416, -0.05898, 0.030114, -0.081677, 0.1408, -0.10779, -0.139988, 0.101973, -0.014127, -0.024293, 0.021694, 0.003657, 0.04697, 0.157265, 0.019052, 0.02422, -0.067404, -0.105467, -0.139966, 0.218714, 0.081317, -0.094956, -0.053447, 0.016371, -0.028519, -0.016375, -0.019913, -0.076802, 0.017

## Classification with logistic regression

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
from sklearn import preprocessing

In [137]:
# Labels for the training vectors: 12500 positives (1) followed by
# 12500 negatives (0).
train_labels = [1] * 12500 + [0] * 12500

clf = LogisticRegression()
clf.fit(train_doc_vecs, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [138]:
# Mean accuracy on the test vectors (12500 positives, then 12500 negatives).
test_labels = [1] * 12500 + [0] * 12500
clf.score(test_doc_vecs, test_labels)

0.89164

## Classification with RBF kernel SVM

In [139]:
from sklearn.svm import SVC

In [144]:
# Labels: 12500 positives (1) followed by 12500 negatives (0), for both splits.
svm_train_labels = [1] * 12500 + [0] * 12500
svm_test_labels = [1] * 12500 + [0] * 12500

# RBF-kernel SVM with C = 100.
rbf_clf = SVC(kernel='rbf', C=100)
rbf_clf.fit(train_doc_vecs, svm_train_labels)
rbf_clf.score(test_doc_vecs, svm_test_labels)

0.8912

In [145]:
from sklearn.ensemble import RandomForestClassifier

In [148]:
# Random forest with 500 trees. The forest is a randomized model, so a fixed
# random_state is set to make the score below reproducible across runs
# (a score recorded from an unseeded run may differ slightly from the seeded one).
RF_clf = RandomForestClassifier(n_estimators=500, random_state=0)
RF_clf.fit(train_doc_vecs, [1]*12500 + [0]*12500)
RF_clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.85512

In [150]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 100 estimators. A fixed random_state keeps the fitted ensemble,
# and therefore the reported score, reproducible across runs.
ADA_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
ADA_clf.fit(train_doc_vecs, [1]*12500 + [0]*12500)
ADA_clf.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.85804

In [151]:
from sklearn.naive_bayes import GaussianNB
# Labels: 12500 positives (1) followed by 12500 negatives (0), for both splits.
nb_train_labels = [1] * 12500 + [0] * 12500
nb_test_labels = [1] * 12500 + [0] * 12500

# Gaussian naive Bayes on the document vectors.
gnb = GaussianNB()
gnb.fit(train_doc_vecs, nb_train_labels)
gnb.score(test_doc_vecs, nb_test_labels)

0.80124

In [152]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with k = 7. The argument previously read
# `7TfidfVectorizer` (a stray paste), which is a SyntaxError.
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(train_doc_vecs, [1]*12500 + [0]*12500)
neigh.score(test_doc_vecs, [1]*12500 + [0]*12500)

0.81304

In [None]:
# TfidfVectorizer was never imported anywhere in the notebook, so this cell
# raised NameError; import it here, next to its only use.
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer()