# Assignment 2: NLP

## Environment Setup

Importing the dataset (needs to be done only once per notebook)

In [0]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [1]:
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [2]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple

In [3]:
# download resources for nltk
# (each call is a no-op when the package is already up to date locally)
nltk.download('punkt')  # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')  # lexical database behind the WordNet lemmatizer
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Tokens dropped during preprocessing: NLTK's English stop words plus a fixed
# list of punctuation, bracket and quote tokens.
punctuation_tokens = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '``', "''", '...']
stop_words = set(stopwords.words('english')) | set(punctuation_tokens)

## Reading the dataset and preprocessing the text

In [5]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching a treebank-style tag.

    The decision is made on the tag's first letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def penn_to_wn(tag):
    """Convert a POS tag to a WordNet POS constant.

    Thin alias: forwards `tag` unchanged to `get_wordnet_pos` and returns
    its result.
    """
    return get_wordnet_pos(tag)

In [39]:
def read_data(data_set_path):
    """Load the reviews stored under `data_set_path` as lists of tokens.

    If the directory already contains the cache file `all.txt` (one review
    per line, tokens separated by single spaces) it is read and returned
    directly. Otherwise every file in the directory is read, stripped of HTML,
    tokenized, POS-tagged, lowercased, filtered (stop words and all-digit
    tokens removed) and lemmatized; the result is then written to `all.txt`
    so later calls can skip the preprocessing.

    Uses the notebook-level names `bs`, `nltk`, `stop_words`, `penn_to_wn`
    and `wnl` (the lemmatizer); they must be defined before the first
    uncached call.

    Args:
        data_set_path: directory holding one review per file.

    Returns:
        A list with one list of token strings per review.
    """
    cache_path = os.path.join(data_set_path, 'all.txt')

    # Check for the exact cache file; a substring match on the directory
    # listing would also be triggered by names such as "small.txt".
    if os.path.isfile(cache_path):
        with open(cache_path, 'r') as cache_file:
            # Strip the line terminator before splitting, otherwise the last
            # token of every review keeps a trailing newline and no longer
            # matches the vocabulary. A blank line (empty review) yields [].
            return [
                [tok for tok in line.rstrip('\n').split(' ') if tok]
                for line in cache_file
            ]

    data = []
    # os.listdir order is unspecified; sort so the review order (and therefore
    # the cache file) is reproducible from run to run.
    for file_name in sorted(os.listdir(data_set_path)):
        file_path = os.path.join(data_set_path, file_name)

        # read raw text (the handle is closed as soon as the text is loaded)
        with open(file_path, 'r') as review_file:
            text = review_file.read()

        # remove html tags
        text = bs(text, "html.parser").get_text()

        # tokenize the text
        tokens = nltk.word_tokenize(text)

        # get the part of speech tags for the tokens; the lemmatizer needs
        # them to pick the right lemma for each word
        tagged_text = nltk.pos_tag(tokens)

        # lowercase, remove stop words and pure-digit tokens, and lemmatize
        tokens = [
            wnl.lemmatize(tok.lower(), penn_to_wn(tag))
            for (tok, tag) in tagged_text
            if tok.lower() not in stop_words and not tok.isdigit()
        ]
        data.append(tokens)

    # Persist the preprocessed reviews, one space-joined review per line.
    with open(cache_path, 'w') as cache_file:
        cache_file.writelines("%s\n" % " ".join(str(x) for x in item) for item in data)

    return data

In [40]:
# Lemmatizer instance; read_data looks `wnl` up as a notebook-level global.
wnl = nltk.WordNetLemmatizer()

# Empty placeholders for the positive / negative training reviews.
train_pos, train_neg = [], []

In [56]:
# Load the training reviews (positives first, then negatives) and report the
# size of each split as it becomes available.
train_pos = read_data('aclImdb/train/pos')
n_train_pos = len(train_pos)
print("read {} positive training reviews".format(n_train_pos))

train_neg = read_data('aclImdb/train/neg')
n_train_neg = len(train_neg)
print("read {} negative training reviews".format(n_train_neg))

# Combined corpus keeps the positives-then-negatives order.
train_data = [*train_pos, *train_neg]
print(len(train_data))

read 12500 positive training reviews
read 12500 negative training reviews
25000


In [57]:
# Load the test reviews (positives first, then negatives) and report the size
# of each split as it becomes available.
test_pos = read_data('aclImdb/test/pos')
n_test_pos = len(test_pos)
print("read {} positive test reviews".format(n_test_pos))

test_neg = read_data('aclImdb/test/neg')
n_test_neg = len(test_neg)
print("read {} negative test reviews".format(n_test_neg))

# Combined corpus keeps the positives-then-negatives order.
test_data = [*test_pos, *test_neg]
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


## TF-IDF

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
# put positive and negative training data in one file
# (the two all.txt inputs are the per-directory caches written by read_data:
#  one preprocessed review per line, tokens separated by spaces; the redirect
#  overwrites all_train.txt on every run)
!cat aclImdb/train/pos/all.txt aclImdb/train/neg/all.txt > all_train.txt

In [83]:
tfidf = TfidfVectorizer()

# Every line of all_train.txt is treated as one document.
with open('all_train.txt', 'r') as all_train_file:
    train_reviews = all_train_file.readlines()

# Fit the vocabulary / IDF weights and keep the resulting document-term
# matrix bound to `doc_matrix` so later cells can inspect it.
doc_matrix = tfidf.fit_transform(train_reviews)
doc_matrix

<25000x67109 sparse matrix of type '<class 'numpy.float64'>'
	with 2408630 stored elements in Compressed Sparse Row format>

In [84]:
# `doc_matrix` must hold the sparse matrix returned by tfidf.fit_transform(...);
# make sure that result is bound to this name before running this cell.
print(doc_matrix.shape)

(25000, 67109)


In [85]:
# Vocabulary terms in column order. `get_feature_names` no longer exists in
# recent scikit-learn releases, so prefer `get_feature_names_out` and fall
# back to the old name only when the newer one is unavailable.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Map each term to its inverse document frequency, using the vectorizer's
# public `idf_` attribute rather than reaching into the private `_tfidf`.
word_idfs = dict(zip(feature_names, tfidf.idf_))

## Word2Vec

In [86]:
from gensim.models import Word2Vec

In [90]:
word_dimensions = 100  # dimensionality of each word embedding

# gensim 4 renamed the `size` keyword to `vector_size`. Try the current name
# first and fall back to the old one so the cell runs on either major version
# (an unknown keyword raises TypeError before any training starts).
try:
    model = Word2Vec(train_data, vector_size=word_dimensions)
except TypeError:
    model = Word2Vec(train_data, size=word_dimensions)

In [91]:

# Look up the trained embedding of a single word as a quick sanity check.
model.wv['cartoon']

array([ 1.1617316 , -1.1810275 ,  1.7089483 , -1.4388052 , -0.9129895 ,
       -0.19307204,  1.4702221 , -0.40685564, -0.18956885, -0.01083623,
       -0.7461476 , -0.08285325, -1.3399551 , -0.7335147 , -1.0785775 ,
        0.37544942, -0.40843806,  0.5222615 ,  0.65392613,  1.4214958 ,
       -0.46880654,  1.6329281 ,  0.30859506, -0.61773735, -0.41692367,
       -0.42304382, -0.8085791 , -0.3581976 , -0.4381851 ,  1.4749767 ,
       -1.1013819 , -1.0666571 , -0.89562565,  0.70554197,  0.42756954,
        0.99748874, -0.8049225 ,  0.10262369, -0.7526793 , -0.33935434,
       -0.5148289 , -0.0940445 ,  0.31421944, -0.02793522,  0.5785908 ,
       -0.34037593,  0.33859348, -0.8858172 ,  0.11720247,  0.34867325,
        0.03178314, -0.07964987, -0.24286842, -0.03939491,  0.01047644,
       -0.99116737,  1.3539529 , -0.03405824, -0.41418836, -0.25336492,
       -0.40225163, -0.0407508 , -0.03256291, -0.2230316 , -0.6170128 ,
       -0.34822702,  0.37247574,  0.26393715, -1.3389213 , -1.45

In [115]:
def get_doc_vecs_for_data(data):
    """Build one IDF-weighted embedding per review.

    For every review the word vectors of its tokens are each scaled by the
    token's IDF and summed (no normalisation). Tokens missing from either
    `model.wv` or `word_idfs` are skipped; repeated tokens contribute once per
    occurrence. A review with no usable token yields an all-zero row.

    Reads the notebook-level names `word_dimensions`, `model` and `word_idfs`.

    Args:
        data: sequence of reviews, each a sequence of token strings.

    Returns:
        Array of shape (len(data), word_dimensions), one row per review.
    """
    n_reviews = len(data)
    result = np.empty([n_reviews, word_dimensions])

    for row, tokens in enumerate(data):
        # Only tokens known to both the embedding model and the IDF table count.
        usable = (tok for tok in tokens if tok in model.wv and tok in word_idfs)

        accumulator = np.zeros([1, word_dimensions])
        for tok in usable:
            accumulator += word_idfs[tok] * model.wv[tok]

        result[row] = accumulator

    return result

In [116]:
train_doc_vecs = get_doc_vecs_for_data(train_data)
test_doc_vecs = get_doc_vecs_for_data(test_data)

## Classifying using TF-IDF weighted embeddings

In [117]:
from sklearn.linear_model import LogisticRegression

# Labels follow the order in which train_data / test_data were concatenated:
# positive reviews (1) first, then negative reviews (0). Deriving the counts
# from the loaded splits keeps the labels aligned if the dataset size changes.
train_labels = [1] * len(train_pos) + [0] * len(train_neg)
test_labels = [1] * len(test_pos) + [0] * len(test_neg)

clf = LogisticRegression()
clf.fit(train_doc_vecs, train_labels)

# Mean accuracy on the held-out test reviews.
clf.score(test_doc_vecs, test_labels)

0.809