# Assignment 2: NLP

## Environment Setup

Importing the dataset (needs to be done only once per notebook)

In [1]:
# needs to be run only once per notebook
# !wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# !tar -xzf aclImdb_v1.tar.gz

In [2]:
# Count the lines of the dataset's vocabulary file (requires aclImdb/ to be extracted already).
!cat aclImdb/imdb.vocab | wc -l # number of vocab words

89526


## Imports

In [32]:
import nltk # natural language tool kit: for text pre-processing
import os # for listing directories
from bs4 import BeautifulSoup as bs # library for removing html tags from text
import numpy as np # no comment :P
from nltk.corpus import stopwords # a set of common stopwords from nltk
from gensim import models
import gensim
from collections import namedtuple
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import wordnet
from sklearn.model_selection import cross_val_score
from itertools import product
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from pathlib import Path
# import helper functions
from helpers import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Download the nltk resources this notebook relies on.
# The last call's return value is what the cell displays.
nltk.download('punkt')  # presumably needed for tokenization inside helpers -- TODO confirm
nltk.download('wordnet')  # WordNet data (a WordNetLemmatizer is created further down)
nltk.download('stopwords')  # backs stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # presumably used for POS tagging inside helpers -- TODO confirm

[nltk_data] Downloading package punkt to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdelrahman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
# English stop words from nltk, extended with punctuation / quote tokens.
stop_words = set(stopwords.words('english')) | {
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    '``', "''", '...',
}

## Reading the dataset and preprocessing the text

In [6]:
# Empty placeholders for the two review lists; both names are rebound once
# the training reviews are read.
train_pos, train_neg = [], []
# WordNet lemmatizer instance
wnl = nltk.WordNetLemmatizer()

In [7]:
# Read the training reviews of each sentiment class with the helpers function
# read_data and report how many were read.
# NOTE(review): the TF-IDF section later reads aclImdb/train/{pos,neg}/all.txt;
# presumably read_data writes those files as a side effect -- confirm in helpers.
train_pos = read_data('aclImdb/train/pos')
print("read {} positive training reviews".format(len(train_pos)))

train_neg = read_data('aclImdb/train/neg')
print("read {} negative training reviews".format(len(train_neg)))

# positives first, then negatives; the label arrays rely on this order
train_data = train_pos + train_neg
print(len(train_data))

read 12500 positive training reviews
read 12500 negative training reviews
25000


In [8]:
# Read the test reviews of each sentiment class with the helpers function
# read_data and report how many were read.
# NOTE(review): the TF-IDF section later reads aclImdb/test/{pos,neg}/all.txt;
# presumably read_data writes those files as a side effect -- confirm in helpers.
test_pos = read_data('aclImdb/test/pos')
print("read {} positive test reviews".format(len(test_pos)))

test_neg = read_data('aclImdb/test/neg')
print("read {} negative test reviews".format(len(test_neg)))

# positives first, then negatives; the label arrays rely on this order
test_data = test_pos + test_neg
print(len(test_data))

read 12500 positive test reviews
read 12500 negative test reviews
25000


In [9]:
# Binary sentiment labels aligned with train_data / test_data, which are built
# as positives followed by negatives: 1 = positive, 0 = negative.
# The counts come from the review lists themselves instead of a hard-coded
# 12500, so the labels stay in step with whatever was actually read.
train_labels = np.array([1] * len(train_pos) + [0] * len(train_neg))
test_labels = np.array([1] * len(test_pos) + [0] * len(test_neg))

## Set up hyperparameters and classifiers for cross-validation

Here we define the parameters to be tuned for each classifier and create a classifier object for each combination of parameters.

In [23]:
# Each grid maps a constructor argument to the values to try;
# get_clfs_for_combinations (from helpers) turns a grid into a collection of
# classifier objects.

# Logistic regression: regularisation parameter C
logistic_params = {
    'C': [0.1, 0.5, 1, 5, 10],
}
lr_clfs = get_clfs_for_combinations(LogisticRegression, logistic_params)

# Random forest: number of trees
random_forest_params = {
    'n_estimators': [10, 50, 100, 300],
}
rf_clfs = get_clfs_for_combinations(RandomForestClassifier, random_forest_params)

# AdaBoost over a logistic-regression base learner
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` -- confirm against the installed version.
ada_params = {
    'n_estimators': [10, 50, 100],
    'base_estimator': [LogisticRegression(C=5)],
}
ada_clfs = get_clfs_for_combinations(AdaBoostClassifier, ada_params)

# k-nearest neighbours: neighbourhood size
knn_params = {
    'n_neighbors': [5, 11, 17],
}
knn_clfs = get_clfs_for_combinations(KNeighborsClassifier, knn_params)

In [11]:
def print_clf_scores(scores_dict, clf_params):
    """Print one line per classifier with the selected parameter values and its score.

    Args:
        scores_dict: mapping from a classifier object (anything exposing
            ``get_params()``) to its score.
        clf_params: iterable of parameter names to report; iterating a dict
            yields its keys, so a parameter grid can be passed directly.
    """
    for estimator, score in scores_dict.items():
        reported = [(name, estimator.get_params()[name]) for name in clf_params]
        print("clf with params: {}, score: {}".format(reported, score))

## TF-IDF

In [41]:
# Concatenate the per-class all.txt files (positives first, then negatives)
# into a single file per split.
# NOTE(review): all.txt is not created anywhere in this notebook -- presumably
# it is written by read_data in helpers; confirm before running on a fresh checkout.
!cat aclImdb/train/pos/all.txt aclImdb/train/neg/all.txt > all_train.txt
!cat aclImdb/test/pos/all.txt aclImdb/test/neg/all.txt > all_test.txt

In [42]:
# Fit TF-IDF on the concatenated training file; every line of the file is
# treated as one document.
tfidf = TfidfVectorizer()
# `with` closes the file once its lines have been read (the handle used to be
# left open); readlines() already returns a list, so no extra copy is needed.
with open('all_train.txt', 'r') as all_train_file:
    train_doc_matrix = tfidf.fit_transform(all_train_file.readlines())

In [43]:
# number of distinct terms the vectorizer learned from the training file
print(len(tfidf.vocabulary_))

67109


In [44]:
# Project the concatenated test file (one document per line) onto the
# vocabulary learned from the training data.
# `with` closes the file once its lines have been read (the handle used to be
# left open); readlines() already returns a list, so no extra copy is needed.
with open('all_test.txt', 'r') as all_test_file:
    test_doc_matrix = tfidf.transform(all_test_file.readlines())

In [45]:
# (number of test documents, vocabulary size)
print(test_doc_matrix.shape)

(25000, 67109)


### cross validation testing

In [46]:
# Score every logistic-regression candidate on the TF-IDF training matrix.
# cross_validate comes from helpers; it presumably returns a
# {classifier: mean CV score} mapping -- confirm in helpers.
scores_dict = cross_validate(train_doc_matrix, train_labels, lr_clfs)
print_clf_scores(scores_dict, logistic_params)

clf with params: [('C', 5)], score: 0.8911199999999999
clf with params: [('C', 0.5)], score: 0.8810800000000001
clf with params: [('C', 10)], score: 0.8907999999999999
clf with params: [('C', 1)], score: 0.8872399999999999
clf with params: [('C', 0.1)], score: 0.86012


Get the IDF value of each word in the vocabulary; these values are used later to weight the word2vec vectors.

In [19]:
# Map each vocabulary term to its IDF weight.
# Uses the vectorizer's public `idf_` attribute instead of reaching into the
# private `_tfidf` transformer. `get_feature_names()` is absent from newer
# scikit-learn releases, so `get_feature_names_out()` is preferred whenever
# the installed version provides it.
word_idfs = dict(zip(
    tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names(),
    tfidf.idf_,
))

## Word2Vec trained on IMDB

In [20]:
# Train Word2Vec embeddings on the IMDB training reviews.
# NOTE(review): the `size` keyword matches older gensim releases; newer ones
# call it `vector_size` -- confirm against the installed version.
word_dimensions = 100  # embedding size, reused when building the document vectors
model = Word2Vec(sentences=train_data, size=word_dimensions)

In [21]:
# Build one vector per review from the trained word vectors, passing the IDF
# values as per-word weights. get_doc_vecs_for_data is a helpers function; how
# it combines the word vectors is not visible here.
train_doc_vecs = get_doc_vecs_for_data(train_data, model.wv, word_dimensions, word_weights=word_idfs)
test_doc_vecs = get_doc_vecs_for_data(test_data, model.wv, word_dimensions, word_weights=word_idfs)

### cross validation testing using IDF weighted embeddings

In [None]:
# Cross-validate the logistic-regression candidates on the IDF-weighted Word2Vec document vectors.
scores_dict = cross_validate(train_doc_vecs, train_labels, lr_clfs)

In [25]:
# Report the score obtained for each value of C.
print_clf_scores(scores_dict, logistic_params)

clf with params: [('C', 10)], score: 0.80792
clf with params: [('C', 1)], score: 0.80772
clf with params: [('C', 0.1)], score: 0.8051600000000001
clf with params: [('C', 5)], score: 0.8082800000000001
clf with params: [('C', 0.5)], score: 0.80704


## Pretrained GloVe Embeddings

In [None]:
# Load the pretrained GloVe vectors via the helpers function load_glove_dict.
# 300 is presumably the embedding dimensionality, since the same number is
# passed as the dimension argument when the document vectors are built -- confirm in helpers.
glove_dict_300 = load_glove_dict(300)

In [None]:
# Build document vectors from the GloVe vectors. No word_weights argument is
# passed here, unlike the Word2Vec section.
glove_train_doc_vecs = get_doc_vecs_for_data(train_data, glove_dict_300, 300)
glove_test_doc_vecs = get_doc_vecs_for_data(test_data, glove_dict_300, 300)

In [None]:
# Remove the GloVe dictionary from the namespace now that the document vectors
# are built; -f skips the confirmation prompt.
%reset_selective -f glove_dict_300

### cross validation testing

In [None]:
# Cross-validate the logistic-regression candidates on the GloVe document
# vectors and print each candidate's score.
scores_dict = cross_validate(glove_train_doc_vecs, train_labels, lr_clfs)
# The grid is named `logistic_params`; the previous spelling `logisitic_params`
# raised a NameError.
print_clf_scores(scores_dict, logistic_params.keys())

In [None]:
# Cross-validate the AdaBoost candidates on the GloVe document vectors and
# print the grid parameters and score of each.
scores_dict = cross_validate(glove_train_doc_vecs, train_labels, ada_clfs)
print_clf_scores(scores_dict, ada_params.keys())

In [None]:
# Cross-validate the k-nearest-neighbour candidates on the GloVe document
# vectors and print the grid parameters and score of each.
scores_dict = cross_validate(glove_train_doc_vecs, train_labels, knn_clfs)
print_clf_scores(scores_dict, knn_params.keys())

# Testing on the Real Test Data

In [None]:
# Fit a logistic regression with default parameters on the GloVe training
# vectors (fit() returns the estimator itself), then display its score on the
# held-out test split.
clf2 = LogisticRegression().fit(glove_train_doc_vecs, train_labels)
clf2.score(glove_test_doc_vecs, test_labels)

# Preprocessing effect

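We will try using TF-IDF on the raw text of the review files, without the preprocessing applied above, and compare the cross-validation scores.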

In [47]:
class FileIterator:
    """Re-iterable stream of the texts stored in one directory.

    Every ``__iter__`` call globs ``dirname`` again for entries matching
    ``*_*.txt`` and yields the full contents of each regular file, one file
    at a time. The yield order is whatever ``Path.glob`` produces.
    """

    def __init__(self, dirname):
        # directory scanned for ``*_*.txt`` files on every iteration
        self.dirname = dirname

    def __iter__(self):
        for path in Path(self.dirname).glob('*_*.txt'):
            # skip directories or other non-regular entries that match the pattern
            if not path.is_file():
                continue
            # Close each handle right after reading so that a pass over a
            # large directory does not accumulate open files.
            with path.open() as review_file:
                text = review_file.read()
            yield text

In [48]:
from itertools import chain

In [49]:
# Stream the raw review files of each split lazily, positives first and then
# negatives, which is the order the label arrays assume.
# NOTE: chain yields a one-shot iterator -- rerun this cell before re-running
# any cell that consumes train_iter / test_iter.
train_pos_iter, train_neg_iter = FileIterator('aclImdb/train/pos'), FileIterator('aclImdb/train/neg')
train_iter = chain.from_iterable((train_pos_iter, train_neg_iter))

test_pos_iter, test_neg_iter = FileIterator('aclImdb/test/pos'), FileIterator('aclImdb/test/neg')
test_iter = chain.from_iterable((test_pos_iter, test_neg_iter))

In [50]:
# Refit TF-IDF on the raw review files streamed by train_iter.
# NOTE: this rebinds `tfidf` and `train_doc_matrix`, replacing the objects
# created in the earlier TF-IDF section.
tfidf = TfidfVectorizer()
train_doc_matrix = tfidf.fit_transform(train_iter)


In [51]:
# Project the raw test reviews onto the vocabulary learned from the raw training reviews.
test_doc_matrix = tfidf.transform(test_iter)

In [52]:
# Cross-validate the logistic-regression candidates on the raw-text TF-IDF
# matrix, for comparison with the scores from the earlier TF-IDF section.
scores_dict = cross_validate(train_doc_matrix, train_labels, lr_clfs)
print_clf_scores(scores_dict, logistic_params)

clf with params: [('C', 5)], score: 0.8973199999999999
clf with params: [('C', 0.5)], score: 0.88128
clf with params: [('C', 10)], score: 0.8974000000000002
clf with params: [('C', 1)], score: 0.88892
clf with params: [('C', 0.1)], score: 0.85152
