In [1]:
!pip3 install nltk scikit-learn pandas gensim eli5 keras

You should consider upgrading via the '/home/damian/onderwijs_github/ccsbook/env/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
# General packages and dictionary analysis
from pathlib import Path
import tarfile
import bz2
import urllib.request
import re
import pickle
import requests
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

# Supervised text classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import joblib
import eli5
from nltk.sentiment import vader

# Deep learning with Keras
from keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, Embedding
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Topic Modeling
from gensim import matutils
from gensim.models.ldamodel import LdaModel


In [3]:
def get_review_data(filename = "reviewdata.pickle.bz2", url = "http://cssbook.net/d/aclImdb_v1.tar.gz"):
    '''
    Returns the IMDB review dataset, using a local cache when available.

    If `filename` exists, the dataset is read from that bz2-compressed pickle.
    Otherwise the tar.gz archive at `url` is downloaded into a temporary
    directory, the labelled train/test reviews are extracted, and the result
    is written to `filename` so that later calls can skip the download.

    Parameters
    ----------
    filename : string
        name of cached file
    url : string
        url of IMDB dataset (a gzip-compressed tar archive in which the
        reviews are stored as <root>/<train|test>/<pos|neg>/<file>)

    Returns
    -------
    tuple of lists of strings
        reviews_train, reviews_test, label_train, label_test

    Notes
    -----
    Errors raised while downloading, reading the archive, or decoding a review
    as UTF-8 are not caught and propagate to the caller.
    '''
    # Local import: only needed here, keeps the block self-contained.
    import tempfile

    if Path(filename).exists():
        print(f"Using cached file {filename}")
        # NOTE: unpickling can execute arbitrary code. Only load cache files
        # that were written by this function; never a file from an untrusted source.
        with bz2.BZ2File(filename, 'r') as f:
            reviews_train, reviews_test, label_train, label_test = pickle.load(f)
    else:
        print(f"Downloading from {url}")
        reviews_train, reviews_test, label_train, label_test = [], [], [], []
        # Download into a temporary directory so that the archive is deleted
        # again as soon as it has been processed.
        with tempfile.TemporaryDirectory() as tmpdir:
            fn, _headers = urllib.request.urlretrieve(url, filename=str(Path(tmpdir) / "dataset.tar.gz"))
            # The context manager guarantees the archive is closed, also on errors.
            with tarfile.open(fn, mode="r:gz") as t:
                for file in t.getmembers():
                    try:
                        _imdb, dataset, label, _fn = Path(file.name).parts
                    except ValueError:
                        # if the Path cannot be parsed, e.g. because it does not consist of exactly four parts, then it is not a part of the dataset but for instance a folder name. Let's skip it then
                        continue
                    if dataset not in ("train", "test") or label not in ("pos", "neg"):
                        # e.g. the unlabelled reviews; not used here
                        continue
                    extracted = t.extractfile(file)
                    if extracted is None:
                        # extractfile() returns None for members that are not
                        # files (e.g. a directory nested at this depth)
                        continue
                    with extracted:
                        text = extracted.read().decode("utf-8")
                    if dataset == "train":
                        reviews_train.append(text)
                        label_train.append(label)
                    else:
                        reviews_test.append(text)
                        label_test.append(label)
        print(f"Saving {len(label_train)} training and {len(label_test)} test cases to {filename}")
        with bz2.BZ2File(filename, 'w') as f:
            pickle.dump((reviews_train, reviews_test, label_train, label_test), f)
    return reviews_train, reviews_test, label_train, label_test

# Load the reviews (from the local cache file if it exists, otherwise by
# downloading); the two returned label lists are bound to y_train / y_test here.
reviews_train, reviews_test, y_train, y_test = get_review_data()

Using cached file reviewdata.pickle.bz2


In [4]:
def fetch_wordlist(url, timeout=30):
    '''
    Downloads a word list with one entry per line.

    Parameters
    ----------
    url : string
        location of the plain-text word list
    timeout : int or float
        seconds to wait for the server before giving up

    Returns
    -------
    set of strings
        the lines of the downloaded text

    Raises
    ------
    requests.exceptions.RequestException
        if the request times out, fails, or returns an HTTP error status
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of silently turning an error page
    # into a "word list".
    response.raise_for_status()
    return set(response.text.split('\n'))


positive = fetch_wordlist('http://cssbook.net/d/positive.txt')
negative = fetch_wordlist('http://cssbook.net/d/negative.txt')
# A word that occurs in both lists ends up with -1, because the negative
# entries are written last.
sentimentdict = {word:+1 for word in positive}
sentimentdict.update({word:-1 for word in negative})

scores = []
mytokenizer = TreebankWordTokenizer()
# we only take the first 100 reviews to speed things up
for review in reviews_train[:100]:
    words = mytokenizer.tokenize(review)
    # we look up each word in the sentiment dict and assign its value (if we don't find it, it gets 0)
    # note that the lookup is an exact (case-sensitive) match on the token
    scores.append(sum(sentimentdict.get(word,0) for word in words))
scores

[-3,
 -4,
 1,
 3,
 -2,
 -7,
 -6,
 9,
 7,
 7,
 10,
 5,
 -1,
 2,
 7,
 -4,
 2,
 21,
 1,
 -1,
 2,
 -3,
 -2,
 -11,
 -2,
 -3,
 -7,
 2,
 4,
 -22,
 5,
 4,
 3,
 -5,
 -8,
 1,
 -1,
 0,
 1,
 8,
 0,
 -4,
 3,
 -7,
 -11,
 -6,
 0,
 3,
 -1,
 0,
 6,
 -1,
 -8,
 7,
 -5,
 2,
 10,
 5,
 5,
 1,
 0,
 7,
 0,
 0,
 5,
 1,
 -8,
 4,
 3,
 18,
 2,
 0,
 -3,
 -2,
 5,
 0,
 -2,
 1,
 1,
 12,
 -3,
 -4,
 -6,
 -2,
 2,
 -7,
 -1,
 -10,
 -5,
 3,
 4,
 -3,
 -17,
 1,
 -1,
 7,
 -3,
 4,
 12,
 3]