In [5]:
# Install the third-party packages used by this notebook. Running pip through
# `sys.executable -m pip` uses the pip of the interpreter that runs this kernel,
# so the packages are installed for that same interpreter.
# NOTE(review): the import cell also imports `requests`, `joblib` and `vader`,
# which are not listed in this command — confirm they are available.
import sys
!{sys.executable} -m pip install nltk scikit-learn pandas gensim eli5

Collecting vader
  Downloading vader-0.0.2-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.3 MB/s eta 0:00:011
Collecting sonopy
  Downloading sonopy-0.1.2.tar.gz (3.3 kB)
Building wheels for collected packages: sonopy
  Building wheel for sonopy (setup.py) ... [?25ldone
[?25h  Created wheel for sonopy: filename=sonopy-0.1.2-py3-none-any.whl size=2880 sha256=b5bacd97cbab06e61225e1dc5580940180e9cb4ac4276ab672a0e01d7148225e
  Stored in directory: /home/wva/.cache/pip/wheels/1f/82/ee/3e858c78c0734f6fe30ade1bd3ef040c7f45eedae6669e88f8
Successfully built sonopy
Installing collected packages: sonopy, vader
Successfully installed sonopy-0.1.2 vader-0.0.2


In [4]:
# General packages and dictionary analysis
from pathlib import Path
import tarfile
import bz2
import urllib.request
import re
import pickle
import requests
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer

# Supervised text classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import joblib
import eli5
import vader

# Topic Modeling
from gensim import matutils
from gensim.models.ldamodel import LdaModel

In [23]:
# Load the IMDB review data into four parallel lists:
#   X_train / X_test  - review texts (str)
#   y_train / y_test  - the matching labels ('pos' or 'neg')
# The parsed data is cached as a bz2-compressed pickle next to the notebook, so
# the archive only has to be downloaded and parsed once.
filename = "reviewdata.pickle.bz2"
if Path(filename).exists():
    print(f"Using cached file {filename}")
    # NOTE: unpickling can execute arbitrary code. Only load a cache file that
    # was written by this notebook, never one obtained from an untrusted source.
    with bz2.BZ2File(filename, 'r') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)
else:
    url = "http://cssbook.net/d/aclImdb_v1.tar.gz"
    print(f"Downloading from {url}")
    X_train, X_test, y_train, y_test = [], [], [], []
    try:
        # filename=None makes urlretrieve store the download in a temporary file
        fn, _headers = urllib.request.urlretrieve(url, filename=None)
        # the context manager closes the archive even if parsing fails
        with tarfile.open(fn, mode="r:gz") as t:
            for file in t.getmembers():
                try:
                    _imdb, dataset, label, _fn = Path(file.name).parts
                except ValueError:
                    # if the Path cannot be parsed, e.g. because it does not consist of
                    # exactly four parts, then it is not a part of the dataset but for
                    # instance a folder name. Let's skip it then
                    continue
                if label not in ('pos', 'neg'):
                    continue
                if dataset == "train":
                    texts, labels = X_train, y_train
                elif dataset == "test":
                    texts, labels = X_test, y_test
                else:
                    continue
                member = t.extractfile(file)
                if member is None:
                    # extractfile returns None for members that are not regular
                    # files (or links to them); such entries carry no review text
                    continue
                with member:
                    texts.append(member.read().decode("utf-8"))
                labels.append(label)
    finally:
        # remove the temporary file that urlretrieve left behind
        urllib.request.urlcleanup()
    print(f"Saving to {len(y_train)} training and {len(y_test)} test cases to {filename}")
    with bz2.BZ2File(filename, 'w') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

Downloading from http://cssbook.net/d/aclImdb_v1.tar.gz
Saving to 25000 training and 25000 test cases to reviewdata.pickle.bz2


In [4]:
def _fetch_wordlist(url):
    """Download a newline-separated word list and return its lines as a set.

    Raises requests.HTTPError if the server answers with an error status, so
    that an error page is never silently used as a word list.
    """
    response = requests.get(url)
    response.raise_for_status()
    return set(response.text.split('\n'))


# Build a sentiment dictionary: +1 for every positive word, -1 for every negative word.
# Negative entries are applied last, so a word occurring in both lists ends up as -1.
positive = _fetch_wordlist('http://cssbook.net/d/positive.txt')
negative = _fetch_wordlist('http://cssbook.net/d/negative.txt')
sentimentdict = {word:+1 for word in positive}
sentimentdict.update({word:-1 for word in negative})

scores = []
mytokenizer = TreebankWordTokenizer()
# we only take the first 5 reviews to speed things up
for review in X_train[:5]:
    words = mytokenizer.tokenize(review)
    # we look up each word in the sentiment dict and assign its value (if we don't find it, it gets 0)
    # the lookup is case-sensitive: tokens are matched exactly as the tokenizer returns them
    scores.append(sum(sentimentdict.get(word,0) for word in words))
scores

[-3, -4, 1, 3, -2]