In [1]:
!pip3 install nltk scikit-learn pandas 
!pip3 install gensim eli5 keras tensorflow



In [4]:
# General packages and dictionary analysis
import os
import tarfile
import bz2
import urllib.request
import re
import pickle
import requests
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
import matplotlib.pyplot as plt

# Supervised text classification
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer)
from sklearn.linear_model import (
    LogisticRegression)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import (
    make_pipeline, Pipeline)
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import joblib
import eli5
from nltk.sentiment import vader

# Deep learning with Keras
from keras.layers import (Dense, Input, 
    GlobalMaxPooling1D, Conv1D, Embedding)
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import (
    pad_sequences)
from keras.preprocessing.text import Tokenizer
from gensim.models.keyedvectors import (
    KeyedVectors)

# Topic Modeling
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import (
    CoherenceModel)


In [18]:
# Load the review texts and labels, using a local compressed pickle as a
# cache so that the archive only has to be downloaded and parsed once.
# After this cell, text_train/text_test hold the review texts and
# y_train/y_test hold the matching "pos"/"neg" labels.
filename = "reviewdata.pickle.bz2"
if os.path.exists(filename):
    print(f"Using cached file {filename}")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that was written by this notebook, never an untrusted one.
    with bz2.BZ2File(filename, "r") as zipfile:
        data = pickle.load(zipfile)
    text_train, text_test, y_train, y_test = data
else:
    url = "https://cssbook.net/d/aclImdb_v1.tar.gz"
    print(f"Downloading from {url}")
    # filename=None lets urlretrieve store the download in a temporary file
    fn, _headers = urllib.request.urlretrieve(url, filename=None)
    text_train, text_test = [], []
    y_train, y_test = [], []
    try:
        # The context manager closes the archive even if parsing fails
        with tarfile.open(fn, mode="r:gz") as t:
            for f in t.getmembers():
                # Member names look like aclImdb/<dataset>/<pos|neg>/<file>
                m = re.match(r"aclImdb/(\w+)/(pos|neg)/", f.name)
                if not m or not f.isfile():
                    # skip folder names, other categories
                    continue
                dataset, label = m.groups()
                text = t.extractfile(f).read().decode("utf-8")
                if dataset == "train":
                    text_train.append(text)
                    y_train.append(label)
                elif dataset == "test":
                    text_test.append(text)
                    y_test.append(label)
    finally:
        # Remove the temporary download created by urlretrieve
        urllib.request.urlcleanup()
    print(f"Saving to {filename}")
    with bz2.BZ2File(filename, "w") as zipfile:
        data = text_train, text_test, y_train, y_test
        pickle.dump(data, zipfile)

Downloading from https://cssbook.net/d/aclImdb_v1.tar.gz
Saving to reviewdata.pickle.bz2
25000


In [16]:
# Dictionary-based sentiment analysis: download a positive and a negative
# word list, then score each review as (#positive words - #negative words).
poswords = "https://cssbook.net/d/positive.txt"
negwords = "https://cssbook.net/d/negative.txt"


def _load_wordlist(url, timeout=30):
    """Download a word list and return its entries as a set.

    The response body is treated as one entry per line; surrounding
    whitespace is stripped and blank lines are dropped, so the set does
    not contain an empty-string entry.

    Raises requests.HTTPError if the server answers with an error status,
    instead of silently using an error page as the word list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    stripped = (line.strip() for line in response.text.splitlines())
    return {word for word in stripped if word}


pos = _load_wordlist(poswords)
neg = _load_wordlist(negwords)
sentimentdict = {word: +1 for word in pos}
# A word that occurs in both lists ends up with the negative value
sentimentdict.update({word: -1 for word in neg})

scores = []
mytokenizer = TreebankWordTokenizer()
# For speed, we only take the first 100 reviews
for review in text_train[:100]:
    words = mytokenizer.tokenize(review)
    # we look up each word in the sentiment dict
    # and assign its value (with default 0)
    # NOTE(review): the lookup is case-sensitive and tokens are not
    # lowercased here - confirm this is intended for these word lists.
    scores.append(sum(sentimentdict.get(word, 0)
                      for word in words))
scores

[-3,
 -4,
 1,
 3,
 -2,
 -7,
 -6,
 9,
 7,
 7,
 10,
 5,
 -1,
 2,
 7,
 -4,
 2,
 21,
 1,
 -1,
 2,
 -3,
 -2,
 -11,
 -2,
 -3,
 -7,
 2,
 4,
 -22,
 5,
 4,
 3,
 -5,
 -8,
 1,
 -1,
 0,
 1,
 8,
 0,
 -4,
 3,
 -7,
 -11,
 -6,
 0,
 3,
 -1,
 0,
 6,
 -1,
 -8,
 7,
 -5,
 2,
 10,
 5,
 5,
 1,
 0,
 7,
 0,
 0,
 5,
 1,
 -8,
 4,
 3,
 18,
 2,
 0,
 -3,
 -2,
 5,
 0,
 -2,
 1,
 1,
 12,
 -3,
 -4,
 -6,
 -2,
 2,
 -7,
 -1,
 -10,
 -5,
 3,
 4,
 -3,
 -17,
 1,
 -1,
 7,
 -3,
 4,
 12,
 3]