# Acquire IMDB Data

1. Downloads IMDB movie reviews from ai.stanford.edu
2. Parses movie reviews from files.
3. Saves a pickled data frame for the reviews.

<u>Information About Data Set</u>

<pre>
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}

References

Potts, Christopher. 2011. On the negativity of negation. In Nan Li and
David Lutz, eds., Proceedings of Semantics and Linguistic Theory 20,
636-659.

Contact

For questions/comments/corrections please contact Andrew Maas
amaas@cs.stanford.edu
</pre>

In [2]:
import os
import re
import urllib.request
import tarfile
import re
import pandas as pd

In [3]:
# Directory that holds the downloaded archive and the pickled data frame.
data_dir = "data"
# exist_ok=True creates the directory only when it is missing, without the
# race between a separate isdir() check and the makedirs() call.
os.makedirs(data_dir, exist_ok=True)

In [7]:
# Remote location of the IMDB review archive, and the local path (inside
# data_dir) where the downloaded copy is stored and later read from.
imdb_data_url  = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
imdb_data_file = f"{data_dir}/aclImdb_v1.tar.gz"

def read_urls_file(tf, name):
    """Extract one identifier per non-empty line of a URL list in the archive.

    Args:
        tf: An open ``tarfile.TarFile``.
        name: Name of the archive member to read; it is decoded as UTF-8.

    Returns:
        A list with the fifth ``/``-separated field of every non-empty line,
        in file order. For a line shaped like
        ``http://host/title/<id>/usercomments`` that field is ``<id>``.

    Raises:
        KeyError: If ``name`` is not a member of the archive.
        IndexError: If a non-empty line has fewer than five fields.
    """
    with tf.extractfile(tf.getmember(name)) as handle:
        text = handle.read().decode('utf-8')

    identifiers = []
    for url in text.split("\n"):
        if not url:
            # Skip blank lines, e.g. the one produced by a trailing newline.
            continue
        identifiers.append(url.split("/")[4])
    return identifiers

def create_movie_id_lookup(tf):
    """Build a mapping from review coordinates to the identifier in the URL lists.

    Reads ``aclImdb/<data_set>/urls_<label>.txt`` for each combination of
    data set (``test``, ``train``) and label (``pos``, ``neg``).

    Args:
        tf: An open ``tarfile.TarFile`` containing the four URL list members.

    Returns:
        A dict keyed by ``(data_set, label, index)`` where ``index`` is the
        zero-based position of the entry within its URL list, and the value is
        the identifier returned by ``read_urls_file`` for that position.
    """
    lookup = {}
    for split_name in ('test', 'train'):
        for sentiment in ('pos', 'neg'):
            urls_member = f"aclImdb/{split_name}/urls_{sentiment}.txt"
            for position, movie_id in enumerate(read_urls_file(tf, urls_member)):
                lookup[(split_name, sentiment, position)] = movie_id
    return lookup

def read_imdb_data():
    """Parse the labelled reviews out of the archive at ``imdb_data_file``.

    Every regular file whose name starts with
    ``aclImdb/(test|train)/(pos|neg)/<digits>_<digits>.txt`` becomes one row.
    The first number in the file name is used as the index into the URL-list
    lookup; the second number is matched but not stored.

    Returns:
        A ``pandas.DataFrame`` with columns ``data_set`` (``"test"`` or
        ``"train"``), ``polarity`` (1 for ``pos``, 0 for ``neg``),
        ``sentence`` (the UTF-8 decoded file content) and ``movie_id``.

    Raises:
        KeyError: If a review's ``(data_set, label, index)`` has no entry in
            the lookup built by ``create_movie_id_lookup``.
    """
    review_path = re.compile(r"aclImdb/(test|train)/(pos|neg)/(\d+)_(\d+)\.txt")
    rows = []
    with tarfile.open(imdb_data_file, "r") as archive:
        id_by_review = create_movie_id_lookup(archive)
        for entry in archive.getmembers():
            if not entry.isfile():
                continue
            hit = review_path.match(entry.name)
            if hit is None:
                # Not a labelled review (e.g. README, vocab, URL lists).
                continue
            split_name, sentiment, review_no = hit.group(1, 2, 3)
            is_positive = int(sentiment == "pos")
            movie = id_by_review[(split_name, sentiment, int(review_no))]
            with archive.extractfile(entry) as stream:
                text = stream.read().decode('utf-8')
            rows.append([split_name, is_positive, text, movie])
    return pd.DataFrame(data=rows, columns=['data_set', 'polarity', 'sentence', 'movie_id'])

def load_imdb_data():
    """Return the IMDB reviews as a data frame, downloading the archive if needed.

    The archive is fetched from ``imdb_data_url`` only when ``imdb_data_file``
    does not exist yet. The download is written to a sibling ``.part`` file
    and moved into place once it has finished, so an interrupted transfer
    never leaves a truncated archive that a later run would treat as complete.

    Returns:
        The ``pandas.DataFrame`` produced by ``read_imdb_data``.

    Raises:
        urllib.error.URLError: If the download fails (propagated from
            ``urllib.request.urlretrieve``).
    """
    if not os.path.isfile(imdb_data_file):
        print(f"Downloading: {imdb_data_url}")
        partial_file = f"{imdb_data_file}.part"
        try:
            urllib.request.urlretrieve(imdb_data_url, partial_file)
            # Publish the archive under its final name only after a complete download.
            os.replace(partial_file, imdb_data_file)
        finally:
            # Still present only when the download or the move failed.
            if os.path.exists(partial_file):
                os.remove(partial_file)
    return read_imdb_data()

# Build the review data frame; this downloads the archive first when the
# local copy is missing.
imdb_data = load_imdb_data()

In [8]:
# Persist the data frame as a pickle inside data_dir.
# NOTE(review): the ".gz" suffix is expected to make pandas gzip the output
# via its extension-based compression inference — confirm against to_pickle docs.
imdb_data.to_pickle(f"{data_dir}/imdb_data.pickle.gz")