# ReadExtractDataIMDBReviews

Download the IMDB dataset and perform basic preparation.

In [1]:
import os
import re
import urllib
import tarfile
import sqlite3

import pandas as pd

## Download The Dataset

In [2]:
# Make sure the local "var" directory exists; the downloaded archive and the
# output database are both written below it.
os.makedirs("var", exist_ok=True)

In [3]:
# Remote location of the dataset archive.
data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Local path where the downloaded archive is kept between runs.
local_file = "var/aclImdb_v1.tar.gz"

In [4]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before using it.
import urllib.request

# Download only when no local copy exists. The archive is fetched under a
# temporary name and renamed afterwards, so an interrupted download is never
# mistaken for a complete archive on the next run.
if not os.path.isfile(local_file):
    partial_file = local_file + ".part"
    urllib.request.urlretrieve(data_url, partial_file)
    os.replace(partial_file, local_file)

## Dataset Reading Functions

In [5]:
def read_text(tf, name):
    """Return the contents of archive member ``name`` decoded as UTF-8.

    ``tf`` is an open tar archive and ``name`` the member path inside it.
    """
    with tf.extractfile(name) as member:
        raw = member.read()
    return raw.decode('utf-8')

def read_readme(tf):
    """Return the text of the ``aclImdb/README`` member of the archive."""
    readme_member = "aclImdb/README"
    return read_text(tf, readme_member)

def read_comment_links(tf, name):
    """Return the lines of archive member ``name`` as a list of strings.

    Every line is decoded as UTF-8 and stripped of surrounding whitespace;
    list position corresponds to the line's position in the file.
    """
    with tf.extractfile(name) as member:
        return [raw_line.decode('utf-8').strip() for raw_line in member]

def read_all_comment_links(tf):
    """Collect the comment links from every ``urls_<class>.txt`` file present.

    Looks for ``aclImdb/<dataset>/urls_<class>.txt`` for each train/test and
    pos/neg/unsup combination and skips combinations missing from the archive.

    Returns one concatenated DataFrame with the columns ``commentLink``,
    ``dataset``, ``class`` and ``datasetclassId`` (the zero-based line
    position of the link within its file).
    """
    member_names = tf.getnames()
    frames = []
    for ds_type in ['train', 'test']:
        for _class in ['pos', 'neg', 'unsup']:
            fname = f"aclImdb/{ds_type}/urls_{_class}.txt"
            if fname not in member_names:
                continue
            links = read_comment_links(tf, fname)
            extra_columns = {
                'dataset': ds_type,
                'class': _class,
                'datasetclassId': range(0, len(links)),
            }
            frames.append(
                pd.DataFrame({'commentLink': links}).assign(**extra_columns)
            )

    return pd.concat(frames)

def get_review_names(tf):
    """List the review files in the archive with the metadata held in their paths.

    Review members are named ``aclImdb/<dataset>/<class>/<id>_<rating>.txt``.

    Args:
        tf: Open tar archive of the dataset.

    Returns:
        A ``pandas.DataFrame`` with one row per review file and the columns
        ``dataset``, ``class``, ``datasetclassId``, ``rating`` and ``name``
        (the full member path). The columns are present even when no member
        matches, so callers can always select them.
    """
    columns = ['dataset', 'class', 'datasetclassId', 'rating', 'name']
    # The dot before "txt" is escaped so that only a literal ".txt" suffix matches.
    name_re = re.compile(r'^aclImdb/(test|train)/(pos|neg|unsup)/(\d+)_(\d+)\.txt$')

    entries = []
    for name in tf.getnames():
        m = name_re.match(name)
        if m is None:
            continue
        ds_type, _class, _id, rating = m.groups()
        entries.append({
            'dataset': ds_type,
            'class': _class,
            'datasetclassId': int(_id),
            'rating': int(rating),
            'name': name,
        })

    return pd.DataFrame(entries, columns=columns)

def read_reviews(tf):
    """Return the review listing with each file's text in a ``review`` column.

    Starts from the frame produced by ``get_review_names`` and reads every
    listed member from the archive as UTF-8 text.
    """
    def load_review(member_name):
        return read_text(tf, member_name)

    listing = get_review_names(tf)
    listing['review'] = listing['name'].apply(load_review)
    return listing

## Open Handle to Tarfile

In [6]:
# Single shared handle on the downloaded archive; the reader functions in the
# cells below all take it as their `tf` argument.
tf = tarfile.open(local_file)

### Readme

In [7]:
# Display the README stored inside the archive.
print(read_readme(tf))

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

### Load Comment Links

In [8]:
# One row per comment link, gathered from every urls_<class>.txt file found in the archive.
comment_links = read_all_comment_links(tf)

In [9]:
# Number of comment links per class.
comment_links['class'].value_counts().to_frame()

Unnamed: 0,class
unsup,50000
pos,25000
neg,25000


In [10]:
# Random sample of five rows for a quick visual check.
comment_links.sample(5)

Unnamed: 0,commentLink,dataset,class,datasetclassId
45128,http://www.imdb.com/title/tt0091214/usercomments,train,unsup,45128
2637,http://www.imdb.com/title/tt0167284/usercomments,test,neg,2637
23741,http://www.imdb.com/title/tt0969706/usercomments,train,unsup,23741
5790,http://www.imdb.com/title/tt0734686/usercomments,train,pos,5790
18768,http://www.imdb.com/title/tt0074483/usercomments,train,unsup,18768


### Load Reviews

In [11]:
# Read every review file from the archive into a DataFrame.
reviews = read_reviews(tf)

# This is the last cell that reads from the archive, so release the file handle.
# Re-run the "Open Handle to Tarfile" cell before re-running any reader cell.
tf.close()

In [12]:
# Number of review files per dataset split and class.
reviews.groupby(['dataset', 'class']).size().reset_index()

Unnamed: 0,dataset,class,0
0,test,neg,12500
1,test,pos,12500
2,train,neg,12500
3,train,pos,12500
4,train,unsup,50000


In [13]:
# Random sample of five rows for a quick visual check.
reviews.sample(5)

Unnamed: 0,dataset,class,datasetclassId,rating,name,review
8999,test,neg,9048,4,aclImdb/test/neg/9048_4.txt,"""Eaten Alive"" goes down much easier than Rugge..."
86666,train,unsup,36677,0,aclImdb/train/unsup/36677_0.txt,This movie has everything wrong with it: overd...
34483,train,neg,9588,4,aclImdb/train/neg/9588_4.txt,"""Absolute Beginners"" was a film for the younge..."
76,test,neg,51,4,aclImdb/test/neg/51_4.txt,One of the last surviving horror screen greats...
30997,train,neg,5906,1,aclImdb/train/neg/5906_1.txt,"This movie was terrible. The plot sucked, the ..."


### Final Prep and Join Comment Links with Reviews

In [15]:
# Captures the path segment that follows "title/" in a comment link.
ttid_re = re.compile(r'title/([^/]+)')


def get_ttid_re(comment_link):
    """Return the segment after ``title/`` in ``comment_link``, or ``None`` if absent."""
    found = ttid_re.search(comment_link)
    return found.group(1) if found else None

# Join each review to its comment link. The key columns are spelled out so the
# join cannot silently change if either frame later gains another shared column.
reviews_with_comments = pd.merge(
    reviews,
    comment_links,
    how='inner',
    on=['dataset', 'class', 'datasetclassId'],
)

# Assign a 1-based surrogate primary key
reviews_with_comments['id'] = range(1, len(reviews_with_comments) + 1)

# Extract the title id from the comment link
reviews_with_comments['titleId'] = reviews_with_comments['commentLink'].apply(get_ttid_re)

# Drop the helper columns (name, commentLink) and reorder the remaining ones
reviews_with_comments = reviews_with_comments[
    ['id', 'titleId', 'dataset', 'class', 'datasetclassId', 'rating', 'review']
]

In [16]:
# Random sample of five rows of the final table for a quick visual check.
reviews_with_comments.sample(5)

Unnamed: 0,id,titleId,dataset,class,datasetclassId,rating,review
43728,43729,tt0025529,train,pos,6187,9,There are moments in the film that are so drea...
22691,22692,tt0049470,test,pos,10160,7,Ostensibly a story about the young child of Ji...
41661,41662,tt0080772,train,pos,4158,10,What can I say ? An action and allegorical tal...
4898,4899,tt0816539,test,neg,4957,2,Jack Brooks (Trevor Matthews) is a college stu...
53765,53766,tt0096163,train,unsup,3786,0,"""The Vanishing"", or ""Spoorloos"" (its original ..."


## Write Data Out to SQLite

In [17]:
# Connection to the SQLite database file that receives the prepared reviews.
db = sqlite3.connect("var/reviews.db")

In [18]:
# Write the prepared frame to the "reviews" table, replacing any existing
# table, without the DataFrame index and in batches of 5000 rows.
reviews_with_comments.to_sql(
    'reviews',
    con=db,
    if_exists='replace',
    index=False,
    chunksize=5000,
)

In [19]:
# Commit the rows written to the reviews table before creating the indexes.
db.commit()

In [20]:
# Unique index on the surrogate key. "if not exists" keeps the cell re-runnable:
# without it a second execution fails because the index already exists.
db.execute("create unique index if not exists i_reviews_pk on reviews (id)")

<sqlite3.Cursor at 0x2807b09e8f0>

In [21]:
# Lookup index on the title id. "if not exists" keeps the cell re-runnable:
# without it a second execution fails because the index already exists.
db.execute("create index if not exists i_reviews_title_id on reviews (titleId)")

<sqlite3.Cursor at 0x2807b09ec00>

In [22]:
# Commit once more after creating the indexes.
db.commit()

In [23]:
# Release the database connection now that all writes are committed.
db.close()