In [1]:
!pip install bs4 --quiet
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from tqdm.notebook import tqdm

import json
import re
import os
import multiprocessing as mp

## Settings
Some settings for preprocessing:
- `STEMMING`: boolean - whether to apply word [stemming](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html) or not. This can improve model quality but takes much more time
- `POSITIVE_THRESHOLD`: int - a minimal rating that will be interpreted as positive sentiment
- `NEGATIVE_THRESHOLD`: int - a maximal rating that will be interpreted as negative sentiment

E.g., if `POSITIVE_THRESHOLD=7` and `NEGATIVE_THRESHOLD=4`, then ratings 7, 8, 9, 10 are interpreted as positive, ratings 4, 3, 2, 1 as negative, and ratings 5, 6 are ignored

In [2]:
# When true, preprocess_review stems every kept word with the Porter stemmer.
STEMMING = True
# Ratings >= POSITIVE_THRESHOLD are labelled positive (1).
POSITIVE_THRESHOLD = 7
# Ratings <= NEGATIVE_THRESHOLD are labelled negative (-1);
# ratings strictly between the two thresholds get no label and are skipped.
NEGATIVE_THRESHOLD = 4

## Preprocessing
We need to parse the JSON files, create labels, preprocess the raw reviews and write the result to files in [Vowpal Wabbit input format](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format)

In [3]:
# English stop words from NLTK, kept in a set for fast membership tests.
stops = set(stopwords.words("english"))
# Single Porter stemmer instance reused by every preprocess_review call.
stemmer = PorterStemmer()

def preprocess_review(raw_review):
    """Turn a raw review into a space-separated string of cleaned tokens.

    Steps: strip HTML markup, blank out URLs, replace every character that is
    not an ASCII letter with a space, lower-case, split on whitespace, drop
    English stop words and, when ``STEMMING`` is true, Porter-stem the rest.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # Remove HTML. The parser is named explicitly: without it BeautifulSoup
    # picks whichever parser happens to be installed (so results can differ
    # between environments) and warns about the guess.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Remove URLs (raw string, so the regex is not subject to string-escape parsing)
    review_text = re.sub(r"https?://[\w+./]+", " ", review_text)
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # Remove stop words (and stem others if needed)
    meaningful_words = [w for w in words if w not in stops]
    if STEMMING:
        meaningful_words = [stemmer.stem(w) for w in meaningful_words]

    return " ".join(meaningful_words)


def rating_to_binary_sentiment(rating, positive_threshold=None, negative_threshold=None):
    """Map a numeric rating to a binary sentiment label.

    Args:
        rating: The rating as an int or an int-like string; ``None`` or an
            empty/blank string means "no rating".
        positive_threshold: Minimal rating labelled positive. Defaults to the
            module-level ``POSITIVE_THRESHOLD``.
        negative_threshold: Maximal rating labelled negative. Defaults to the
            module-level ``NEGATIVE_THRESHOLD``.

    Returns:
        ``1`` for a positive rating, ``-1`` for a negative rating, and ``None``
        when the rating is missing or falls strictly between the thresholds.

    Raises:
        ValueError: If ``rating`` is a non-blank value that ``int()`` cannot parse.
    """
    # A missing rating (None or a blank string) yields no label instead of
    # crashing in int().
    if rating is None or (isinstance(rating, str) and not rating.strip()):
        return None
    # Resolve the defaults at call time so that later edits to the settings
    # are picked up.
    if positive_threshold is None:
        positive_threshold = POSITIVE_THRESHOLD
    if negative_threshold is None:
        negative_threshold = NEGATIVE_THRESHOLD

    rating = int(rating)
    if rating >= positive_threshold:
        return 1
    if rating <= negative_threshold:
        return -1
    return None

In [4]:
%%time
# Input: the JSON parts of the review dataset, part-01.json ... part-06.json.
directory = '/kaggle/input/d/ebiswas/imdb-review-dataset/'
filenames = [f'{directory}part-0{i}.json' for i in range(1, 7)]

# Scratch directory that receives one .vw file per input part.
output_dir = '/kaggle/temp/'
# exist_ok=True keeps the cell re-runnable; unlike mkdir it also creates
# missing parent directories.
os.makedirs(output_dir, exist_ok=True)
    

def preprocess_json(filename):
    """Convert one JSON file of reviews into a Vowpal Wabbit text file.

    The file is expected to hold a JSON array of objects with a ``rating`` and
    a ``review_detail`` entry. Every review that gets a sentiment label and a
    non-empty preprocessed text is written as ``<label> |text <tokens>`` to
    ``<output_dir>/<input base name>.vw``. Reviews without a label or without
    any remaining text are skipped.

    Args:
        filename: Path of the JSON file to convert.
    """
    base_name = os.path.splitext(os.path.basename(filename))[0]
    new_filename = os.path.join(output_dir, base_name + '.vw')

    # Parse the input before opening the output, so that a read or parse
    # failure does not leave a truncated .vw file behind. The encoding is
    # explicit because the default depends on the locale.
    with open(filename, encoding='utf-8') as file:
        reviews = json.load(file)
    print(filename)

    with open(new_filename, 'w', encoding='utf-8') as fout:
        for review in tqdm(reviews):
            # .get(): a review lacking a key is skipped like an unrated one
            sentiment = rating_to_binary_sentiment(review.get('rating'))
            if sentiment is None:
                continue
            review_text = preprocess_review(review.get('review_detail') or '')
            if not review_text:
                continue
            fout.write(f'{sentiment} |text {review_text}\n')

# Multiprocessing is required to free the RAM from large unused files:
# the memory held by a parsed file is released when its worker process exits.
# As a bonus, we speed up preprocessing with parallelism
max_processes = 3  # 4 processes requires more than 16GB RAM
# maxtasksperchild=1 replaces a worker with a fresh process after every task,
# and chunksize=1 makes one file exactly one task. A single pool therefore
# never runs more than max_processes files at once, and the next file starts
# as soon as any worker finishes instead of waiting for a whole batch.
with mp.Pool(processes=max_processes, maxtasksperchild=1) as pool:
    pool.map(preprocess_json, filenames, chunksize=1)

/kaggle/input/d/ebiswas/imdb-review-dataset/part-01.json


  0%|          | 0/1010293 [00:00<?, ?it/s]

/kaggle/input/d/ebiswas/imdb-review-dataset/part-02.json


  0%|          | 0/1012212 [00:00<?, ?it/s]

/kaggle/input/d/ebiswas/imdb-review-dataset/part-03.json


  0%|          | 0/1015000 [00:00<?, ?it/s]



/kaggle/input/d/ebiswas/imdb-review-dataset/part-06.json


  0%|          | 0/499997 [00:00<?, ?it/s]

/kaggle/input/d/ebiswas/imdb-review-dataset/part-04.json


  0%|          | 0/1019000 [00:00<?, ?it/s]

/kaggle/input/d/ebiswas/imdb-review-dataset/part-05.json


  0%|          | 0/1014997 [00:00<?, ?it/s]

CPU times: user 3min 24s, sys: 2min 3s, total: 5min 27s
Wall time: 2h 17min 20s


Bash utils are highly optimized and very efficient for large files, so let's take advantage of them.

In [5]:
# `!` runs the line in a shell; $output_dir is filled in from the Python variable.
# output_dir already ends with "/", hence the harmless "//" in the printed paths.
# count the number of lines
!wc -l $output_dir/part-*.vw
# combine into one file
!cat $output_dir/part-*.vw >$output_dir/combined.vw
!wc -l $output_dir/combined.vw

    825607 /kaggle/temp//part-01.vw
    807935 /kaggle/temp//part-02.vw
    758304 /kaggle/temp//part-03.vw
    749132 /kaggle/temp//part-04.vw
    778950 /kaggle/temp//part-05.vw
    266601 /kaggle/temp//part-06.vw
   4186529 total
4186529 /kaggle/temp//combined.vw


Now we need to drop duplicates; this will be done with Bash too.

I used this [solution](https://unix.stackexchange.com/a/30181). The idea is as follows: it is easy to keep only the unique lines of a sorted file (`uniq`, `sort -u`), but to preserve the original order we first prefix each line with its line number, then sort the file by the content after the line number, drop duplicates (they are adjacent now), restore the original order by sorting on the line numbers, and finally strip the line numbers.

In [6]:
%%time
%%bash
# Order-preserving de-duplication of combined.vw.
# The result is written to train.vw in the current working directory.
# ":" is used as the field separator between the line number and the content.
</kaggle/temp/combined.vw nl -b a -s : |  # number the lines
sort -t : -k 2 -u |                       # sort and uniquify ignoring the line numbers
sort -t : -k 1n |                         # sort according to the line numbers
cut -d : -f 2- >train.vw                  # remove the line numbers

CPU times: user 22 ms, sys: 33 ms, total: 55 ms
Wall time: 3min 26s


In [7]:
# Show the first lines of the de-duplicated file to eyeball the "<label> |text <tokens>" format.
!head train.vw --lines=5

1 |text enjoy first season must say think season even stronger ricki great job writer actor director bring best superb support cast one thing chang like hear talk less peopl speak third person pretti hard fault funni yet emot comedi
1 |text except k k actor look comfort act fight scene funni plot repeatt
1 |text guess year old white woman target demograph enjoy show good see other perspect love wri humor entertain offic park rec make broad neg assumpt white though
-1 |text truth much movi suck high rate overli posit review must submit cast film crew give technic analysi like ameteur movi critic sincer opinion drove annoy jump due abl hear open dialogu ambient sound time difficult hear said turn caption knew drove start neg balanc realli good thing happen move forward win film beauti mani natur scene appear low budget standpoint also decent pace fall asleep close occas thing state cinematograph pace could save film right drove fall apart stori humbl opinion premis interest stori well

In [8]:
total_len = !cat train.vw | wc -l
positive = !grep -w "^1 |.*" -c train.vw
negative = !grep -w "^\-1 |.*" -c train.vw
total_len = int(total_len[0])
positive = int(positive[0])
negative = int(negative[0])
print("\Resulting\n"
      f"Total number of reviews: {total_len}\n"
      f"Positive(1):  {positive}\t {positive/total_len:0.5f}\n"
      f"Negative(-1): {negative}\t {negative/total_len:0.5f}\n"
      )

\Resulting
Total number of reviews: 4146096
Positive(1):  2988017	 0.72068
Negative(-1): 1158079	 0.27932



Note that the resulting dataset is somewhat unbalanced: about 72% of the reviews are positive and 28% are negative.