In [5]:
import numpy as np
import pandas as pd
import json
import re
import string
from datasets import load_dataset

## Text Preprocessing

In [2]:
from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources named below (stop-word list, 'punkt' tokenizer data,
# WordNet) before any code in this notebook uses them.
# The last call is left as a bare expression, so its return value is what the
# cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/biolab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/biolab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/biolab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the labelled training reviews; the first CSV row holds the column names.
train_csv_path = '../data/train.csv'
train_dataset = pd.read_csv(train_csv_path, header=0)
# Preview the first rows as the cell output.
train_dataset.head()

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0


In [4]:
# Read the reviews to predict on; the first CSV row holds the column names.
predict_csv_path = '../data/test.csv'
predict_dataset = pd.read_csv(predict_csv_path, header=0)
# Preview the first rows as the cell output.
predict_dataset.head()

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...


In [6]:
# Load the "imdb" dataset through `datasets.load_dataset`. The cells below read
# the 'text' column of its 'train' and 'test' splits.
imdb_datasets = load_dataset("imdb")
# Bare expression so the cell displays the loaded object's summary.
imdb_datasets

Reusing dataset imdb (/home/biolab/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [96]:
# Shared lemmatizer instance; StemProcess reads this module-level name, so this
# cell has to run before StemProcess is called.
lemmatizer = WordNetLemmatizer()

In [9]:
# Strip word suffixes (reduce a word to its lemma).
def StemProcess(word):
    """Lemmatize a single word, trying several part-of-speech tags in turn.

    The module-level ``lemmatizer`` is asked for the lemma of ``word`` with the
    tags 'v', 'n', 'a' and 'r' (verb, noun, adjective, adverb in WordNet's
    notation), in that order. The first result that differs from ``word`` is
    returned; if no tag changes the word, the result of the last attempt is
    returned.

    Args:
        word: The token to lemmatize.

    Returns:
        The lemma chosen as described above.
    """
    for pos_tag in ('v', 'n', 'a', 'r'):
        candidate = lemmatizer.lemmatize(word, pos_tag)
        # Stop at the first tag that actually changes the word.
        if candidate != word:
            break
    return candidate

In [10]:
def removePunctuation(sentence):
    """Drop punctuation tokens and join the remaining ones with single spaces.

    Args:
        sentence: An iterable of string tokens. A plain string is iterated
            character by character.

    Returns:
        A string with the kept tokens separated by one space.

    Note:
        A token is dropped when it is a *substring* of ``string.punctuation``,
        so the empty string and runs such as ``'./'`` are removed as well as
        single punctuation characters.
    """
    kept_tokens = [token for token in sentence if token not in string.punctuation]
    return ' '.join(kept_tokens)

In [11]:
def extractPartial(text):
    """Return the plain text of a review with its HTML markup removed.

    The review is parsed with BeautifulSoup's 'html.parser' backend and the
    parsed document's ``.text`` is returned. No other preprocessing is active.

    Args:
        text: Raw review content, possibly containing HTML tags.

    Returns:
        The text content of the parsed review.
    """
    soup = bs(text, 'html.parser')
    plain_text = soup.text
    # Experimental follow-up steps, currently disabled (kept for reference):
    #     text = re.findall(r"[\w']+|[.,!?;]", text)
    #     text = sent_tokenize(text)
    #     text = list(map(lambda word: StemProcess(word), text))
    #     text = list(filter(lambda word: word not in stopwords.words('english'), text))
    #     text = list(map(lambda sentence: re.findall(r"[\w']+|[.,!?;]", sentence), text))
    #     text = list(map(lambda sentence: removePunctuation(sentence), text))
    #     text = ' [SEP] '.join(text)
    return plain_text

In [12]:
# Chunk size: the review columns are sliced into pieces of N items before being
# handed to processReviews, which prints one progress line per piece.
N = 1000

In [13]:
def processReviews(texts):
    """Run extractPartial over batches of reviews and collect the results.

    A progress line with the running number of processed reviews is printed
    after each batch, followed by a summary line giving the number of results
    and the length of the longest one.

    Args:
        texts: An iterable of batches; each batch is a sized iterable of raw
            review strings (e.g. a list or a pandas Series slice).

    Returns:
        A flat list with one extractPartial result per review, in input order.
        An empty list is returned when no reviews are supplied.
    """
    results = []
    count = 0
    for batch in texts:
        count += len(batch)
        results.extend(extractPartial(review) for review in batch)
        print("Finish {count} texts...".format(count=count))
    # default=0 keeps the summary working when there are no reviews at all;
    # a bare max() over an empty list raises ValueError.
    max_len = max(map(len, results), default=0)
    print("dataset review shape = ({0}, {1})".format(len(results), max_len))
    return results

### For training dataset

In [84]:
# Cut the training reviews into consecutive chunks of N rows, then strip the
# HTML from every review chunk by chunk.
train_reviews = train_dataset['review']
train_texts = [
    train_reviews[start: start + N]
    for start in range(0, len(train_reviews), N)
]
train_results = processReviews(train_texts)

Finish 1000 texts...
Finish 2000 texts...
Finish 3000 texts...
Finish 4000 texts...
Finish 5000 texts...
Finish 6000 texts...
Finish 7000 texts...
Finish 8000 texts...
Finish 9000 texts...
Finish 10000 texts...
Finish 11000 texts...
Finish 12000 texts...
Finish 13000 texts...
Finish 14000 texts...
Finish 15000 texts...
Finish 16000 texts...
Finish 17000 texts...
Finish 18000 texts...
Finish 19000 texts...
Finish 20000 texts...
Finish 21000 texts...
Finish 22000 texts...
Finish 23000 texts...
Finish 24000 texts...
Finish 25000 texts...
Finish 26000 texts...
Finish 27000 texts...
Finish 28000 texts...
Finish 29000 texts...
Finish 29341 texts...
dataset review shape = (29341, 12804)


In [85]:
# Save the processed training reviews as JSON.
with open('../data/sentiments_beautify_html.json', mode='w') as out_file:
    json.dump(train_results, out_file)

### For predict dataset

In [86]:
# Cut the prediction reviews into consecutive chunks of N rows, then strip the
# HTML from every review chunk by chunk.
predict_reviews = predict_dataset['review']
predict_texts = [
    predict_reviews[start: start + N]
    for start in range(0, len(predict_reviews), N)
]
predict_results = processReviews(predict_texts)

Finish 1000 texts...
Finish 2000 texts...
Finish 3000 texts...
Finish 4000 texts...
Finish 5000 texts...
Finish 6000 texts...
Finish 7000 texts...
Finish 8000 texts...
Finish 9000 texts...
Finish 10000 texts...
Finish 11000 texts...
Finish 12000 texts...
Finish 13000 texts...
Finish 14000 texts...
Finish 15000 texts...
Finish 16000 texts...
Finish 17000 texts...
Finish 18000 texts...
Finish 19000 texts...
Finish 20000 texts...
Finish 21000 texts...
Finish 22000 texts...
Finish 23000 texts...
Finish 24000 texts...
Finish 25000 texts...
Finish 26000 texts...
Finish 27000 texts...
Finish 28000 texts...
Finish 29000 texts...
Finish 29341 texts...
dataset review shape = (29341, 14212)


In [87]:
# Save the processed prediction reviews as JSON.
with open('../data/predicts_beautify_html.json', mode='w') as out_file:
    json.dump(predict_results, out_file)

### For IMDB datasets

In [14]:
# Look the 'text' column up once. Evaluating imdb_datasets['train']['text']
# inside the comprehension repeats the column access for every chunk (and once
# more for len()); with `datasets` versions that return a plain list for column
# access, that rebuilds the whole column each time.
imdb_train_reviews = imdb_datasets['train']['text']
# Consecutive chunks of N reviews, processed chunk by chunk.
imdb_train_texts = [imdb_train_reviews[i: i+N] for i in range(0, len(imdb_train_reviews), N)]
imdb_train_results = processReviews(imdb_train_texts)

Finish 1000 texts...
Finish 2000 texts...
Finish 3000 texts...
Finish 4000 texts...
Finish 5000 texts...
Finish 6000 texts...
Finish 7000 texts...
Finish 8000 texts...
Finish 9000 texts...
Finish 10000 texts...
Finish 11000 texts...
Finish 12000 texts...
Finish 13000 texts...
Finish 14000 texts...
Finish 15000 texts...
Finish 16000 texts...
Finish 17000 texts...
Finish 18000 texts...
Finish 19000 texts...
Finish 20000 texts...
Finish 21000 texts...
Finish 22000 texts...
Finish 23000 texts...
Finish 24000 texts...
Finish 25000 texts...
dataset review shape = (25000, 13584)


In [15]:
# Look the 'text' column up once. Evaluating imdb_datasets['test']['text']
# inside the comprehension repeats the column access for every chunk (and once
# more for len()); with `datasets` versions that return a plain list for column
# access, that rebuilds the whole column each time.
imdb_test_reviews = imdb_datasets['test']['text']
# Consecutive chunks of N reviews, processed chunk by chunk.
imdb_test_texts = [imdb_test_reviews[i: i+N] for i in range(0, len(imdb_test_reviews), N)]
imdb_test_results = processReviews(imdb_test_texts)

Finish 1000 texts...
Finish 2000 texts...
Finish 3000 texts...
Finish 4000 texts...
Finish 5000 texts...
Finish 6000 texts...
Finish 7000 texts...
Finish 8000 texts...
Finish 9000 texts...
Finish 10000 texts...
Finish 11000 texts...
Finish 12000 texts...
Finish 13000 texts...
Finish 14000 texts...
Finish 15000 texts...
Finish 16000 texts...
Finish 17000 texts...
Finish 18000 texts...
Finish 19000 texts...
Finish 20000 texts...
Finish 21000 texts...
Finish 22000 texts...
Finish 23000 texts...
Finish 24000 texts...
Finish 25000 texts...
dataset review shape = (25000, 12690)


In [16]:
# Save the processed IMDB training reviews as JSON.
with open('../data/imdb_train_beautify_html.json', mode='w') as out_file:
    json.dump(imdb_train_results, out_file)

In [17]:
# Save the processed IMDB test reviews as JSON.
with open('../data/imdb_test_beautify_html.json', mode='w') as out_file:
    json.dump(imdb_test_results, out_file)