In [91]:
import pandas as pd
import numpy as np
import re as re
import gzip
import json

#### 1. Read the whole dataset

In [92]:
### load the review data
# Each line of the gzip archive is one JSON record; parse them all into a
# list of dicts and build a single DataFrame from that list.
with gzip.open('../data/Video_Games_5.json.gz') as f:
    data = [json.loads(line.strip()) for line in f]

df_all = pd.DataFrame.from_dict(data)

#### 2. positive or negative

In [93]:
# Binary sentiment label: 1 = positive review, 0 = negative review
def score_round(x):
    """Map a numeric rating to a binary sentiment label.

    Returns 1 when ``x >= 3`` holds and 0 otherwise (so any value for which
    the comparison is false, such as NaN, is labelled 0).
    """
    return 1 if x >= 3 else 0
    
# Two-column working frame: binary label derived from 'overall', plus the raw review text
new_df_all = pd.DataFrame(
    {
        'score': df_all['overall'].apply(score_round),
        'text': df_all['reviewText'],
    }
)

#### 3. clean text

In [94]:
# Drop every row that has a missing score or text, then print the per-column
# null count as a sanity check (it should be all zeros after dropna).
new_df_all = new_df_all.dropna()
print(f"null rows:\n{new_df_all.isna().sum()}")

# Drop rows that are exact duplicates across all columns and show the new shape
new_df_all = new_df_all.drop_duplicates()
new_df_all.shape

null rows:
score    0
text     0
dtype: int64


(401694, 2)

In [95]:
# remove hyperlinks and markup tags
def remove_markup(raw):
    """Strip hyperlinks, a few HTML tags and HTML entities from ``raw``.

    The substitutions are applied in order:

    * a whole ``<a ...>...</a>`` element becomes the placeholder ``Link.``
    * ``&gt;`` and ``&#62;`` are removed
    * ``&#x27;`` becomes ``'`` and ``&quot;`` becomes ``"``
    * ``&#x2F;``, ``<p>`` and ``<i>`` become a single space
    * ``</i>`` is removed
    * newlines become a single space

    Args:
        raw: the text to clean (a ``str``).

    Returns:
        The cleaned string.
    """
    substitutions = (
        # ``\b`` restricts the match to real anchor tags; without it the
        # pattern would also start at tags such as <abbr> or <address> and
        # swallow everything up to the next </a>.
        (r"<a\b[^>]*>(.+?)</a>", 'Link.'),
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        # Replace newlines with a space (not an empty string) so the last
        # word of one line is not glued to the first word of the next.
        ("\n", ' '),
    )
    result = raw
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

# remove numbers
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# remove emojis
def remove_emojis(text):
    """Delete every character that falls in the code-point ranges below.

    NOTE(review): the last range runs from U+24C2 all the way to U+1F251, so
    besides emoji it also matches many non-emoji characters in between
    (CJK ideographs, for example). Behaviour is kept exactly as it was.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

# unify whitespace
def unify_whitespace(text):
    """Collapse each run of space characters into a single space.

    Only the space character is matched; tabs and newlines are left alone.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

# remove symbols
def remove_symbols(text):
    """Replace each run of disallowed characters with one space.

    Allowed characters are ASCII letters, digits and ``? ! . ,``; everything
    else (including whitespace itself) is part of a run that becomes a
    single space.
    """
    disallowed_run = re.compile(r"[^a-zA-Z0-9?!.,]+")
    return disallowed_run.sub(' ', text)

# remove punctuation
def remove_punctuation(text):
    """Return ``text`` without the characters ``? . ; : ! " ,``."""
    dropped = ("?", ".", ";", ":", "!", '"', ',')
    kept = [ch for ch in text if ch not in dropped]
    return "".join(kept)

# remove stopwords
import nltk
from nltk.corpus import stopwords
# NLTK's English stop-word list, stored as a set for membership tests
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token that appears in ``stop_words``.

    The surviving tokens are re-joined with single spaces. Matching is exact,
    so the caller is responsible for any case normalisation.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# apply all functions
def preprocess(text):
    """Run the active cleaning steps over one review string.

    Order: strip markup, strip emojis, lower-case, strip digits, then replace
    disallowed symbols with spaces. ``unify_whitespace``,
    ``remove_punctuation`` and ``remove_stopwords`` are defined alongside but
    are not applied here.
    """
    lowered = remove_emojis(remove_markup(text)).lower()
    without_digits = remove_numbers(lowered)
    return remove_symbols(without_digits)

In [96]:
# Clean every review, count its whitespace-separated tokens, and keep only
# the reviews that still have at least five tokens after cleaning.
new_df_all['text'] = new_df_all['text'].map(preprocess)
new_df_all['length'] = new_df_all['text'].map(lambda cleaned: len(cleaned.split()))
new_df_all = new_df_all[new_df_all['length'] >= 5]
new_df_all

Unnamed: 0,score,text,length
0,1,"this game is a bit hard to get the hang of, bu...",18
1,1,i played it a while but it was alright. the st...,65
3,0,"found the game a bit too complicated, not what...",17
4,1,"great game, i love it and have played it since...",12
5,1,i liked a lot some time that i haven t play a ...,22
...,...,...,...
497571,1,"this work great and easy to replace, i would n...",24
497572,1,not oem but good replacement parts,6
497574,1,this does add some kids room things that are v...,69
497575,1,i think i originally began playing bioshock se...,776


#### 4. Save to CSV

In [97]:
# save all to csv
# Keep just the label and the cleaned text (the helper 'length' column is
# dropped) and write the full cleaned set without the index.
new_df_all = new_df_all.loc[:, ['score', 'text']]
new_df_all.to_csv('../data/processed/all.csv', index=False)

In [98]:
# Fixed seed so every sampling / splitting step below is repeatable
random_state = 42

# Split the cleaned reviews by label
new_df_all_pos = new_df_all[new_df_all['score'] == 1]
new_df_all_neg = new_df_all[new_df_all['score'] == 0]

# Draw a balanced labelled sample: 2500 positive + 2500 negative reviews
new_df_all_pos_selected = new_df_all_pos.sample(n=2500, random_state=random_state)
new_df_all_neg_selected = new_df_all_neg.sample(n=2500, random_state=random_state)

df_samples = pd.concat([new_df_all_pos_selected, new_df_all_neg_selected])



In [99]:
# Create unsupervised data for UDA

# Exclude the rows already drawn for the labelled sample so the two sets do not overlap
new_df_all_pos_not_selected = new_df_all_pos.drop(new_df_all_pos_selected.index)
new_df_all_neg_not_selected = new_df_all_neg.drop(new_df_all_neg_selected.index)

# 10000 reviews per class, negatives first; only the text column is kept (labels dropped)
unsup_neg = new_df_all_neg_not_selected.sample(n=10000, random_state=random_state)
unsup_pos = new_df_all_pos_not_selected.sample(n=10000, random_state=random_state)
unsup = pd.concat([unsup_neg, unsup_pos])[['text']]
unsup.to_csv(f"../data/processed/unsup_20000.csv", index=False)
unsup

Unnamed: 0,text
199269,risen would have been a decent game if it came...
120739,tired of a mediocre at best game freezing on m...
101008,the pink is cute but the items look cheap so i...
129690,before you randomly rate this review please ju...
424478,collector here. arrived in really bad conditio...
...,...
124099,"when you said new you meant it, it was still i..."
240219,kick butt game lots of fun.
462868,we all love this orange raging rodent on the p...
42542,the gameboy advance is my favorite gaming devi...


In [100]:
from sklearn.model_selection import train_test_split

# Split sizes. `train` is passed to train_test_split; `test` is only used in
# the output file name (the test split is whatever remains after `train` rows).
train = 3000
test = 2000

# small
small_train = 100
small_test = 2000

# big
big_train = 1000
big_test = 2000

# randomly split train 3000, test 2000
train_3k, test_2k = train_test_split(
    df_samples, train_size=train, random_state=random_state
)

train_3k.to_csv(f"../data/processed/train_{train}.csv", index=False)
test_2k.to_csv(f"../data/processed/test_{test}.csv", index=False)

In [101]:
# small train set
# Draw `small_train` rows from the 3000-row training split. Using the constant
# (rather than a repeated literal) keeps the sample size in sync with the size
# embedded in the output file name.
train_samll = train_3k.sample(n=small_train, random_state=random_state)
# The test split is reused unchanged.
test_samll = test_2k

train_samll.to_csv(f"../data/processed/train_{small_train}.csv", index=False)
test_samll.to_csv(f"../data/processed/test_{small_test}.csv", index=False)

In [102]:
# big train set
# Draw `big_train` rows from the 3000-row training split. Using the constant
# (rather than a repeated literal) keeps the sample size in sync with the size
# embedded in the output file name.
train_big = train_3k.sample(n=big_train, random_state=random_state)
# The test split is reused unchanged.
test_big = test_2k

train_big.to_csv(f"../data/processed/train_{big_train}.csv", index=False)
test_big.to_csv(f"../data/processed/test_{big_test}.csv", index=False)