In [72]:
import pandas as pd
import numpy as np
import re as re

#### 1. read whole dataset

In [73]:
df_all = pd.read_csv('../data/raw/amazon_review.csv')

#### 2. positive or negative

In [74]:
# 1 positive, 0 negative review
def score_round(x):
    if x>=3:
        return 1
    else: 
        return 0

In [75]:
new_df_all = pd.DataFrame({'score': df_all['Score'].apply(score_round), 'text': df_all['Text']})
new_df_all.head()

Unnamed: 0,score,text
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price. There was a wid...


#### 3. clean text

In [76]:
# remove null rows
new_df_all = new_df_all.dropna()
print(f"null rows:\n{new_df_all.isnull().sum()}")

null rows:
score    0
text     0
dtype: int64


In [77]:
# remove duplicates
new_df_all = new_df_all.drop_duplicates()
new_df_all.shape

(393591, 2)

In [78]:
# remove hyperlinks and markup tags
def remove_markup(raw):
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [79]:
# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

In [80]:
# remove emojis
def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    result = emoji_pattern.sub(r'', text)
    return result

In [81]:
# unify whitespace
def unify_whitespace(text):
    result = re.sub(' +', ' ', text)
    return result

In [82]:
# remove symbols
def remove_symbols(text):
    result = re.sub(r"[^a-zA-Z0-9?!.,]+", ' ', text)
    return result

In [83]:
# remove punctuation
def remove_punctuation(text):
    result = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',','))
    return result

In [84]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    result = ' '.join([i for i in text.split() if not i in stop_words])
    return result

In [85]:
# apply all functions
def preprocess(text):
    text = remove_markup(text)
    text = remove_emojis(text)
    text = text.lower()
    text = remove_numbers(text)
    text = unify_whitespace(text)
    text = remove_symbols(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    return text

In [86]:
new_df_all['text'] = new_df_all['text'].apply(preprocess)

In [87]:
new_df_all.head()

Unnamed: 0,score,text
0,1,bought several vitality canned dog food produc...
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection around centuries light pillowy citr...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...


### 4. save to csv

In [101]:
# save all to csv
new_df_all.to_csv('../data/processed/all.csv', index=False)

In [102]:
from sklearn.model_selection import train_test_split

# randomly sample 5000 rows
samples = 5000
df_samples = new_df_all.sample(n=samples, random_state=1)

In [103]:
train = 3000
test = 2000

# small
small_train = 100
small_test = 2000

# big
big_train = 1000
big_test = 2000

In [104]:
# randomly split train 3000, test 2000
train_3k, test_2k = train_test_split(df_samples, test_size=0.4)

train_3k.to_csv(f"../data/processed/train_{train}.csv", index=False)
test_2k.to_csv(f"../data/processed/test_{test}.csv", index=False)

In [105]:
# small train set
train_samll = train_3k.sample(n=100, random_state=1)
test_samll= test_2k

train_samll.to_csv(f"../data/processed/train_{small_train}.csv", index=False)
test_samll.to_csv(f"../data/processed/test_{small_test}.csv", index=False)

In [106]:
# big train set
train_big = train_3k.sample(n=1000, random_state=1)
test_big = test_2k

train_big.to_csv(f"../data/processed/train_{big_train}.csv", index=False)
test_big.to_csv(f"../data/processed/test_{big_test}.csv", index=False)