In [1]:
import pandas as pd
import numpy as np

Load old data

In [2]:
# Read the three splits of the 2022-04-20 dataset (train, valid, test) in one go.
old_train, old_valid, old_test = map(
    pd.read_csv,
    (
        "./20220420_amazon_reviews_train.csv",
        "./20220420_amazon_reviews_valid.csv",
        "./20220420_amazon_reviews_test.csv",
    ),
)

Combine train and valid sets

In [3]:
# Stack the old train and valid rows into one frame; the original row labels
# of each split are kept (the index is not reset).
old_splits = (old_train, old_valid)
old_data = pd.concat(old_splits, axis=0)
old_data.shape

(15115, 7)

In [4]:
# Preview the first five rows of the combined old data.
old_data.head(n=5)

Unnamed: 0,category,prod_id,rating,polarity,review,review_len,review_id
0,AMAZON_FASHION,B001LFP0EO,2.0,negative,The bandit wouldn't stay put...at all. How co...,88,0
1,AMAZON_FASHION,B001LFP0EO,5.0,positive,I am wearing my Bandit now. It made my belly g...,39,1
2,AMAZON_FASHION,B001LFP0EO,1.0,negative,i was so excited to get to order this while i ...,189,2
3,AMAZON_FASHION,B001LFP0EO,5.0,positive,I was very skeptical about purchasing this but...,144,3
4,AMAZON_FASHION,B001LFP0EO,1.0,negative,I ordered the XL in the Belly Bandit because t...,112,4


In [5]:
# Number of reviews per category, most frequent first.
old_data["category"].value_counts(sort=True, ascending=False)

Software                      962
Home_and_Kitchen              920
Patio_Lawn_and_Garden         905
Pet_Supplies                  862
Sports_and_Outdoors           860
Musical_Instruments           858
Tools_and_Home_Improvement    854
Books                         847
Electronics                   815
Arts_Crafts_and_Sewing        807
Automotive                    803
All_Beauty                    802
Industrial_and_Scientific     801
Office_Products               800
CDs_and_Vinyl                 563
Grocery_and_Gourmet_Food      539
AMAZON_FASHION                352
Luxury_Beauty                 306
Prime_Pantry                  303
Clothing_Shoes_and_Jewelry    276
Toys_and_Games                254
Video_Games                   229
Movies_and_TV                 224
Appliances                    173
Name: category, dtype: int64

Keep only a few categories from the old data

In [6]:
# Categories from the old dataset that are carried over into the new one.
KEPT_CATEGORIES = ["Software", "Pet_Supplies", "All_Beauty"]

# Vectorized membership test instead of a per-row Python lambda; re-running
# this cell is harmless because filtering an already-filtered frame is a no-op.
old_data = old_data.loc[old_data["category"].isin(KEPT_CATEGORIES)]
old_data.shape

(2626, 7)

Load new data

In [7]:
# Read the train and valid splits of the 2022-12-04 dataset.
# NOTE(review): these two paths are the same files the final "Save data" cell
# writes to, so after one full run a Restart & Run All reads the already-merged
# output instead of the raw input. Keep a pristine copy of the raw files.
new_train, new_valid = map(
    pd.read_csv,
    (
        "./20221204_amazon_reviews_train.csv",
        "./20221204_amazon_reviews_valid.csv",
    ),
)

Combine train and valid sets

In [8]:
# Stack the new train and valid rows into one frame; the original row labels
# of each split are kept (the index is not reset).
new_splits = (new_train, new_valid)
new_data = pd.concat(new_splits, axis=0)
new_data.shape

(17376, 7)

In [9]:
# Preview the first five rows of the combined new data.
new_data.head(n=5)

Unnamed: 0,category,prod_id,rating,polarity,review,review_len,review_id
0,Cell_Phones_and_Accessories,B00009PGN0,4.0,positive,"I just got this, so maybe I haven't given it a...",104,0
1,Cell_Phones_and_Accessories,B00009PGN0,3.0,neutral,this is not really an out standing phone.that ...,36,1
2,Cell_Phones_and_Accessories,B00009PGN0,5.0,positive,my dad and grandfather have this phone have th...,162,2
3,Cell_Phones_and_Accessories,B00009PGN0,2.0,negative,I've had this phone for an extremely unfortuna...,97,3
4,Cell_Phones_and_Accessories,B00009PGN0,1.0,negative,I had this phone for about 5 days before I wan...,87,4


Identify the prod_ids that appear in the old test set

In [10]:
# Distinct product ids present in the old test split (order of first appearance).
old_test_pid = pd.unique(old_test["prod_id"])

Remove old test prod_ids from new_data

In [11]:
# Drop every review whose product also occurs in the old test set, so the
# reused test split stays unseen by the new train/valid data.
# `astype(str)` mirrors the original `str(x)` comparison; `isin` replaces a
# per-row Python `in` scan over the NumPy array (O(rows * test products)).
is_test_product = new_data["prod_id"].astype(str).isin(old_test_pid)
new_data = new_data.loc[~is_test_product]
new_data.shape

(11490, 7)

Combine old and new data

In [12]:
# Append the filtered new reviews to the retained old-category reviews.
merged_sources = (old_data, new_data)
data = pd.concat(merged_sources, axis=0)
data.shape

(14116, 7)

In [21]:
# Share of the combined dataset that comes from the retained old categories.
# Computed from the frames rather than from hand-copied category counts so it
# stays correct if the inputs or the kept categories change.
old_data.shape[0] / data.shape[0]

0.18603003683763106

Show stats

In [13]:
# Number of reviews per category in the combined data, most frequent first.
data["category"].value_counts(sort=True, ascending=False)

Cell_Phones_and_Accessories    7098
Kindle_Store                   4392
Software                        962
Pet_Supplies                    862
All_Beauty                      802
Name: category, dtype: int64

Split into train and valid sets

In [22]:
# Split the reviews into train/valid *by product*, separately for each
# category, so that all reviews of one product land in exactly one split.
VALID_FRAC = 0.2  # share of each category's products held out for validation
SPLIT_SEED = 42   # fixed seed so Restart & Run All reproduces the same split

rng = np.random.default_rng(SPLIT_SEED)
train_parts = []
valid_parts = []
for catg in data["category"].unique():
    # Work on this category's rows only: testing membership against the whole
    # frame would pull in (and duplicate) rows of a prod_id that is also
    # listed under another category.
    catg_revs = data.loc[data["category"] == catg]

    # Shuffle the category's distinct products; the first ceil(20%) go to valid.
    prod_ids = rng.permutation(catg_revs["prod_id"].unique())
    n_valid = int(np.ceil(prod_ids.shape[0] * VALID_FRAC))

    in_valid = catg_revs["prod_id"].isin(prod_ids[:n_valid])
    in_train = catg_revs["prod_id"].isin(prod_ids[n_valid:])
    # Every review of the category must be assigned to exactly one split.
    assert (in_valid ^ in_train).all()

    valid_parts.append(catg_revs.loc[in_valid])
    train_parts.append(catg_revs.loc[in_train])

# Concatenate once instead of growing the frames inside the loop.
train = pd.concat(train_parts)
valid = pd.concat(valid_parts)

# Guard against leakage: no product may appear in both splits.
assert set(train["prod_id"]).isdisjoint(valid["prod_id"]), "prod_id present in both train and valid"

train.shape, valid.shape

((11096, 7), (3020, 7))

In [23]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,category,prod_id,rating,polarity,review,review_len,review_id
281,All_Beauty,B005U5KU46,5.0,positive,"These are very thin, pull off easily without r...",45,0
282,All_Beauty,B005U5KU46,1.0,negative,sweat went right thought within almost 2 hrs o...,14,1
283,All_Beauty,B005U5KU46,2.0,negative,So far they haven't really worked - won't stic...,15,2
284,All_Beauty,B005U5KU46,5.0,positive,"I have tried several brands of shields, and mo...",46,3
285,All_Beauty,B005U5KU46,1.0,negative,They do not stick well to the fabric. Bunch up...,12,4


In [24]:
# Preview the first five rows of the validation split.
valid.head(n=5)

Unnamed: 0,category,prod_id,rating,polarity,review,review_len,review_id
323,All_Beauty,B00G6S8UY8,4.0,positive,Great fun product. Looking forward to the nex...,16,0
324,All_Beauty,B00G6S8UY8,1.0,negative,What a waste of money..this product is waaaaay...,114,1
325,All_Beauty,B00G6S8UY8,2.0,negative,I wouldn't recommend this product. It's just ...,13,2
326,All_Beauty,B00G6S8UY8,2.0,negative,"pens work beautifull, but after 3 days someti...",38,3
327,All_Beauty,B00G6S8UY8,2.0,negative,I was disappointed with these. They're basical...,39,4


Save data

In [28]:
# Write the three splits to disk without the row index. The old test split is
# saved unchanged under the new date prefix.
# NOTE(review): the train/valid paths below are the same files loaded in the
# "Load new data" cell, so this overwrites the notebook's own inputs; a second
# full run would merge the old categories into already-merged data.
outputs = {
    "./20221204_amazon_reviews_train.csv": train,
    "./20221204_amazon_reviews_valid.csv": valid,
    "./20221204_amazon_reviews_test.csv": old_test,
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)