In [25]:
from datasets import load_dataset
import pandas as pd
import numpy as np

<h1>Amazon reviews</h1>

In [26]:
amazon_data = load_dataset("mteb/amazon_polarity")

Using custom data configuration mteb--amazon_polarity-f33df51a02e26041
Reusing dataset json (/home/diego/.cache/huggingface/datasets/json/mteb--amazon_polarity-f33df51a02e26041/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
print(f"Total length of polarity amazon test {len(amazon_data['test'])}")
print(f"Total length of polarity amazon train {len(amazon_data['train'])}")

Total length of polarity amazon test 400000
Total length of polarity amazon train 3600000


In [28]:
amazon_data["test"]

Dataset({
    features: ['label', 'text', 'label_text'],
    num_rows: 400000
})

We will only consider the Amazon test set to proceed

In [29]:
data = amazon_data['test']
text = [i for i in data["text"]]
labels = [i for i in data["label"]]
d = {"text": text,
     "labels": labels}
amazon_data = pd.DataFrame(data=d)
amazon_data["sentiment"] = amazon_data.apply(lambda row: "positive" if row['labels'] else "negative", axis=1)
amazon_data.head(2)

Unnamed: 0,text,labels,sentiment
0,Great CD\n\nMy lovely Pat has one of the GREAT...,1,positive
1,One of the best game music soundtracks - for a...,1,positive


randomly sample 30k instances. Filter out long and short reviews.

In [30]:
# sample records
df_amazon = amazon_data.sample(n=30000, replace=False)

# filter out reviews longer than 2000 and shorter than 10 words
texts = df_amazon["text"].values
df_amazon["review_len"] = [len(el.split(" ")) for el in texts]
df_amazon = df_amazon[(df_amazon["review_len"] <= 2000) & (df_amazon["review_len"] >= 10)].copy()

lens = [len(el) for el in df_amazon["text"].values]
print(f"Max len: {np.max(df_amazon['review_len'].values)}")
print(f"Mean len: {np.mean(df_amazon['review_len'].values)}")
print(f"Min len: {np.min(df_amazon['review_len'].values)}")

print(f"df len:{len(df_amazon)}")
df_amazon["labels"].value_counts()

Max len: 211
Mean len: 77.27396666666667
Min len: 13
df len:30000


1    15080
0    14920
Name: labels, dtype: int64

In [31]:
df_pos = df_amazon[df_amazon["labels"] == 1].copy()
df_neg = df_amazon[df_amazon["labels"] == 0].copy()

## Sample train-test split 7.5k-1k

In [32]:
seed = 2023

# shuffle data
df_pos = df_pos.sample(frac = 1, replace=False)
df_pos.reset_index(inplace=True)
df_neg = df_neg.sample(frac = 1, replace=False)
df_neg.reset_index(inplace=True)

# sample positive and negative labels
data_pos = df_pos.sample(n=4250, replace=False, random_state=seed)
data_neg = df_neg.sample(n=4250, replace=False, random_state=seed)


# sample train-test split positive labels
test_pos = data_pos.sample(n=500, replace=False, random_state=seed)
train_pos = data_pos[~data_pos.index.isin(test_pos.index)]

# sample train-test split negative labels
test_neg = data_neg.sample(n=500, replace=False, random_state=seed)
train_neg = data_neg[~data_neg.index.isin(test_neg.index)]

# build train and test
testset = pd.concat([test_neg, test_pos])
trainset = pd.concat([train_neg, train_pos])

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")


len test: 1000
len train: 7500


In [33]:
testset.to_csv("../amazon/test.csv", sep="\t", index=False)
trainset.to_csv("../amazon/train.csv", sep="\t", index=False)

In [34]:
def produce_datasets(n, m, out_dir):
    train_set = pd.read_csv("../amazon/train.csv", sep="\t")
    print(len(train_set))

    df_origin = train_set.sample(n=n+m, replace=False, random_state=seed)
    df_n = df_origin.sample(n=n, replace=False, random_state=seed)
    df_seed = df_n.sample(n=m, replace=False, random_state=seed)

    print(f"len origin: {len(df_origin)}")
    print(f"len n_data: {len(df_n)}")
    print(f"len seed: {len(df_seed)}")

    df_origin = trainset.sample(n=n+m, replace=False, random_state=seed)
    df_n = df_origin.sample(n=n, replace=False, random_state=seed)
    df_seed = df_n.sample(n=m, replace=False, random_state=seed)

    print(f"len origin: {len(df_origin)}")
    print(f"len n_data: {len(df_n)}")
    print(f"len seed: {len(df_seed)}")

    df_seed["label_counter"] = [int(not el) for el in df_seed["labels"]]
    df_seed["sentiment_counter"] = ["positive" if x=="negative" else "negative" for x in df_seed["sentiment"]]
    df_seed.rename(columns={"labels": "label_ex",
                            "text": "example",
                            "sentiment": "sentiment_ex"}, inplace=True)

    df_seed["paired_id"] = [i for i in range(len(df_seed))]
    df_seed["counterfactual"] = ["None" for i in range(len(df_seed))]

    df_origin.to_csv(f"../amazon/{out_dir}/origin_data.csv", sep="\t", index=False)
    df_n.to_csv(f"../amazon/{out_dir}/n_data.csv", sep="\t", index=False)
    df_seed.to_csv(f"../amazon/{out_dir}/seed_data.csv", sep="\t", index=False)

## n=5k & m=2.5k (n=2m)

In [35]:
produce_datasets(5000, 2500, "tr2")

7500
len origin: 7500
len n_data: 5000
len seed: 2500
len origin: 7500
len n_data: 5000
len seed: 2500


In [36]:
produce_datasets(2000, 1000, "tr1")

7500
len origin: 3000
len n_data: 2000
len seed: 1000
len origin: 3000
len n_data: 2000
len seed: 1000


In [37]:
produce_datasets(3000, 3000, "tr3")

7500
len origin: 6000
len n_data: 3000
len seed: 3000
len origin: 6000
len n_data: 3000
len seed: 3000
