In [110]:
import pandas as pd
import numpy as np

<h1>YELP restaurants</h1>

In [111]:
from datasets import load_dataset

yelp_data = load_dataset("yelp_polarity")

Reusing dataset yelp_polarity (/home/diego/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

In [112]:
print(f"Total length of polarity yelp test {len(yelp_data['test'])}")
print("")

texts = yelp_data["test"]["text"]
labels = yelp_data["test"]["label"]

d = {"text": texts,
     "label": labels,
}

df_yelp = pd.DataFrame(data=d)
df_yelp["sentiment"] = df_yelp.apply(lambda row: "positive" if row['label'] else "negative", axis=1)
df_yelp.head(2)

Total length of polarity yelp test 38000



Unnamed: 0,text,label,sentiment
0,"Contrary to other reviews, I have zero complai...",1,positive
1,Last summer I had an appointment to get new ti...,0,negative


In [113]:
texts = df_yelp["text"].values
lens = [len(el.split(" ")) for el in texts]
print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_yelp["sentiment"].value_counts()

Max len: 1053
Mean len: 135.18473684210525
Min len: 1


positive    19000
negative    19000
Name: sentiment, dtype: int64

In [114]:
# filter out reviews longer than 2000 and shorter than 10 words
texts = df_yelp["text"].values
df_yelp["review_len"] = [len(el.split(" ")) for el in texts]
df_yelp = df_yelp[(df_yelp["review_len"] <= 2000) & (df_yelp["review_len"] >= 10)].copy()

lens = [len(el) for el in df_yelp["text"].values]
print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_yelp["label"].value_counts()

Max len: 5107
Mean len: 735.9356773692526
Min len: 39


0    18770
1    18573
Name: label, dtype: int64

In [115]:
df_yelp["label_counter"] = [int(not el) for el in df_yelp["label"]]
df_yelp["sentiment_counter"] = ["positive" if x=="negative" else "negative" for x in df_yelp["sentiment"]]
df_yelp.rename(columns={"label": "label_ex",
                        "text": "example",
                        "sentiment": "sentiment_ex"}, inplace=True)
df_yelp.head(2)

Unnamed: 0,example,label_ex,sentiment_ex,review_len,label_counter,sentiment_counter
0,"Contrary to other reviews, I have zero complai...",1,positive,126,0,negative
1,Last summer I had an appointment to get new ti...,0,negative,72,1,positive


In [116]:
df_pos = df_yelp[df_yelp["label_ex"] == 1].copy()
df_neg = df_yelp[df_yelp["label_ex"] == 0].copy()

## Sample 10k instances - 5k positive, 5k negative. Train-test split 80%-20%

In [117]:
seed = 2023
# sample positive and negative labels
data_pos = df_pos.sample(n=5000, replace=False, random_state=seed)
data_neg = df_neg.sample(n=5000, replace=False, random_state=seed)

# sample train-test split positive labels (80-20)%
test_pos = data_pos.sample(frac=0.2, replace=False, random_state=seed)
train_pos = data_pos[~data_pos.index.isin(test_pos.index)]

# sample train-test split negative labels (80-20)%
test_neg = data_neg.sample(frac=0.2, replace=False, random_state=seed)
train_neg = data_neg[~data_neg.index.isin(test_neg.index)]

# build train and test
testset = test_neg.append(test_pos)
trainset = train_neg.append(train_pos)

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")


len test: 2000
len train: 8000


In [118]:
testset.to_csv("../yelp/test.csv", sep="\t", index=False)
trainset.to_csv("../yelp/train.csv", sep="\t", index=False)

In [119]:
def produce_datasets(n, m, out_dir):
     train_set = pd.read_csv("../yelp/train.csv", sep="\t")
     print(len(train_set))

     df_origin = train_set.sample(n=n+m, replace=False, random_state=seed)
     df_n = df_origin.sample(n=n, replace=False, random_state=seed)
     df_seed = df_n.sample(n=m, replace=False, random_state=seed)

     print(f"len origin: {len(df_origin)}")
     print(f"len n_data: {len(df_n)}")
     print(f"len seed: {len(df_seed)}")

     df_origin = trainset.sample(n=n+m, replace=False, random_state=seed)
     df_n = df_origin.sample(n=n, replace=False, random_state=seed)
     df_seed = df_n.sample(n=m, replace=False, random_state=seed)

     print(f"len origin: {len(df_origin)}")
     print(f"len n_data: {len(df_n)}")
     print(f"len seed: {len(df_seed)}")

     df_seed["paired_id"] = [i for i in range(len(df_seed))]
     df_seed["counterfactual"] = ["None" for i in range(len(df_seed))]

     df_origin.to_csv(f"../yelp/{out_dir}/origin_data.csv", sep="\t", index=False)
     df_n.to_csv(f"../yelp/{out_dir}/n_data.csv", sep="\t", index=False)
     df_seed.to_csv(f"../yelp/{out_dir}/seed_data.csv", sep="\t", index=False)

## n=5k & m=2.5k (n=2m)

In [120]:
produce_datasets(5000, 2500, "n_5k-m_2.5k")

8000
len origin: 7500
len n_data: 5000
len seed: 2500
len origin: 7500
len n_data: 5000
len seed: 2500


In [121]:
produce_datasets(2000, 1000, "n_2k-m_1k")

8000
len origin: 3000
len n_data: 2000
len seed: 1000
len origin: 3000
len n_data: 2000
len seed: 1000


The final size of each training set will be m+m+m=2m+m=n+m. The following sets are sampled from train.csv

- origin.csv stores n+m original data points
- m_data.csv
- seed_data.csv is a sample (size m) from n_data

You now need to produce m countefactuals from seed_data!! You will then use n_data.csv and m generated counterfactuals to train your classfier