In [None]:
import pandas as pd
import numpy as np
import os

<h1>YELP restaurants</h1>

In [None]:
from datasets import load_dataset

# Fetch the "yelp_polarity" dataset through the `datasets` loader; the
# result is indexed by split name (only the "test" split is used below).
yelp_data = load_dataset(path="yelp_polarity")

In [None]:
# Build a DataFrame from the "test" split of the polarity dataset and add a
# human-readable sentiment column next to the numeric label.
print(f"Total length of polarity yelp test {len(yelp_data['test'])}")
print("")

texts = yelp_data["test"]["text"]
labels = yelp_data["test"]["label"]

d = {"text": texts,
     "label": labels,
}

df_yelp = pd.DataFrame(data=d)
# Vectorised label -> sentiment mapping (truthy label => "positive").
# Replaces a row-wise DataFrame.apply(axis=1), which invokes a Python lambda
# once per row and is needlessly slow for a plain two-way mapping.
df_yelp["sentiment"] = np.where(df_yelp["label"].astype(bool), "positive", "negative")
df_yelp.head(2)

In [None]:
# Character-length summary of the unfiltered reviews, then the class balance.
texts = df_yelp["text"].values
lens = list(map(len, texts))
print(
    f"Max len: {np.max(lens)}",
    f"Mean len: {np.mean(lens)}",
    f"Min len: {np.min(lens)}",
    sep="\n",
)

df_yelp["sentiment"].value_counts()

In [None]:
# Keep only reviews whose character count lies in the closed range [10, 2000].
texts = df_yelp["text"].values
df_yelp["review_len"] = list(map(len, texts))
# Series.between is inclusive on both ends by default.
df_yelp = df_yelp[df_yelp["review_len"].between(10, 2000)].copy()

# The stored lengths of the surviving rows are exactly len(text) per row,
# so reuse them for the post-filter summary.
lens = df_yelp["review_len"].tolist()
print(
    f"Max len: {np.max(lens)}",
    f"Mean len: {np.mean(lens)}",
    f"Min len: {np.min(lens)}",
    sep="\n",
)

df_yelp["label"].value_counts()

In [None]:
# Partition the filtered reviews by label: 1 -> df_pos, 0 -> df_neg.
# Independent copies so later operations cannot touch df_yelp.
df_pos = df_yelp.loc[df_yelp["label"].eq(1)].copy()
df_neg = df_yelp.loc[df_yelp["label"].eq(0)].copy()

## Sample 10k instances - 5k positive, 5k negative. Train-test split 80%-20%

In [None]:
# Fixed seed so every sampling step below is reproducible.
seed = 2023
# sample positive and negative labels (5000 each, without replacement)
data_pos = df_pos.sample(n=5000, replace=False, random_state=seed)
data_neg = df_neg.sample(n=5000, replace=False, random_state=seed)

# sample train-test split positive labels (80-20)%
test_pos = data_pos.sample(frac=0.2, replace=False, random_state=seed)
# everything not drawn into the test sample becomes training data
train_pos = data_pos[~data_pos.index.isin(test_pos.index)]

# sample train-test split negative labels (80-20)%
test_neg = data_neg.sample(frac=0.2, replace=False, random_state=seed)
train_neg = data_neg[~data_neg.index.isin(test_neg.index)]

# build train and test
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order (negatives first, then positives) and keeps the row index.
testset = pd.concat([test_neg, test_pos])
trainset = pd.concat([train_neg, train_pos])

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")

In [None]:
# Persist the train/test split as tab-separated files (no index column).
# to_csv raises if the target directory is missing, so create it first.
os.makedirs("../yelp", exist_ok=True)
testset.to_csv("../yelp/test.csv", sep="\t", index=False)
trainset.to_csv("../yelp/train.csv", sep="\t", index=False)

## n=5k & m=2.5k (n=2m)

In [None]:
# Subset sizes for this configuration: m = 2.5k and n = 2m = 5k.
m = 2500
n = 2 * m

In [None]:
# Reload the tab-separated training split from disk and show its row count.
trainset = pd.read_csv("../yelp/train.csv", delimiter="\t")
len(trainset)

In [None]:
# Nested sampling, all without replacement and with the same seed:
#   trainset -> df_origin (n + m rows) -> df_n (n rows) -> df_seed (m rows)
df_origin = trainset.sample(n=n + m, replace=False, random_state=seed)
df_n = df_origin.sample(n=n, replace=False, random_state=seed)
df_seed = df_n.sample(n=m, replace=False, random_state=seed)

print(
    f"len origin: {len(df_origin)}",
    f"len n_data: {len(df_n)}",
    f"len seed: {len(df_seed)}",
    sep="\n",
)

In [None]:
# Persist the three sampled subsets as tab-separated files (no index column).
# to_csv raises if the target directory is missing, so create it first.
os.makedirs("../yelp/n_5k-m_2.5k", exist_ok=True)
df_origin.to_csv("../yelp/n_5k-m_2.5k/origin_data.csv", sep="\t", index=False)
df_n.to_csv("../yelp/n_5k-m_2.5k/n_data.csv", sep="\t", index=False)
df_seed.to_csv("../yelp/n_5k-m_2.5k/seed_data.csv", sep="\t", index=False)

The final size of each training set will be m+m+m=2m+m=n+m. The following sets are sampled from train.csv

- origin_data.csv stores n+m original data points
- n_data.csv is a sample (size n) from origin_data
- seed_data.csv is a sample (size m) from n_data

You now need to produce m counterfactuals from seed_data!! You will then use n_data.csv and the m generated counterfactuals to train your classifier