In [154]:
import pandas as pd
import numpy as np

<h1>YELP restaurants</h1>

In [155]:
from datasets import load_dataset

yelp_data = load_dataset("yelp_polarity")

Reusing dataset yelp_polarity (/home/diego/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

In [156]:
print(f"Total length of polarity yelp test {len(yelp_data['test'])}")
print("")

texts = yelp_data["test"]["text"]
labels = yelp_data["test"]["label"]

d = {"text": texts,
     "labels": labels,
}

df_yelp = pd.DataFrame(data=d)
df_yelp["sentiment"] = df_yelp.apply(lambda row: "positive" if row['labels'] else "negative", axis=1)
df_yelp.head(2)

Total length of polarity yelp test 38000



Unnamed: 0,text,labels,sentiment
0,"Contrary to other reviews, I have zero complai...",1,positive
1,Last summer I had an appointment to get new ti...,0,negative


In [157]:
texts = df_yelp["text"].values
lens = [len(el.split(" ")) for el in texts]
print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_yelp["sentiment"].value_counts()

Max len: 1053
Mean len: 135.18473684210525
Min len: 1


positive    19000
negative    19000
Name: sentiment, dtype: int64

In [167]:
# filter out reviews longer than 2000 and shorter than 10 words
texts = df_yelp["text"].values
df_yelp["review_len"] = [len(el.split(" ")) for el in texts]
df_yelp = df_yelp[(df_yelp["review_len"] <= 2000) & (df_yelp["review_len"] >= 10)].copy()

lens = [len(el) for el in df_yelp["text"].values]
print(f"Max len: {np.max(df_yelp['review_len'].values)}")
print(f"Mean len: {np.mean(df_yelp['review_len'].values)}")
print(f"Min len: {np.min(df_yelp['review_len'].values)}")

df_yelp["labels"].value_counts()

Max len: 1053
Mean len: 137.45791714645316
Min len: 10


0    18770
1    18573
Name: labels, dtype: int64

In [159]:
df_pos = df_yelp[df_yelp["labels"] == 1].copy()
df_neg = df_yelp[df_yelp["labels"] == 0].copy()

## Sample 9k instances - 4.5k positive, 4.5k negative. Train-test split 8k-1k%

In [160]:
seed = 2023

# shuffle data
df_pos = df_pos.sample(frac = 1, replace=False)
df_neg = df_neg.sample(frac = 1, replace=False)

# sample positive and negative labels
data_pos = df_pos.sample(n=4500, replace=False, random_state=seed)
data_neg = df_neg.sample(n=4500, replace=False, random_state=seed)

# sample train-test split positive labels
test_pos = data_pos.sample(n=500, replace=False, random_state=seed)
train_pos = data_pos[~data_pos.index.isin(test_pos.index)]

# sample train-test split negative labels (80-20)%
test_neg = data_neg.sample(n=500, replace=False, random_state=seed)
train_neg = data_neg[~data_neg.index.isin(test_neg.index)]

# build train and test
testset = test_neg.append(test_pos)
trainset = train_neg.append(train_pos)

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")


len test: 1000
len train: 8000


In [161]:
testset.to_csv("../yelp/test.csv", sep="\t", index=False)
trainset.to_csv("../yelp/train.csv", sep="\t", index=False)

In [162]:
def produce_datasets(n, m, out_dir):
     train_set = pd.read_csv("../yelp/train.csv", sep="\t")
     print(len(train_set))

     df_origin = train_set.sample(n=n+m, replace=False, random_state=seed)
     df_n = df_origin.sample(n=n, replace=False, random_state=seed)
     df_seed = df_n.sample(n=m, replace=False, random_state=seed)

     print(f"len origin: {len(df_origin)}")
     print(f"len n_data: {len(df_n)}")
     print(f"len seed: {len(df_seed)}")

     df_origin = trainset.sample(n=n+m, replace=False, random_state=seed)
     df_n = df_origin.sample(n=n, replace=False, random_state=seed)
     df_seed = df_n.sample(n=m, replace=False, random_state=seed)

     print(f"len origin: {len(df_origin)}")
     print(f"len n_data: {len(df_n)}")
     print(f"len seed: {len(df_seed)}")

     df_seed["label_counter"] = [int(not el) for el in df_seed["labels"]]
     df_seed["sentiment_counter"] = ["positive" if x=="negative" else "negative" for x in df_seed["sentiment"]]
     df_seed.rename(columns={"labels": "label_ex",
                             "text": "example",
                             "sentiment": "sentiment_ex"}, inplace=True)

     df_seed["paired_id"] = [i for i in range(len(df_seed))]
     df_seed["counterfactual"] = ["None" for i in range(len(df_seed))]

     df_origin.to_csv(f"../yelp/{out_dir}/origin_data.csv", sep="\t", index=False)
     df_n.to_csv(f"../yelp/{out_dir}/n_data.csv", sep="\t", index=False)
     df_seed.to_csv(f"../yelp/{out_dir}/seed_data.csv", sep="\t", index=False)

     print(df_seed.head(2))

## n=5k & m=2.5k (n=2m)

In [163]:
produce_datasets(5000, 2500, "tr2")

8000
len origin: 7500
len n_data: 5000
len seed: 2500
len origin: 7500
len n_data: 5000
len seed: 2500
                                                 example  label_ex  \
37151  Love Cheddars. We were so excited when they op...         1   
24672  We ordered the poutine. No gravy and was flavo...         0   

      sentiment_ex  review_len  label_counter sentiment_counter  paired_id  \
37151     positive          26              0          negative          0   
24672     negative          38              1          positive          1   

      counterfactual  
37151           None  
24672           None  


In [164]:
produce_datasets(2000, 1000, "tr1")

8000
len origin: 3000
len n_data: 2000
len seed: 1000
len origin: 3000
len n_data: 2000
len seed: 1000
                                                example  label_ex  \
7468  I've stayed here over ten to fifteen times ove...         0   
3861  Perfect Barbers! My sons have been to various ...         1   

     sentiment_ex  review_len  label_counter sentiment_counter  paired_id  \
7468     negative         108              1          positive          0   
3861     positive         164              0          negative          1   

     counterfactual  
7468           None  
3861           None  


The final size of each training set will be m+m+m=2m+m=n+m. The following sets are sampled from train.csv

- origin.csv stores n+m original data points
- m_data.csv
- seed_data.csv is a sample (size m) from n_data

You now need to produce m countefactuals from seed_data!! You will then use n_data.csv and m generated counterfactuals to train your classfier

In [165]:
test = pd.read_csv("/home/diego/counterfactuals-generation/cat_sentiment/yelp/test.csv", sep='\t')
test.head(2)

Unnamed: 0,text,labels,sentiment,review_len
0,The worst Panera experience ever! First my gir...,0,negative,117
1,What is wrong with you people. how hard is it...,0,negative,111


In [166]:
len(test)

1000