In [1]:
from sklearn.datasets import load_files
import pandas as pd
import bs4
import numpy as np

First, compute some length statistics for cad_imdb, the dataset that was used to build our generator.

In [2]:
# Load the CAD IMDb training split (fold 0, tab-separated) and report the
# character-length distribution of its "example" column.
df_cad = pd.read_csv("../cad_imdb/fold_0/training_set", sep="\t")
cad_texts = df_cad["example"].values

# One length (in characters) per example.
lens = list(map(len, cad_texts))

print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

Max len: 1955
Mean len: 901.0530440024111
Min len: 80


<h1>Pang IMDb (1000 positive reviews; 1000 negative reviews)</h1>

With only 2,000 reviews we cannot afford to discard any of them by length, so we keep the dataset as it is, acknowledging that its reviews are generally longer than those in CAD IMDb (the dataset used to train the generator).

In [3]:
def strip_html(raw_review):
    """Return the text content of one raw review with all HTML markup removed."""
    return bs4.BeautifulSoup(raw_review, features="lxml").get_text()


# Read the Pang IMDb corpus from disk; `.data` holds the raw documents and
# `.target` the integer class of each one.
movie_data = load_files('../imdb_pang')

# Remove HTML from reviews
reviews = [strip_html(raw_review) for raw_review in movie_data.data]

In [4]:
# Assemble the reviews into the column layout used for generation:
# text, its label, the flipped (counterfactual) label, a running id and a
# placeholder counterfactual text.
n_reviews = len(reviews)
paired_id = list(range(n_reviews))
# Placeholder string; per the original note, this column makes the generation compliant.
counter = ["None"] * n_reviews

d = {
    "example": reviews,
    "label_ex": movie_data.target,
    # logical_not flips 0 <-> 1 (any non-zero label becomes 0)
    "label_counter": np.logical_not(movie_data.target).astype(int),
    "paired_id": paired_id,
    "counterfactual": counter,
}
df_reviews = pd.DataFrame(data=d)

# Character-length statistics of the reviews.
lens = list(map(len, df_reviews["example"]))
print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_reviews.head(1)

Max len: 14957
Mean len: 3892.8675
Min len: 91


Unnamed: 0,example,label_ex,label_counter,paired_id,counterfactual
0,arnold schwarzenegger has been an icon for act...,0,1,0,


In [5]:
# Split the reviews by class: label 1 -> positives, label 0 -> negatives.
# Copies are taken so the two frames are independent of df_reviews.
df_pos = df_reviews.loc[df_reviews["label_ex"].eq(1)].copy()
df_neg = df_reviews.loc[df_reviews["label_ex"].eq(0)].copy()

print(f"len positive df: {len(df_pos)}")
print(f"len negative df: {len(df_neg)}")

len positive df: 1000
len negative df: 1000


In [6]:
seed = 2022

# Balanced 75/25 train-test split: the same fraction is sampled (without
# replacement, same seed) from each class, so both splits keep the class
# ratio of the full dataset.
test_pos = df_pos.sample(frac=0.25, replace=False, random_state=seed)
# Every positive row that was not drawn for the test set goes to train.
train_pos = df_pos[~df_pos.index.isin(test_pos.index)]
print(f"len positive test: {len(test_pos)}")
print(f"len positive train: {len(train_pos)}")

print()

# Same procedure for the negative class.
test_neg = df_neg.sample(frac=0.25, replace=False, random_state=seed)
train_neg = df_neg[~df_neg.index.isin(test_neg.index)]
print(f"len negative test: {len(test_neg)}")
print(f"len negative train: {len(train_neg)}")

# build train and test (negatives first, then positives; original index kept).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same frames and works on every pandas version.
testset = pd.concat([test_neg, test_pos])
trainset = pd.concat([train_neg, train_pos])

# Sanity checks: no row is lost or duplicated, and no row leaks from test into train.
assert len(testset) + len(trainset) == len(df_pos) + len(df_neg)
assert testset.index.intersection(trainset.index).empty

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")

len positive test: 250
len positive train: 750

len negative test: 250
len negative train: 750

len test: 500
len train: 1500


In [7]:
# Persist both splits as tab-separated files; the DataFrame index is not written.
tsv_kwargs = {"sep": "\t", "index": False}
testset.to_csv("../imdb_pang/test.csv", **tsv_kwargs)
trainset.to_csv("../imdb_pang/train.csv", **tsv_kwargs)

<h1>Yelp reviews (yelp_polarity)</h1>

In [22]:
from datasets import load_dataset

# Load the Hugging Face "yelp_polarity" dataset; the result is indexed by
# split name further down (e.g. yelp_data["test"]).
yelp_data = load_dataset(path="yelp_polarity")

Reusing dataset yelp_polarity (/home/diego/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# Build the Yelp frame from the *test* split of yelp_polarity, using the same
# column layout as the IMDb frame.
print(f"Total length of polarity yelp test {len(yelp_data['test'])}")
print("")

yelp_test = yelp_data["test"]
labels = yelp_test["label"]
texts = yelp_test["text"]

n_texts = len(texts)
paired_id = list(range(n_texts))
# Placeholder string; per the original note, this column makes the generation compliant.
counter = ["None"] * n_texts

d = {
    "example": texts,
    "label_ex": labels,
    # logical_not flips 0 <-> 1 (any non-zero label becomes 0)
    "label_counter": np.logical_not(labels).astype(int),
    "paired_id": paired_id,
    "counterfactual": counter,
}
df_yelp = pd.DataFrame(data=d)
df_yelp.head(2)

Total length of polarity yelp test 38000



Unnamed: 0,example,label_ex,label_counter,paired_id,counterfactual
0,"Contrary to other reviews, I have zero complai...",1,0,0,
1,Last summer I had an appointment to get new ti...,0,1,1,


In [24]:
# Character-length statistics of the Yelp reviews, followed by the class balance.
texts = df_yelp["example"].values
lens = list(map(len, texts))

print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_yelp["label_ex"].value_counts()

Max len: 5107
Mean len: 723.8446578947369
Min len: 4


1    19000
0    19000
Name: label_ex, dtype: int64

In [25]:
# filter out reviews longer than 2000 characters
# NOTE(review): 2000 is presumably chosen to roughly match the maximum length
# seen in CAD IMDb — confirm. The helper column "review_len" stays in the frame.
texts = df_yelp["example"].values
df_yelp["review_len"] = list(map(len, texts))

within_limit = df_yelp["review_len"].le(2000)
df_yelp = df_yelp.loc[within_limit].copy()

# Length statistics after filtering (same values as the review_len column).
lens = df_yelp["review_len"].tolist()
print(f"Max len: {np.max(lens)}")
print(f"Mean len: {np.mean(lens)}")
print(f"Min len: {np.min(lens)}")

df_yelp["label_ex"].value_counts()

Max len: 2000
Mean len: 613.4369834137682
Min len: 4


1    18359
0    17695
Name: label_ex, dtype: int64

In [26]:
# Split the length-filtered Yelp reviews by class (1 = positive, 0 = negative).
# Note: this rebinds df_pos / df_neg, which held the IMDb frames earlier.
df_pos = df_yelp.loc[df_yelp["label_ex"].eq(1)].copy()
df_neg = df_yelp.loc[df_yelp["label_ex"].eq(0)].copy()

In [27]:
seed = 2022

# Hold out a fixed number of reviews per class for a balanced test set
# (sampled without replacement, same seed for both classes). Train receives
# whatever remains, so it is only as balanced as the filtered data.

# sample positive labels
test_pos = df_pos.sample(n=250, replace=False, random_state=seed)
# Every positive row that was not drawn for the test set goes to train.
train_pos = df_pos[~df_pos.index.isin(test_pos.index)]
print(f"len positive test: {len(test_pos)}")
print(f"len positive train: {len(train_pos)}")

print()

# sample negative labels
test_neg = df_neg.sample(n=250, replace=False, random_state=seed)
train_neg = df_neg[~df_neg.index.isin(test_neg.index)]
print(f"len negative test: {len(test_neg)}")
print(f"len negative train: {len(train_neg)}")

# build train and test (negatives first, then positives; original index kept).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same frames and works on every pandas version.
testset = pd.concat([test_neg, test_pos])
trainset = pd.concat([train_neg, train_pos])

# Sanity checks: no row is lost or duplicated, and no row leaks from test into train.
assert len(testset) + len(trainset) == len(df_pos) + len(df_neg)
assert testset.index.intersection(trainset.index).empty

print()
print(f"len test: {len(testset)}")
print(f"len train: {len(trainset)}")

len positive test: 250
len positive train: 18109

len negative test: 250
len negative train: 17445

len test: 500
len train: 35554


In [28]:
# Persist both Yelp splits as tab-separated files; the DataFrame index is not written.
tsv_kwargs = {"sep": "\t", "index": False}
testset.to_csv("../yelp/test.csv", **tsv_kwargs)
trainset.to_csv("../yelp/train.csv", **tsv_kwargs)