Environment details: `rattle` `hosp_respo_bart`

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None) # don't truncate cell contents

from df_utils import *

In [2]:
data_dir = Path('/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/500k')
scored_data = [
    'trip_hotels.pkl', 
    'trip_hotels.gen_sts_scores.pkl', 
    'trip_hotels.response_ppl_scores.pkl', 
    'trip_hotels.zipf_dist_scores.pkl', 
    'trip_hotels.response_tfidf_scores_norm.pkl', 
    'trip_hotels.rev_resp_sts_scores.pkl',
    'trip_hotels.rev_resp_tfidf_scores.pkl',]


In [3]:
df = pd.concat([pd.read_pickle(data_dir / f) for f in scored_data], axis=1)
print(len(df))
df.drop(columns=['Unnamed: 0'], inplace=True)
print(df.columns)

500000
Index(['domain', 'rating', 'review_author', 'response_author', 'review_clean',
       'response_clean', 'sentiment', 'db_internal_id', 'establishment',
       'trip_id', 'trip_url', 'country', 'split',
       'score:review_response_length_ratio', 'score:response_sentence_length',
       'score:genericness_semantic_avg', 'score:genericness_length_ratio',
       'score:review_response_wmd', 'rrgen_id', 'trip_review_url',
       'trip_review_id', 'score:genericness_sent_level_sts_avg',
       'score:response_text_ppl', 'score:response_freq_distro_ratio',
       'score:response_tfidf', 'score:rev_resp_sts', 'score:rev_resp_tfidf'],
      dtype='object')


In [4]:
def filter_df(df, col_name, min_threshold=0.0, max_threshold=1.0):
    orig_len = len(df)
    print(df[col_name].describe())
    df = df[(df[col_name] > min_threshold) & (df[col_name] < max_threshold)]
    print(f'\nRemoved {orig_len-len(df)} items. New length {len(df)} ({(len(df)/orig_len)*100:.2f}%)\n')
    print(df['split'].value_counts())
    print()
    print(df[col_name].describe())
    return df
  
def filter_df_by_percent(df, col_name, lower_bound=0.0, upper_bound=0.9):
    orig_len = len(df)
    print(df[col_name].describe())
    df['rank'] = df[col_name].rank(pct=True)
    df = df[(df['rank'] > lower_bound) & (df[col_name] < upper_bound)]
    print(f'\nRemoved {orig_len-len(df)} items. New length {len(df)} ({(len(df)/orig_len)*100:.2f}%)\n')
    print(df['split'].value_counts())
    print()
    print(df[col_name].describe())
    return df

In [5]:
# # Filter criteria: score:rev_resp_tfidf
# min_threshold=0.118
# max_threshold=0.60

# df_filt = filter_df(df, 'score:rev_resp_tfidf', min_threshold, max_threshold)
# print(len(df_filt))
# df_filt.sample(n=100)[['review_clean', 'response_clean', 'score:rev_resp_tfidf']]

# # # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_rev_resp_tfidf_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    500000.000000
mean          0.114507
std           0.100590
min           0.000000
25%           0.032881
50%           0.091500
75%           0.170954
max           0.947225
Name: score:rev_resp_tfidf, dtype: float64

Removed 298024 items. New length 201976 (40.40%)

train    181564
test      10213
valid     10199
Name: split, dtype: int64

count    201976.000000
mean          0.212675
std           0.080377
min           0.118001
25%           0.150589
50%           0.191805
75%           0.253684
max           0.599990
Name: score:rev_resp_tfidf, dtype: float64
201976
train split has length: 181564
test split has length: 10213
valid split has length: 10199
Done!


In [27]:
# # Filter criteria: score:rev_resp_sts
# min_threshold=0.51
# max_threshold=0.80

# df_filt = filter_df(df, 'score:rev_resp_sts', min_threshold, max_threshold)
# print(len(df_filt))
# df_filt.sample(n=100)[['review_clean', 'response_clean', 'score:rev_resp_sts']]

# # # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_rev_resp_sts_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')

count    500000.000000
mean          0.470162
std           0.134350
min          -0.138696
25%           0.381874
50%           0.478115
75%           0.567229
max           0.993149
Name: score:rev_resp_sts, dtype: float64

Removed 296845 items. New length 203155 (40.63%)

train    182862
valid     10195
test      10098
Name: split, dtype: int64

count    203155.000000
mean          0.596696
std           0.061699
min           0.510001
25%           0.545911
50%           0.585826
75%           0.636981
max           0.799946
Name: score:rev_resp_sts, dtype: float64
203155
train split has length: 182862
test split has length: 10098
valid split has length: 10195
Done!


In [11]:
# # Filter criteria: score:response_tfidf
# min_threshold=1.37
# max_threshold=1.8

# df_filt = filter_df(df, 'score:response_tfidf', min_threshold, max_threshold)
# print(len(df_filt))
# df_filt.sample(n=100)[['review_clean', 'response_clean', 'score:response_tfidf']]

# # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_response_tfidf_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    5.000000e+05
mean              inf
std               NaN
min      5.772067e-01
25%      1.250138e+00
50%      1.337468e+00
75%      1.435815e+00
max               inf
Name: score:response_tfidf, dtype: float64

Removed 299118 items. New length 200882 (40.18%)

train    180198
valid     10466
test      10218
Name: split, dtype: int64

count    200882.000000
mean          1.481152
std           0.090708
min           1.370001
25%           1.409506
50%           1.457911
75%           1.530558
max           1.799889
Name: score:response_tfidf, dtype: float64
200882
train split has length: 180198
valid split has length: 10466
test split has length: 10218
Done!


In [7]:
# # Filter criteria: score:genericness_sent_level_sts_avg

# min_threshold=0.0
# max_threshold=0.70

# # df_filt = filter_df_by_percent(df, 'score:genericness_sent_level_sts_avg', lower_bound=0.0, upper_bound=0.9)
# df_filt = filter_df(df, 'score:genericness_sent_level_sts_avg', min_threshold, max_threshold)
# df_filt.sample(n=10)

# # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_generic_sent_avg_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    500000.000000
mean          0.716032
std           0.082659
min           0.301155
25%           0.664460
50%           0.718864
75%           0.769825
max           0.996356
Name: score:genericness_sent_level_sts_avg, dtype: float64

Removed 297307 items. New length 202693 (40.54%)

train    182741
test      10033
valid      9919
Name: split, dtype: int64

count    202693.000000
mean          0.637848
std           0.051854
min           0.301155
25%           0.611259
50%           0.650888
75%           0.677859
max           0.700000
Name: score:genericness_sent_level_sts_avg, dtype: float64
train split has length: 182741
valid split has length: 9919
test split has length: 10033
Done!


In [10]:
# # Filter criteria: score:response_freq_distro_ratio
# min_threshold=0.0
# max_threshold=0.883

# # df_filt = filter_df_by_percent(df, 'score:response_freq_distro_ratio', 0.0, 0.9)
# df_filt = filter_df(df, 'score:response_freq_distro_ratio', min_threshold, max_threshold)
# df_filt.sample(n=10)

# # # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_freq_distro_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    500000.000000
mean          0.889702
std           0.055622
min           0.230769
25%           0.857143
50%           0.895238
75%           0.927536
max           1.000000
Name: score:response_freq_distro_ratio, dtype: float64

Removed 295847 items. New length 204153 (40.83%)

train    183885
test      10146
valid     10122
Name: split, dtype: int64

count    204153.000000
mean          0.837263
std           0.040990
min           0.230769
25%           0.818182
50%           0.848485
75%           0.867725
max           0.882979
Name: score:response_freq_distro_ratio, dtype: float64
train split has length: 183885
valid split has length: 10122
test split has length: 10146
Done!


In [22]:
# # Filter criteria: score:response_text_ppl

# min_threshold=23.5
# max_threshold=50

# # df_filt = filter_df_by_percent(df, 'score:response_text_ppl', lower_bound=0.0, upper_bound=0.9)
# df_filt = filter_df(df, 'score:response_text_ppl', min_threshold, max_threshold)
# # df_filt.sample(n=10)

# # # Fairseq (useful columns line-aligned)
# OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_tgt_ppl_{min_threshold}_{max_threshold}')
# OUTPATH.mkdir(parents=True, exist_ok=False)
# generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    500000.000000
mean         30.227169
std          62.865044
min           2.619564
25%          16.842829
50%          23.890131
75%          34.940752
max       34380.269531
Name: score:response_text_ppl, dtype: float64

Removed 297886 items. New length 202114 (40.42%)

train    181924
valid     10103
test      10087
Name: split, dtype: int64

count    202114.000000
mean         32.953289
std           7.073359
min          23.500004
25%          26.977563
50%          31.398352
75%          37.923999
max          49.999378
Name: score:response_text_ppl, dtype: float64
train split has length: 181924
test split has length: 10087
valid split has length: 10103
Done!


In [26]:
# Filter criteria: score:response_text_ppl
# for ablation experiments

# 100% (original values used in baseline)
# 80% (min_threshold=12, max_threshold=50)
# 60% (min_threshold=18, max_threshold=50)
# 40% (original values min_threshold=23.5, max_threshold=50)
# 20% (min_threshold=31.5, max_threshold=50)

min_threshold=12
max_threshold=50

# df_filt = filter_df_by_percent(df, 'score:response_text_ppl', lower_bound=0.0, upper_bound=0.9)
df_filt = filter_df(df, 'score:response_text_ppl', min_threshold, max_threshold)
print((len(df_filt)/len(df)) * 100)

# # Fairseq (useful columns line-aligned)
OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_tgt_ppl_{min_threshold}_{max_threshold}')
OUTPATH.mkdir(parents=True, exist_ok=False)
generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')


count    500000.000000
mean         30.227169
std          62.865044
min           2.619564
25%          16.842829
50%          23.890131
75%          34.940752
max       34380.269531
Name: score:response_text_ppl, dtype: float64

Removed 99597 items. New length 400403 (80.08%)

train    360610
valid     19955
test      19838
Name: split, dtype: int64

count    400403.000000
mean         25.483996
std           9.339093
min          12.000057
25%          17.959231
50%          23.617277
75%          31.493897
max          49.999378
Name: score:response_text_ppl, dtype: float64
80.0806
train split has length: 360610
test split has length: 19838
valid split has length: 19955
Done!


In [28]:
# Filter based on all methods
# for ablation experiments

# Filter criteria: score:response_text_ppl
min_threshold=23.5
max_threshold=50
df_filt = filter_df(df, 'score:response_text_ppl', min_threshold, max_threshold)

print((len(df_filt)/len(df)) * 100)

# Filter criteria: score:response_freq_distro_ratio
min_threshold=0.0
max_threshold=0.883
df_filt = filter_df_by_percent(df_filt, 'score:response_freq_distro_ratio', min_threshold, max_threshold)

print((len(df_filt)/len(df)) * 100)

# Filter criteria: score:genericness_sent_level_sts_avg

min_threshold=0.0
max_threshold=0.70
df_filt = filter_df(df_filt, 'score:genericness_sent_level_sts_avg', min_threshold, max_threshold)

print((len(df_filt)/len(df)) * 100)

# # Fairseq (useful columns line-aligned)
OUTPATH = Path(f'/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/filt_combo')
OUTPATH.mkdir(parents=True, exist_ok=False)
generate_fairseq_input_files(df_filt, OUTPATH, col_name_outfile_mapping, 'split')



count    500000.000000
mean         30.227169
std          62.865044
min           2.619564
25%          16.842829
50%          23.890131
75%          34.940752
max       34380.269531
Name: score:response_text_ppl, dtype: float64

Removed 297886 items. New length 202114 (40.42%)

train    181924
valid     10103
test      10087
Name: split, dtype: int64

count    202114.000000
mean         32.953289
std           7.073359
min          23.500004
25%          26.977563
50%          31.398352
75%          37.923999
max          49.999378
Name: score:response_text_ppl, dtype: float64
40.422799999999995
count    202114.000000
mean          0.872003
std           0.045393
min           0.428571
25%           0.844037
50%           0.875000
75%           0.902778
max           1.000000
Name: score:response_freq_distro_ratio, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank'] = df[col_name].rank(pct=True)



Removed 85316 items. New length 116798 (57.79%)

train    105088
valid      5857
test       5853
Name: split, dtype: int64

count    116798.000000
mean          0.842271
std           0.032944
min           0.428571
25%           0.825243
50%           0.850000
75%           0.867347
max           0.882979
Name: score:response_freq_distro_ratio, dtype: float64
23.3596
count    116798.000000
mean          0.670736
std           0.066354
min           0.344647
25%           0.628650
50%           0.671926
75%           0.714678
max           0.964121
Name: score:genericness_sent_level_sts_avg, dtype: float64

Removed 38310 items. New length 78488 (67.20%)

train    70661
test      3926
valid     3901
Name: split, dtype: int64

count    78488.000000
mean         0.635788
std          0.047230
min          0.344647
25%          0.609011
50%          0.644865
75%          0.672627
max          0.699996
Name: score:genericness_sent_level_sts_avg, dtype: float64
15.697600000000001
train spli