
Randomly extracts a subset of review-response pairs from a large dataset for ease of processing.


In [1]:
from typing import Dict, List
from pathlib import Path

import numpy as np
import pandas as pd
import qgrid
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


In [7]:
# --- sampling configuration ---
SEED = 42             # random_state used whenever the dataframe is shuffled
N_EXAMPLES = 500_000  # number of rows kept after shuffling

# --- input / output locations ---
# pickled source dataframe
DATA = '/mnt/storage/clwork/projects/readvisor/RESPONSE_GENERATION/intermediary/en_rrgen.alphasys.scored.rg.pkl'
# directory receiving all exported files; created here if it does not exist yet
OUTPATH = Path('/srv/scratch6/kew/bart/hospo_respo/en/data/hotel/500k/')
OUTPATH.mkdir(exist_ok=True, parents=True)


In [8]:
# load dataframe
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source
df = pd.read_pickle(DATA)

# add working ID column (copy of the original index, so rows stay
# identifiable after filtering and shuffling)
df['rrgen_id'] = df.index

# distribution before filtering
print(df['domain'].value_counts())
print(df['source'].value_counts())

# subset for tripadvisor hotel reviews only
df = df[df['source'] == 'tripadvisor']
df = df[df['domain'] == 'Hotel']

# distribution after filtering ('source' is dropped below, so print it here)
print(df['domain'].value_counts())
print(df['source'].value_counts())

# drop unwanted columns
# (re-assignment instead of inplace=True: df is the result of a boolean-mask
# selection at this point, and in-place modification of such a frame is what
# SettingWithCopyWarning is about)
df = df.drop(columns=['reviewid', 'grpid', 'platformid_rev',
                      'url', 'platformrating', 'review_pp', 'review_lang',
                      'response_pp', 'response_lang', 'source',
                      'sentiment_alpha_system_1', 'response_pp_rg', 'split_imrg_compat'])

# keep only responses with MORE than one sentence, i.e. the minimum response
# length (in sentences) after this filter is 2
df = df[df['score:response_sentence_length'] > 1]

df.info()

Hotel         2280886
Restaurant    1046717
Name: domain, dtype: Int64
tripadvisor    3306331
platform         13222
re                8050
Name: source, dtype: Int64
Hotel    2275802
Name: domain, dtype: Int64
tripadvisor    2275802
Name: source, dtype: Int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2177752 entries, 0 to 3327598
Data columns (total 19 columns):
 #   Column                              Dtype  
---  ------                              -----  
 0   domain                              string 
 1   rating                              Int64  
 2   review_author                       string 
 3   response_author                     string 
 4   review_clean                        string 
 5   response_clean                      string 
 6   sentiment                           string 
 7   db_internal_id                      Int64  
 8   establishment                       string 
 9   trip_id                             Int64  
 10  trip_url                         

In [9]:
def inspect_scores(df):
    """
    Print a quick overview of `df`: number of rows, value counts of the
    'split' column and `describe()` statistics of the five score columns.
    """
    score_columns = (
        'score:review_response_length_ratio',
        'score:response_sentence_length',
        'score:genericness_semantic_avg',
        'score:genericness_length_ratio',
        'score:review_response_wmd',
    )

    print(len(df))
    print()
    print(df['split'].value_counts())
    # the first score summary follows the split counts directly;
    # every further summary is preceded by one blank line
    for position, column in enumerate(score_columns):
        if position > 0:
            print()
        print(df[column].describe())

In [10]:
# summary of the filtered frame, before it is shuffled and truncated
inspect_scores(df)

2177752

train    1960116
test      108834
valid     108802
Name: split, dtype: int64
count    2.177752e+06
mean     1.885855e+00
std      1.481366e+00
min      2.380952e-02
25%      1.000000e+00
50%      1.500000e+00
75%      2.333333e+00
max      7.200000e+01
Name: score:review_response_length_ratio, dtype: float64

count    2.177752e+06
mean     4.391562e+00
std      2.104128e+00
min      2.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      1.450000e+02
Name: score:response_sentence_length, dtype: float64

count    2.177752e+06
mean     7.440869e-01
std      1.033905e-01
min      2.258799e-01
25%      6.801786e-01
50%      7.503334e-01
75%      8.154380e-01
max      1.000001e+00
Name: score:genericness_semantic_avg, dtype: float64

count    2.177752e+06
mean     7.271684e-02
std      1.420643e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e-01
max      1.000000e+00
Name: score:genericness_length_ratio, dtype:

In [11]:

# shuffle the dataframe
# (frac=1 draws every row, so this is a row permutation; random_state=SEED makes it repeatable)
df = df.sample(frac=1, random_state=SEED, axis=0)
# preview the first rows of the shuffled frame
df.head()



Unnamed: 0,domain,rating,review_author,response_author,review_clean,response_clean,sentiment,db_internal_id,establishment,trip_id,trip_url,country,split,score:review_response_length_ratio,score:response_sentence_length,score:genericness_semantic_avg,score:genericness_length_ratio,score:review_response_wmd,rrgen_id
2329375,Hotel,5,Lucy J,RCCharlotte,Fantastic! ---SEP--- We were treated like roya...,"Dear Lucy J, Thank you for sharing your experi...",3 -1 3,126080,"The Ritz-Carlton, Charlotte",1510383,https://www.tripadvisor.com/Hotel_Review-g4902...,US,train,1.6,5,0.8795,0.4,4.637085,2329375
2770320,Hotel,4,OYEWOLE O,CPHIEXManagementTeam,Convenient ---SEP--- The hotel is custom-made ...,"Dear OYEWOLE O, Thank you for your kind feedba...",2 -2 2,147558,Crowne Plaza London Heathrow T4,14151392,https://www.tripadvisor.co.uk/Hotel_Review-g52...,UK,train,1.333333,6,0.829168,0.166667,4.310637,2770320
1652943,Hotel,4,Jeff M,fairholminnkeeper,"Authentic house, authentic people ---SEP--- St...","Jeff, Thank you for taking the time to write s...",2 -2 2,359345,Fairholm National Historic Inn,269422,https://www.tripadvisor.ca/Hotel_Review-g15502...,CA,train,4.4,5,0.711277,0.2,4.748927,1652943
907612,Hotel,4,rdsnyder961,General M,Very handy 'stay and fly' ---SEP--- Newly reno...,"Thank you, rdsnyder961! We're so happy that we...",3 -1 3,665760,La Quinta Inn & Suites by Wyndham Denver Airpo...,85368,https://www.tripadvisor.com/Hotel_Review-g3338...,US,train,1.333333,3,0.8815,0.333333,4.728153,907612
1013061,Hotel,3,Ray-Roma,Sobrien_GM,"some difficulties ---SEP--- Lovely, historic h...","Dear Ray-Roma, Thank you for your feedback and...",2 -2 2,408400,One King West Hotel & Residence,573658,https://www.tripadvisor.ca/Hotel_Review-g15501...,CA,train,0.6,5,0.688064,0.0,4.733643,1013061


In [12]:
# keep the first N_EXAMPLES rows; df was shuffled in the preceding cell,
# so this is a random subset
df = df.head(N_EXAMPLES)

In [13]:
# same summary as before, now on the sampled subset
inspect_scores(df)
# NOTE(review): disabled debug check for NaN/inf genericness scores; `df_n` is
# not defined in any visible cell, so this line would fail if re-enabled as is
# df_n[df_n['score:genericness_semantic_avg'].isin([np.nan, np.inf, -np.inf])]

500000

train    450367
valid     24897
test      24736
Name: split, dtype: int64
count    500000.000000
mean          1.886348
std           1.493387
min           0.031250
25%           1.000000
50%           1.500000
75%           2.333333
max          72.000000
Name: score:review_response_length_ratio, dtype: float64

count    500000.000000
mean          4.389528
std           2.097126
min           2.000000
25%           3.000000
50%           4.000000
75%           5.000000
max          76.000000
Name: score:response_sentence_length, dtype: float64

count    500000.000000
mean          0.744152
std           0.103337
min           0.256568
25%           0.680296
50%           0.750349
75%           0.815469
max           1.000000
Name: score:genericness_semantic_avg, dtype: float64

count    500000.000000
mean          0.072869
std           0.142241
min           0.000000
25%           0.000000
50%           0.000000
75%           0.100000
max           1.000000
Name: score:gene

In [14]:
# dataframe column name -> suffix of the per-split output file (`<split>.<suffix>`)
col_name_outfile_mapping = dict(
    rrgen_id='rrgen_id',
    review_clean='review',      # normal review
    response_clean='response',  # normal response
    rating='rating',            # normal review rating
    establishment='establishment',
)

def write_file(series, outfile):
    """
    Write the values of `series` to `outfile` (UTF-8), one value per line.

    Each value is formatted with an f-string and followed by a newline;
    newlines contained in a value are written as they are (not escaped).
    An existing file is overwritten.
    """
    with open(outfile, 'w', encoding='utf8') as handle:
        handle.writelines(f'{value}\n' for value in series.to_list())

def generate_fairseq_input_files(df,
                                 outdir: Path,
                                 col_name_outfile_mapping: Dict = col_name_outfile_mapping,
                                 split_col: str = 'split',
                                 n: int = 0):
    """
    Generates multiple individual files (one per column) for every split.

    For each distinct value of `df[split_col]` (e.g. train/test/valid) and each
    entry of `col_name_outfile_mapping`, a file `<outdir>/<split>.<suffix>` is
    written. All files of one split are written from the same dataframe, so
    line i of every file belongs to the same row - provided no value contains
    a newline itself.

    Args:
        df: dataframe containing `split_col` and all keys of
            `col_name_outfile_mapping` as columns.
        outdir: output directory, given as `Path` or `str`; created if missing.
        col_name_outfile_mapping: dataframe column name -> output file suffix.
        split_col: name of the column holding the split label.
        n: if non-zero, only the first `n` rows of the train split and the
            first `int(n*0.1)` rows of every other split are written.

    Raises:
        KeyError: if `split_col` or a mapped column is missing from `df`.
    """
    # The `/` joins below are only defined for Path objects; coerce so that a
    # plain string (as the previous `str` annotation suggested) works as well.
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    for split in df[split_col].unique():

        split_df = df[df[split_col] == split]

        # shuffle train set - mainly required after upsampling!
        if split == 'train':
            split_df = split_df.sample(frac=1, random_state=SEED)

        if n:  # just take a head of dataframe
            if split == 'train':
                split_df = split_df.head(n)
            else:
                split_df = split_df.head(int(n*0.1))

        print(f'{split} split has length: {len(split_df)}')

        # one file per mapped column; all of them share split_df's row order
        for k, v in col_name_outfile_mapping.items():
            write_file(split_df[k], outdir / f'{split}.{v}')

    print('Done!')


# pickle (as backup)
df.to_pickle(str(OUTPATH / 'trip_hotels.pkl'))
# Huggingface csv
# (to_csv is called with its defaults, so the dataframe index is written as
# an additional leading column)
df.to_csv(str(OUTPATH / 'trip_hotels.csv'))
# Fairseq (useful columns line-aligned)
# writes <split>.<suffix> files into OUTPATH, using the 'split' column for the splits
generate_fairseq_input_files(df, OUTPATH, col_name_outfile_mapping, 'split')


train split has length: 450367
valid split has length: 24897
test split has length: 24736
Done!


Hotel         2280886
Restaurant    1046717
Name: domain, dtype: Int64
tripadvisor    3306331
platform         13222
re                8050
Name: source, dtype: Int64


Unnamed: 0,domain,rating,review_author,response_author,review_clean,response_clean,sentiment,db_internal_id,establishment,trip_id,trip_url,country,split,score:review_response_length_ratio,score:response_sentence_length,score:genericness_semantic_avg,score:genericness_length_ratio,score:review_response_wmd,rrgen_id
0,Hotel,5,CharlieGirl83,StGilesHouse,Perfect relaxing city getaway! ---SEP--- In ne...,Many thanks for such positive feedback from yo...,2 -2 3,877790,St Giles House Hotel,637093.0,https://www.tripadvisor.co.uk/Hotel_Review-g18...,UK,train,5.4,5,0.718896,0.0,4.243646,0
1,Hotel,4,Travel Man,FrancisBath,Wonderful Hotel but hopeless service at breakf...,Thank you for your comments; all feedback is v...,2 -2 -2,40346,Francis Hotel Bath - MGallery,192990.0,https://www.tripadvisor.co.uk/Hotel_Review-g18...,UK,train,2.0,2,0.87233,0.0,4.90256,1
2,Hotel,5,Jim L,,Lovely Hotel ---SEP--- We stayed 2 nights in t...,"Dear Guest, Thank you for your feedback, we ar...",3 -1 4,654882,The Parkview Hotel Mudgee,12339572.0,https://www.tripadvisor.com.au/Hotel_Review-g2...,AU,train,1.0,3,0.856633,0.333333,4.846475,2
3,Restaurant,5,NZ1969,Polariceskiwi,Love the chocolate beer cake ---SEP--- We've b...,Many Thanks. Also I test the Cake myself.... YUM,3 -2 3,257928,Bier Kafe-Taupo Taupo District Waikato Region ...,,https://www.tripadvisor.co.nz/Restaurant_Revie...,NZ,train,1.5,2,0.597344,0.0,4.636237,3
4,Restaurant,2,Alan C,CourtneyGuestService,Very Disappointing ---SEP--- I visit Margarita...,We sincerely appreciate you and all of your vi...,2 -2 -2,645894,Margaritaville Las Vegas-Las Vegas Nevada,,https://www.tripadvisor.com/Restaurant_Review-...,US,train,2.833333,6,0.756582,0.166667,4.092634,4


In [42]:
# sanity check: number of rows and number of rows without a trip_url
# NOTE(review): this cell carries execution count 42 and its recorded output
# (3306331 rows) does not match the 500000-row df produced by the cells above,
# so it was presumably run against a different df state - re-run or remove
print(len(df))
print(df['trip_url'].isnull().sum())

3306331
0
