In [1]:
import torch
# Ask PyTorch to release any unused cached CUDA memory before the notebook starts.
torch.cuda.empty_cache()

In [2]:
import json
import pandas as pd
import numpy as np
# import spacy
import matplotlib.pyplot as plt

# Show full cell contents instead of truncating long text columns in displayed frames.
pd.set_option('display.max_colwidth', None)

In [3]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict

# Read Data

In [4]:
# Load the pre-built review-sentence / key-point pair table.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
rev_kp_dataset_df = pd.read_pickle("../data/training/train_data.pkl")

In [5]:
rev_kp_dataset_df

Unnamed: 0,id_pair,sentences,key_point,business_name,domain,opinions_x,aspects_x,opinion_phrases_x,attributes,aspects_y,opinions_y,opinion_phrases_y,sentiments_x,sentiments_y,label
0,459751_0##966072_0,I am in love with this hotel .,Hotel was clean and modern .,Kimpton Hotel Monaco Pittsburgh,hotel,[love],[hotel],[love hotel],[restaurant -> atmosphere],[hotel],[clean],[clean hotel],[positive],[positive],1.0
1,459751_1##922097_1,The staff is just amazing !,Staff goes above and beyond .,Kimpton Hotel Monaco Pittsburgh,hotel,[amazing],[staff],[amazing staff],[staff],[staff],[above and beyond],[above and beyond staff],[positive],[positive],1.0
2,459751_1##459760_2,The staff is just amazing !,The staff accommodated our needs .,Kimpton Hotel Monaco Pittsburgh,hotel,[amazing],[staff],[amazing staff],[staff],[staff],[accommodated],[accommodated staff],[positive],[positive],1.0
3,459751_1##820265_1,The staff is just amazing !,Staff was accommodating and friendly .,Kimpton Hotel Monaco Pittsburgh,hotel,[amazing],[staff],[amazing staff],[staff],[staff],[accommodating],[accommodating staff],[positive],[positive],1.0
4,459751_1##671070_1,The staff is just amazing !,Attentive staff and housekeeping .,Kimpton Hotel Monaco Pittsburgh,hotel,[amazing],[staff],[amazing staff],[staff],[staff],[attentive],[attentive staff],[positive],[positive],1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68485,1013513_2##998143_2,We started with the complementary ( refillable ) bread .,Crispy and perfectly seasoned .,Smith Bros.,restaurant,[refillable],[bread],[refillable bread],[food -> quality],[seasoned],[perfectly],[perfectly seasoned],[positive],[positive],0.0
68486,991573_5##998143_2,So fresh and full of amazing flavor soft not chewy perfection .,Crispy and perfectly seasoned .,Bavette's Steakhouse & Bar,restaurant,[amazing],[flavor],[amazing flavor],[food -> quality],[seasoned],[perfectly],[perfectly seasoned],[positive],[positive],0.0
68487,1025790_0##998143_2,We also ordered an assortment of their thoughtfully created pizzas .,Crispy and perfectly seasoned .,Bar Cento,restaurant,[thoughtfully created],[pizzas],[thoughtfully created pizzas],[food -> quality],[seasoned],[perfectly],[perfectly seasoned],[positive],[positive],0.0
68488,994763_3##998143_2,"We ordered the lamb , which had no seasoning .",Crispy and perfectly seasoned .,Indigo Crow Restaurant & Bar,restaurant,[no seasoning],[lamb],[no seasoning lamb],[food -> quality],[seasoned],[perfectly],[perfectly seasoned],[negative],[positive],0.0


In [6]:
# Rename the raw columns to the names used by the rest of the notebook.
rev_kp_dataset_df = rev_kp_dataset_df.rename(columns={'attributes': 'topic', 'sentences': 'text'})
# Collapse each row's attribute list into one comma-separated topic string.
# Values that are already strings are passed through untouched, so re-running
# this cell does not split an already-joined topic into single characters
# (','.join('staff') would otherwise yield 's,t,a,f,f').
rev_kp_dataset_df['topic'] = rev_kp_dataset_df['topic'].apply(
    lambda x: x if isinstance(x, str) else ','.join(x))

# Prepare Contrastive Input

In [7]:
# Keep the original sentence and key point under `full_*` names so the short
# `comment` / `keypoint` column names are free for the contrastive inputs.
rev_kp_dataset_df = rev_kp_dataset_df.rename(columns={'text': 'full_comment', 'key_point': 'full_keypoint'})

In [8]:
def prepare_contrastive_input(row):
    """Build the contrastive input string for one row.

    ``row`` is read by position: its first cell and its second cell must
    each be an indexable sequence of strings. Only element 0 of each cell
    is used; the two strings are joined with the literal ' <SEP> ' marker.

    Raises ``IndexError`` if either cell is empty.
    """
    first_cell, second_cell = row.iloc[0], row.iloc[1]
    return ' <SEP> '.join((first_cell[0], second_cell[0]))

In [9]:
# Comment-side input: first opinion phrase and first sentiment of the review sentence,
# combined row by row via prepare_contrastive_input.
rev_kp_dataset_df['comment'] = rev_kp_dataset_df[['opinion_phrases_x', 'sentiments_x']].apply(prepare_contrastive_input, axis=1)
rev_kp_dataset_df['comment']

0                         love hotel <SEP> positive
1                      amazing staff <SEP> positive
2                      amazing staff <SEP> positive
3                      amazing staff <SEP> positive
4                      amazing staff <SEP> positive
                            ...                    
68485               refillable bread <SEP> positive
68486                 amazing flavor <SEP> positive
68487    thoughtfully created pizzas <SEP> positive
68488              no seasoning lamb <SEP> negative
68489                consistent food <SEP> positive
Name: comment, Length: 68490, dtype: object

In [10]:
# Key-point-side input: first opinion phrase and first sentiment of the key point,
# combined row by row via prepare_contrastive_input.
rev_kp_dataset_df['keypoint'] = rev_kp_dataset_df[['opinion_phrases_y', 'sentiments_y']].apply(prepare_contrastive_input, axis=1)
rev_kp_dataset_df['keypoint']

0                   clean hotel <SEP> positive
1        above and beyond staff <SEP> positive
2            accommodated staff <SEP> positive
3           accommodating staff <SEP> positive
4               attentive staff <SEP> positive
                         ...                  
68485        perfectly seasoned <SEP> positive
68486        perfectly seasoned <SEP> positive
68487        perfectly seasoned <SEP> positive
68488        perfectly seasoned <SEP> positive
68489        perfectly seasoned <SEP> positive
Name: keypoint, Length: 68490, dtype: object

# Train-Dev Split

In [11]:
# Convert the selected columns to a `Dataset` and shuffle it with a fixed seed
# so the row order is reproducible across runs.
dataset = Dataset.from_pandas(rev_kp_dataset_df[['topic', 'comment', 'full_comment',
                                                 'keypoint', 'full_keypoint', 
                                                 'domain', 'label']]).shuffle(seed=42)

In [12]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit

In [13]:
## Train/Val Split by group
# Rows sharing a topic string are kept together, so no topic appears in both splits.
groups = dataset['topic']
gss = GroupShuffleSplit(n_splits=1, train_size=0.85, random_state=32)

for train_idx, val_idx in gss.split(dataset, groups=groups):
    # Index the dataset in its default (python) format before slicing it.
    dataset.set_format(None)

    # Materialise the two index sets as independent datasets.
    train_dataset = Dataset.from_dict(dataset[train_idx])
    val_dataset = Dataset.from_dict(dataset[val_idx])

    # Report how many distinct topics landed in each split; the format is
    # switched to pandas only for the count and restored afterwards.
    for banner, split in (("TRAIN CLASSES: ", train_dataset),
                          ("VAL CLASSES: ", val_dataset)):
        split.set_format("pandas")
        d = split[:]
        print(banner, len(d['topic'].unique()))
        split.set_format(None)

TRAIN CLASSES:  11
VAL CLASSES:  3


In [14]:
# Bundle the two splits into one DatasetDict keyed "train" / "valid".
datasets = DatasetDict({
    "train": train_dataset,
    "valid": val_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['topic', 'comment', 'full_comment', 'keypoint', 'full_keypoint', 'domain', 'label'],
        num_rows: 64344
    })
    valid: Dataset({
        features: ['topic', 'comment', 'full_comment', 'keypoint', 'full_keypoint', 'domain', 'label'],
        num_rows: 4146
    })
})

In [15]:
# From here on, indexing the splits returns pandas objects.
# NOTE: this changes the format of `datasets` in place for every later cell.
datasets.set_format("pandas")

In [16]:
# Read each split out in full; relies on the "pandas" format set on `datasets`
# so that the slices come back as DataFrames.
training_df = datasets['train'][:]
valid_df = datasets['valid'][:]

In [17]:
training_df

Unnamed: 0,topic,comment,full_comment,keypoint,full_keypoint,domain,label
0,restaurant -> atmosphere,awesome vibe <SEP> positive,The vibe was awesome .,great furniture <SEP> positive,Great furniture and decor .,restaurant,0.0
1,restaurant -> atmosphere,wonderful resort <SEP> positive,The montelucia is a wonderful resort .,worst experience <SEP> negative,Worst experience of my life .,hotel,0.0
2,food -> quality,great indian food <SEP> positive,GO TO THIS PLACE FOR GREAT INDIAN FOOD !,great food <SEP> positive,Great food and beer selection .,restaurant,1.0
3,staff,always been great service <SEP> positive,the service has always been great .,very attentive service <SEP> positive,Service was very attentive .,restaurant,1.0
4,food -> quality,excellent indian food <SEP> positive,Excellent Indian food .,very authentic indian cuisine <SEP> positive,Very authentic Indian cuisine .,restaurant,0.0
...,...,...,...,...,...,...,...
64339,staff,very friendly staff <SEP> positive,Staff was very friendly .,customer based staff <SEP> positive,The Staff were customer based .,hotel,1.0
64340,staff,nice accommodating <SEP> positive,"He was very accommodating , nice , and detailed .",good service <SEP> positive,keep up the good service .,hotel,0.0
64341,food -> quality,excellent rolls <SEP> positive,Lots of excellent rolls for under $ 6 .,authentic mexican food <SEP> positive,Authentic Mexican food .,restaurant,0.0
64342,food -> quality,terrible food <SEP> negative,Come here only if you want to pay $ 150 Canadian for terrible food .,poorly cooked food <SEP> negative,Poorly cooked food,restaurant,1.0


In [18]:
valid_df

Unnamed: 0,topic,comment,full_comment,keypoint,full_keypoint,domain,label
0,drink -> alcohol,very nice wine bar <SEP> positive,We ate in the wine bar behind the actual restaurant which was very nice and cozy .,disappointment beer sampler <SEP> negative,Beer sampler a disappointment,restaurant,0.0
1,wait-time,better detour <SEP> positive,Detour was better than any of these yelp reviews could rate !,very poor wifi <SEP> negative,WiFi was very poor also .,hotel,0.0
2,drink -> alcohol,extensive knowledge of wine list <SEP> positive,She had extensive knowledge of the wine list and the food menu and offered up great suggestions for combos .,hidden gem <SEP> positive,What a hidden gem .,restaurant,0.0
3,drink -> alcohol,out of this world infused cocktails <SEP> positive,The infused cocktails are out of this world ! ! ! !,impressive wine list <SEP> positive,Wine list is also impressive .,restaurant,0.0
4,drink -> alcohol,lovely cocktails <SEP> positive,We chose two lovely cocktails off their menu before ordering dinner .,disappointment beer sampler <SEP> negative,Beer sampler a disappointment,restaurant,0.0
...,...,...,...,...,...,...,...
4141,wait-time,24 hour room service <SEP> positive,24 hour room service ( life saver when my flight arrived at 2 am from the west coast and I was really hungry ) .,heavily congested elevators <SEP> negative,Elevators were heavily congested .,hotel,0.0
4142,drink -> alcohol,very nice wine list <SEP> positive,VERY nice wine list and decor .,great cocktails <SEP> positive,Great cocktails and wines .,restaurant,0.0
4143,drink -> alcohol,robust flavors <SEP> positive,wow the robust flavors .,great cocktails <SEP> positive,Great cocktails and wines .,restaurant,0.0
4144,drink -> alcohol,locally made ohio vodka <SEP> positive,I liked that they used locally made Ohio vodka from watershed distillery .,good bang for your buck <SEP> positive,good bang for your buck .,restaurant,0.0


In [19]:
# Cast the labels to int and export the pair columns of each split.
# The DataFrame index is written as the first (unnamed) CSV column.
for frame, csv_path in (
    (training_df, './training_material/siamese-data/training_df_contrastive.csv'),
    (valid_df, './training_material/siamese-data/valid_df_contrastive.csv'),
):
    frame['label'] = frame['label'].apply(int)
    frame[['comment', 'keypoint', 'domain', 'label']].to_csv(csv_path)

## Create separate sets of comments/KPs

In [20]:
# NOTE: this binds a second name to the same DatasetDict object; it is not a copy.
contrastive_datasets = datasets
contrastive_datasets

DatasetDict({
    train: Dataset({
        features: ['topic', 'comment', 'full_comment', 'keypoint', 'full_keypoint', 'domain', 'label'],
        num_rows: 64344
    })
    valid: Dataset({
        features: ['topic', 'comment', 'full_comment', 'keypoint', 'full_keypoint', 'domain', 'label'],
        num_rows: 4146
    })
})

In [21]:
# Rename the key-point columns to the `key_point` / `full_key_point` names that
# create_comment_kp_label_dfs groups by and selects.
contrastive_datasets = contrastive_datasets.rename_columns({'full_keypoint': 'full_key_point', 'keypoint': 'key_point'})

In [22]:
# Functions to label id
#
# `i` is a module-level running counter and `curr_topic` remembers the topic of
# the most recently labelled group. Both are shared by all three labelling
# functions below; the caller resets `i` to 0 before each groupby pass.
i = 0
curr_topic = ""


def label_group_id(grp_df):
    """Tag one group with the next running ``group_id``.

    Intended to be passed to ``DataFrame.groupby(...).apply``. The returned
    frame is a new object (the input group is not modified) with:

    * a ``group_id`` column holding the current value of the global ``i``;
    * a fresh 0..n-1 index, also exposed as a leading ``index`` column that
      records each row's position inside the group.

    Side effect: increments the global counter ``i`` by one.
    """
    global i
    # assign() builds a new frame instead of writing into the group object that
    # groupby.apply handed in (mutating it is not supported by pandas).
    labelled = grp_df.assign(group_id=i).reset_index(drop=True).reset_index()
    i += 1
    return labelled


def _label_per_topic_id(grp_df, id_column):
    """Tag one group with a running id that restarts at 0 for every new topic.

    The topic is read from the first row of ``grp_df['topic']``. When it
    differs from the global ``curr_topic`` the counter ``i`` restarts at 0.
    Returns a new frame with ``id_column`` set to the counter and a fresh
    0..n-1 index; the input group is not modified.

    Side effects: updates the globals ``i`` and ``curr_topic``.
    """
    global i
    global curr_topic
    topic = grp_df['topic'].iloc[0]
    if topic != curr_topic:
        i = 0
        curr_topic = topic
    labelled = grp_df.assign(**{id_column: i}).reset_index(drop=True)
    i += 1
    return labelled


def label_kp_id(grp_df):
    """Tag one (topic, key point) group with its per-topic ``kp_id``."""
    return _label_per_topic_id(grp_df, 'kp_id')


def label_comment_id(grp_df):
    """Tag one (topic, comment) group with its per-topic ``comm_id``."""
    return _label_per_topic_id(grp_df, 'comm_id')

In [23]:
def create_comment_kp_label_dfs(datasets, subset):
    """Split one subset of pair data into comment, key-point and label tables.

    The subset is shuffled (seed 42), read out as a pandas DataFrame and run
    through three groupby passes that attach ``group_id`` (per topic),
    ``comm_id`` (per topic/comment) and ``kp_id`` (per topic/key point) using
    the module-level labelling functions and their shared counter ``i``.

    Parameters
    ----------
    datasets : mapping from subset name to a dataset supporting ``shuffle``,
        ``set_format`` and slicing.
    subset : key selecting the dataset to process.

    Returns
    -------
    (comments_df, keypoints_df, labels_df)
        * comments_df: de-duplicated rows of
          ``comment_id, comment, full_comment, topic, domain``;
        * keypoints_df: de-duplicated rows of
          ``key_point_id, key_point, full_key_point, topic, domain``;
        * labels_df: one row per pair with
          ``comment_id, key_point_id, label`` (label cast to int).
        All three carry a fresh 0..n-1 index.
    """
    global i

    shuffled = datasets[subset].shuffle(seed=42)
    shuffled.set_format("pandas")
    pairs = shuffled[:].rename(columns={'text': 'comment'})

    # One pass per id column. The shared counter is reset before each pass,
    # and the passes must run in this order.
    labelling_passes = (
        (['topic'], label_group_id),
        (['topic', 'comment'], label_comment_id),
        (['topic', 'key_point'], label_kp_id),
    )
    for group_keys, labeller in labelling_passes:
        i = 0
        pairs = pairs.groupby(group_keys).apply(labeller).reset_index(drop=True)

    # Compose the public ids as "<prefix>_<group_id>_<per-topic id>".
    group_tag = pairs['group_id'].astype(str)
    pairs['comment_id'] = "comment_" + group_tag + "_" + pairs['comm_id'].astype(str)
    pairs['key_point_id'] = "kp_" + group_tag + "_" + pairs['kp_id'].astype(str)

    comment_columns = ['comment_id', 'comment', 'full_comment', 'topic', 'domain']
    comments_df = (
        pairs.sort_values(by=['group_id', 'comm_id'])[comment_columns]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    keypoint_columns = ['key_point_id', 'key_point', 'full_key_point', 'topic', 'domain']
    keypoints_df = (
        pairs.sort_values(by=['group_id', 'kp_id'])[keypoint_columns]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Labels keep every pair, ordered by topic group and by the row position
    # recorded in the 'index' column during the first pass.
    label_columns = ['comment_id', 'key_point_id', 'label']
    labels_df = pairs.sort_values(by=['group_id', 'index'])[label_columns]
    labels_df = labels_df.assign(label=labels_df['label'].apply(int)).reset_index(drop=True)

    return comments_df, keypoints_df, labels_df

In [24]:
# Export the three tables of each split; the "valid" split is saved under the "dev" suffix.
for subset, subset_save in (('train', 'train'), ('valid', 'dev')):
    comments_df, keypoints_df, labels_df = create_comment_kp_label_dfs(contrastive_datasets, subset)
    exports = (
        (comments_df, f"./training_material/siamese-data/comments_{subset_save}.csv"),
        (keypoints_df, f"./training_material/siamese-data/key_points_{subset_save}.csv"),
        (labels_df, f"./training_material/siamese-data/labels_{subset_save}.csv"),
    )
    for frame, csv_path in exports:
        frame.to_csv(csv_path, index=False)

In [25]:
comments_df

Unnamed: 0,comment_id,comment,full_comment,topic,domain
0,comment_0_0,very champagne <SEP> positive,I like the champagne very friendly staff .,drink -> alcohol,hotel
1,comment_0_1,very martinis <SEP> positive,Drinks specials for martinis were very good .,drink -> alcohol,restaurant
2,comment_0_2,a little too strong cocktail <SEP> negative,"The cocktail was also a little too strong for my liking , but I can see other people enjoying it .",drink -> alcohol,restaurant
3,comment_0_3,above average wine list <SEP> positive,Wine list was above average .,drink -> alcohol,restaurant
4,comment_0_4,absolute knockout pineapple martini <SEP> positive,The Pineapple Martini was an absolute knockout ! ! !,drink -> alcohol,restaurant
...,...,...,...,...,...
658,comment_2_262,wonderful stayed <SEP> positive,"Stayed a week , and it was wonderful !",wait-time,hotel
659,comment_2_263,worst spend 50 <SEP> negative,51$. Worst spend 50 dollars of my life .,wait-time,restaurant
660,comment_2_264,worth visit <SEP> positive,Definitely worth a visit .,wait-time,restaurant
661,comment_2_265,worth wait <SEP> positive,So worth the wait ! !,wait-time,restaurant
