In [1]:
__author__ = "Juho Leinonen"
__copyright__ = "Copyright (c) 2020, Aalto Speech Research"
# Notebook to create the csv file.
# Imports
import pandas as pd
import random

# Combine messages following each other if by same sender
# Filter messages that are too short
# Pick from each group enough chats that there is total at least x good messages for r@k
# From those chats pick those x messages
# Other chats than those will go to train set
# For both sets create r@k set where every good line has k-1 false choices from all sentences of that set
# that have more than 10 characters.
# save to csv. format sentence ¤ correct next sentence | false 1 | false 2 | ... | false k-1
########################################################
######### Setting up the pipeline ######################
########################################################

# Fixed seed: every random draw made later in the notebook becomes repeatable
SEED = 415
random.seed(SEED)

# Input csv files: the chat conversations and the per-chat metadata
conversations_file = "../data/clean_data/finchat_200331.csv"
metadata_file = "../data/clean_data/meta_data_200304.csv"

# Load both files with identical parser settings (conversations first, then metadata)
conversations_df, metadata_df = [
    pd.read_csv(csv_path, sep=',', engine='python')
    for csv_path in (conversations_file, metadata_file)
]

# Largest chat id present; used as the upper bound when looping over chat ids
max_chat_id = conversations_df['CHAT_ID'].max()

# CHOOSE THESE: how many good lines to collect for evaluation from each group
desired_lines_from_staff = 100
desired_lines_from_students = 100
desired_lines_from_highschoolers = 100
desired_lines = [
    desired_lines_from_staff,
    desired_lines_from_students,
    desired_lines_from_highschoolers,
]

# CHOOSE THIS: k of recall@k, i.e. one correct sentence plus k - 1 false ones
recall_k = 10

# Whether the sanity-check cell prints its report
print_testing = True

In [2]:
############################ CODE ############################

# Three empty frames that share one column layout:
# the compressed chats, the train split and the evaluation split.
df_columns = ['CHAT_ID', 'SPEAKER_ID', 'TEXT', 'GOOD']
df_compressed, df_train, df_eval = (pd.DataFrame(columns=df_columns) for _ in range(3))

# Chat ids per group, keeping only chats whose OFFTOPIC value is the string '0'.
# GROUP codes 1, 2 and 3 are stored as staff, students and highschoolers respectively.
is_on_topic = metadata_df['OFFTOPIC'] == '0'
group_idxs = [
    metadata_df.loc[(metadata_df['GROUP'] == group_code) & is_on_topic, 'CHAT_ID'].unique().tolist()
    for group_code in (1, 2, 3)
]
staff_chat_idx, students_chat_idx, highschoolers_chat_idx = group_idxs

# chat id -> [number of rows, number of good rows]; a dictionary instead of
# a list because chat ids may have gaps
chat_lengths = {}

# Chat ids assigned to each split
eval_chat_idx = []
train_chat_idx = []

In [3]:
# Going over all chats, compressing them and marking good sentences.
# The compressed chats are gathered in a list and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop copies all previous rows on every iteration.
compressed_chats = []

for i in range(max_chat_id + 1):

    df_current_chat = conversations_df.loc[conversations_df['CHAT_ID'] == i, ['CHAT_ID', 'SPEAKER_ID', 'TEXT']]

    # In case the chat ID skips one or more
    if df_current_chat.empty:
        continue

    # Number every run of consecutive messages sent by the same speaker:
    # the counter increases each time the speaker differs from the previous row.
    adj_check = (df_current_chat['SPEAKER_ID'] != df_current_chat['SPEAKER_ID'].shift()).cumsum()

    # Merge each run into one row, joining its messages with " <MS> ".
    # Grouping on the run number alone (under its own name) avoids passing an
    # external key that is also called SPEAKER_ID together with as_index=False,
    # whose result layout differs between pandas versions.
    # Note: rows with a missing SPEAKER_ID are kept by this grouping.
    df_current_chat = (
        df_current_chat
        .groupby(adj_check.rename('RUN_ID'), sort=False)
        .agg({'SPEAKER_ID': 'first', 'TEXT': ' <MS> '.join})
        .reset_index(drop=True)
        .reindex(columns=['SPEAKER_ID', 'TEXT'])
    )
    # Adds the chat ID back as the first column.
    df_current_chat.insert(0, 'CHAT_ID', i)

    # Filter short sentences: a sentence is long enough with more than 10 characters.
    is_long = (df_current_chat['TEXT'].str.len() > 10).tolist()

    # A sentence is good only if it is long enough AND the following sentence
    # is long enough too, because the following sentence is what gets predicted.
    # Pairing the last sentence with False marks it as not good: it has no next sentence.
    mask_of_good_sentences = [
        current and following
        for current, following in zip(is_long, is_long[1:] + [False])
    ]

    df_current_chat["GOOD"] = mask_of_good_sentences

    # Total sentences and good sentences of this chat
    chat_lengths[i] = [df_current_chat.shape[0], sum(mask_of_good_sentences)]

    compressed_chats.append(df_current_chat)

# Add everything to the compressed version of the dataframe in one step
df_compressed = pd.concat([df_compressed, *compressed_chats], ignore_index=True)


In [4]:
def collect_eval_lines(idxs, amount_of_lines, lengths=None):
    """Randomly pick chats until they hold at least `amount_of_lines` good lines.

    Chats are drawn one at a time, without replacement, with `random.choice`,
    so the result depends on the state of the global `random` generator.

    Parameters
    ----------
    idxs : iterable
        Chat ids to pick from. The iterable itself is not modified.
    amount_of_lines : int
        Number of good lines that the picked chats should reach in total.
    lengths : mapping, optional
        Maps chat id to a pair whose second item is the number of good lines
        of that chat. Defaults to the notebook-level `chat_lengths`.

    Returns
    -------
    list
        The picked chat ids in the order they were drawn. When the candidates
        run out before the target is reached, a warning is issued and all
        candidates are returned instead of failing on an empty choice.

    Raises
    ------
    KeyError
        If a drawn chat id has no entry in `lengths`.
    """
    import warnings  # local import so the function does not rely on another cell

    if lengths is None:
        lengths = chat_lengths

    chat_idxs = []
    total_lines = 0
    idx_to_pick_from = list(idxs)
    while total_lines < amount_of_lines:
        if not idx_to_pick_from:
            # Every chat was used and the target is still not met
            warnings.warn(
                "Only {} good lines available, {} were requested".format(total_lines, amount_of_lines)
            )
            break

        current_chat_id = random.choice(idx_to_pick_from)

        idx_to_pick_from.remove(current_chat_id)
        chat_idxs.append(current_chat_id)

        total_lines += lengths[current_chat_id][1]
    return chat_idxs



# Build the evaluation set group by group. The picked frames are gathered in a
# list and concatenated once (DataFrame.append was removed in pandas 2.0).
eval_frames = []

for group_chats, lines_wanted in zip(group_idxs, desired_lines):
    chats_to_eval = collect_eval_lines(group_chats, lines_wanted)
    if not chats_to_eval:
        # No chat was picked for this group, so there is nothing to add
        continue

    # Needs to be done chat by chat, otherwise orders them by index.
    # e.g., the chats picked were 30, 30, 30, 80. So everything from the first
    # three and then 10 from fourth, if all using isin(iterator) they would be
    # maybe 30, 80, 30, 30.
    sentences_so_far = 0
    for chat in chats_to_eval[:-1]:
        eval_frames.append(df_compressed.loc[df_compressed['CHAT_ID'] == chat])
        sentences_so_far += chat_lengths[chat][1]
        eval_chat_idx.append(chat)

    # The last picked chat is only used up to the good sentence that fills the quota
    last_chat = chats_to_eval[-1]
    df_to_eval = df_compressed.loc[df_compressed['CHAT_ID'] == last_chat]

    rows_to_pick_from_last = 0
    for is_good in df_to_eval['GOOD']:
        rows_to_pick_from_last += 1
        if is_good:
            sentences_so_far += 1
            # Compare against this group's own quota instead of a fixed 100
            if sentences_so_far >= lines_wanted:
                break

    # One extra row: there is always a sentence after a GOOD one.
    # .copy() makes the edit below act on an independent frame, not on a slice of df_compressed.
    df_to_eval = df_to_eval.head(rows_to_pick_from_last + 1).copy()

    # The last kept row only serves as the next sentence of the row before it;
    # the chat is cut after it, so it cannot be used as a good line itself.
    if not df_to_eval.empty:
        df_to_eval.iloc[-1, df_to_eval.columns.get_loc('GOOD')] = False
    eval_frames.append(df_to_eval)
    eval_chat_idx.append(last_chat)

df_eval = pd.concat([df_eval, *eval_frames], ignore_index=True)

# Every chat that was not touched by the evaluation set goes to the train set;
# ignore_index resets the index of the result.
df_to_train = df_compressed.loc[~df_compressed['CHAT_ID'].isin(eval_chat_idx)]
df_train = pd.concat([df_train, df_to_train], ignore_index=True)


In [5]:
# Report on the split: topic distributions, set sizes and chat membership
if print_testing:
    # Topics of the chats that ended up in the evaluation set
    eval_set_topics = metadata_df.loc[metadata_df['CHAT_ID'].isin(eval_chat_idx), 'TOPIC'].value_counts()
    print("Evaluation set topics", eval_set_topics, sep="\n")

    # Topics of all chats of each group, in the order the groups are stored
    print("Whole set topics")
    for group_name, group_chat_ids in zip(("Staff", "Students", "Highschoolers"), group_idxs):
        print(group_name)
        print(metadata_df.loc[metadata_df['CHAT_ID'].isin(group_chat_ids), 'TOPIC'].value_counts())

    # Check some values: row counts of the splits against the compressed set
    eval_size, train_size = len(df_eval), len(df_train)
    size_report = (
        ("lines in evaluation set", eval_size),
        ("lines in train set", train_size),
        ("Lines eval + train", eval_size + train_size),
        ("Lines in compressed set", len(df_compressed)),
    )
    for label, value in size_report:
        print(label)
        print(value)

    # No chat should be in both train and eval, and none should be in neither
    eval_chats_set, train_chats_set, compressed_chats_set = (
        set(frame['CHAT_ID'].unique()) for frame in (df_eval, df_train, df_compressed)
    )

    print("eval chats")
    print(eval_chats_set)

    print("train chats")
    print(train_chats_set)

    print("eval and train intersection")
    print(eval_chats_set & train_chats_set)

    print("chats not in eval or train")
    print(compressed_chats_set - (eval_chats_set | train_chats_set))



Evaluation set topics
food          12
sports         8
music          6
literature     5
tv             4
traveling      4
Name: TOPIC, dtype: int64
Whole set topics
Staff
sports        22
traveling     14
movies        14
music          8
literature     8
food           8
tv             6
Name: TOPIC, dtype: int64
Students
food         10
traveling    10
Name: TOPIC, dtype: int64
Highschoolers
sports        15
tv            12
literature    10
Name: TOPIC, dtype: int64
lines in evaluation set
420
lines in train set
2020
Lines eval + train
2440
Lines in compressed set
2466
eval chats
{19, 20, 21, 22, 24, 31, 57, 61, 66, 70, 72, 78, 87, 88, 89, 90, 91, 92, 95}
train chats
{4, 6, 7, 9, 10, 13, 14, 15, 16, 17, 18, 23, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 71, 73, 74, 75, 76, 77, 79, 80, 81, 83, 84, 86, 93, 94, 96}
eval and train intersection
set()
chats not in eval or

In [6]:
# For the train and eval sets separately: every GOOD sentence gets a string of
# recall_k choices, the correct next sentence first, followed by recall_k - 1
# randomly drawn false sentences. False sentences come from the same set, must
# be longer than 10 characters, and can be neither the current sentence, its
# next sentence, nor a sentence already drawn for the same line.

# Dictionary used so the file can be written correctly as train or eval
df_dict = {"train" : df_train, "eval" : df_eval}

for df_name, df_set in df_dict.items():
    df_set_len = len(df_set)

    # Index labels of all sentences long enough to serve as a false choice
    long_enough = df_set["TEXT"].str.len() > 10
    indices_of_long_sentences = df_set.index[long_enough].tolist()

    next_sentence_list = []
    for index, is_good in df_set["GOOD"].items():
        # Sentences that are not GOOD only get a placeholder; they are filtered out below
        if not is_good:
            next_sentence_list.append("PASS")
            continue

        # Indices that may not be drawn; grows with every false choice so
        # that no sentence is picked more than once
        excluded = {index, index + 1}

        # The correct next sentence is always the first item
        choice_sentences = [df_set.at[index + 1, "TEXT"]]
        for _ in range(recall_k - 1):
            candidates = [idx for idx in indices_of_long_sentences if idx not in excluded]
            false_sentence_index = random.choice(candidates)
            excluded.add(false_sentence_index)
            choice_sentences.append(df_set.at[false_sentence_index, "TEXT"])

        next_sentence_list.append(" | ".join(choice_sentences))

    # One-column frame with the same row order as df_set, so the two can be put side by side
    df_next_sentence = pd.DataFrame({"CHOICE_SENTENCES": next_sentence_list})

    # Put side by side, keep only the good sentences, reset the index and keep the two needed columns
    # TODO find a unique separator, now ¤ works.
    df_new_set = (
        pd.concat([df_set, df_next_sentence], axis=1, sort=False)
        .query("GOOD")
        .reset_index()
        .loc[:, ["TEXT", "CHOICE_SENTENCES"]]
    )
    csv_name = f"../data/{df_name}_topX_recall_at_{recall_k}_{SEED}.csv"
    df_new_set.to_csv(csv_name, index=False, sep='¤')

