In [10]:
import pandas as pd
from collections import defaultdict
import re
import os

In [11]:
# Base directory for the PAA data files (relative path, resolved against the working directory).
PAA_dir = "../data/PAA/"

In [12]:
# Load the tab-separated PAA export (PAAData.tsv under PAA_dir) into a DataFrame.
df = pd.read_csv(os.path.join(PAA_dir, "PAAData.tsv"), delimiter="\t")

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Query,Click Counts,PAA Questions,Dynamic Questions,Session Id,Session_ Page View Count,Request_ ImpressionGuid,Request_ Front Door Impression Event Id
0,how to clean eyeglasses that are cloudy,2,What is the best method for cleaning eyeglasse...,ClickedQ:Why are my eyeglasses always cloudy?_...,a44aa80c88bf7c7b28c95bd3f484273b,9,F9AECA1E68624BCB93FEE9BA29798588,8B1423B0B3834EF1B3FA22E9CC9E9978
1,descriptions of lucifer,1,Does the word 'lucifer' refer to Satan?__SUGGS...,"ClickedQ:Is ""Lucifer"" the Devil in Isaiah 14?_...",4bb28b7513e2e01c50d5835d0a2ecd4b,7,A6AC0D481DF64742B81771BC6A2A50C0,FFC3253CB63E461BB6CE77171E58D726
2,cubital release,1,What to expect with cubital tunnel release sur...,ClickedQ:What does cubital mean?__QUERYSEP__Wh...,4142113be451752481c22a54e09aa483,4,72C1DB0F6D8A4A94ADD55A106B5FC919,7F7D6FD3F2A04E9A80D777DE50D1AA8E
3,china aging population,1,Why does China have an aging population proble...,ClickedQ:What does China have an ageing popula...,86e8412bc9267dd65db76f9b49d872a0,46,27768DD68AF3472CBF3C93495D924AC8,5748BDF6BAD54A51A0F602848349FCE8
4,if your charger isn't working,2,Why is my phone charger not working?__SUGGSEP_...,ClickedQ:Why is my phone charger not working?_...,5225c0c34a6110a09f0c1c0247ad2f4b,117,7F153EBF06CA4983981C2D8CF9D4494F,895B60AB3E784EDF9F2DB6435F800387


In [4]:
# List the column names of the loaded frame.
print(df.columns)

Index(['Query', 'Click Counts', 'PAA Questions', 'Dynamic Questions',
       'Session Id', 'Session_ Page View Count', 'Request_ ImpressionGuid',
       'Request_ Front Door Impression Event Id'],
      dtype='object')


### Preprocess PAA sessions

In [5]:
# Separators embedded in the raw export strings.
SUGG_SEP = "__SUGGSEP__"    # between sibling suggestions
QUERY_SEP = "__QUERYSEP__"  # between a clicked question and its follow-up suggestions
NEXT_SEP = "__NEXT__"       # between consecutive click interactions
CLICKED_PREFIX = "ClickedQ:"


def _split_suggestions(raw):
    """Split a ``__SUGGSEP__``-joined field into a list of strings.

    Non-string values (e.g. the NaN that pandas produces for an empty cell)
    yield an empty list instead of raising ``AttributeError``.
    """
    return raw.split(SUGG_SEP) if isinstance(raw, str) else []


def _parse_interactions(raw, click_count):
    """Parse a 'Dynamic Questions' value into a list of interactions.

    The raw value looks like
    ``ClickedQ:<q>__QUERYSEP__<f1>__SUGGSEP__<f2>__NEXT__ClickedQ:<q>...``.

    Args:
        raw: The raw 'Dynamic Questions' value; non-strings give ``[]``.
        click_count: Upper bound on the number of interactions to keep
            (the row's 'Click Counts').

    Returns:
        A list of ``{'clicked_q': str, 'follow_ups': list[str]}`` dicts, one
        per ``__NEXT__``-separated segment, at most ``click_count`` long.
    """
    if not isinstance(raw, str):
        return []

    # Each segment is consumed exactly once, so a click count larger than the
    # number of segments no longer duplicates the last interaction.
    segments = [seg for seg in raw.split(NEXT_SEP) if seg]
    limit = max(int(click_count), 0)

    interactions = []
    for segment in segments[:limit]:
        # partition() never raises: a segment without __QUERYSEP__ is kept as
        # a clicked question with no follow-ups.
        clicked_query, found, follow_ups = segment.partition(QUERY_SEP)
        interactions.append({
            'clicked_q': clicked_query.replace(CLICKED_PREFIX, ""),
            'follow_ups': follow_ups.split(SUGG_SEP) if found else [],
        })
    return interactions


# Group the parsed rows by session id; a session may contain several rows.
sessions = defaultdict(list)

for session_entries in df.to_dict("records"):
    session_entries['PAA Questions'] = _split_suggestions(session_entries['PAA Questions'])
    session_entries['interactions'] = _parse_interactions(
        session_entries['Dynamic Questions'], session_entries['Click Counts']
    )
    sessions[session_entries['Session Id']].append(session_entries)

In [6]:
# Inspect one parsed session. Use .get() rather than indexing: `sessions` is a
# defaultdict, so indexing a missing id would silently insert an empty entry.
sessions.get('a44aa80c88bf7c7b28c95bd3f484273b', [])

[{'Query': 'how to clean eyeglasses that are cloudy',
  'Click Counts': 2,
  'PAA Questions': ['What is the best method for cleaning eyeglasses?',
   'Why are my eyeglasses always cloudy?',
   'How do you clean film off eyeglasses?',
   'What is the best way to clean plastic eyeglass lenses?'],
  'Dynamic Questions': 'ClickedQ:Why are my eyeglasses always cloudy?__QUERYSEP__Why do my contact lenses seem kind of cloudy?__SUGGSEP__Why are my new glasses blurry on one side?__NEXT__ClickedQ:How do you clean film off eyeglasses?__QUERYSEP__How often should I clean my eyeglasses?__SUGGSEP__How to clean clear film from eye?',
  'Session Id': 'a44aa80c88bf7c7b28c95bd3f484273b',
  'Session_ Page View Count': 9,
  'Request_ ImpressionGuid': 'F9AECA1E68624BCB93FEE9BA29798588',
  'Request_ Front Door Impression Event Id': '8B1423B0B3834EF1B3FA22E9CC9E9978',
  'interactions': [{'clicked_q': 'Why are my eyeglasses always cloudy?',
    'follow_ups': ['Why do my contact lenses seem kind of cloudy?',
 

### Create Dialogs

In [17]:
# Build one dialogue per row: the user's query followed by every PAA question
# the user clicked, in click order.
superset = []
pool = []  # not used in this cell; kept in case other cells rely on it
for sess_s in sessions.values():
    for sess in sess_s:
        conv = [sess['Query']]
        conv.extend(act['clicked_q'] for act in sess['interactions'])
        superset.append(conv)

# Number of dialogues echoed to the notebook; the file always gets all of them.
PREVIEW_LINES = 10

# One dialogue per line, every utterance terminated by " __eou__".
# The encoding is explicit so free-text queries are written the same way on
# every platform instead of depending on the locale's default encoding.
with open(os.path.join(PAA_dir, "train_dialogues.txt"), "w", encoding="utf-8") as f:
    for idx, conv in enumerate(superset):
        line = " __eou__ ".join(conv) + " __eou__"
        if idx < PREVIEW_LINES:
            print(line)
        f.write(line + "\n")

how to clean eyeglasses that are cloudy __eou__ Why are my eyeglasses always cloudy? __eou__ How do you clean film off eyeglasses? __eou__
descriptions of lucifer __eou__ Is "Lucifer" the Devil in Isaiah 14? __eou__
cubital release __eou__ What does cubital mean? __eou__
china aging population __eou__ What does China have an ageing population? __eou__
if your charger isn't working __eou__ Why is my phone charger not working? __eou__ Why is my phone not turning on at all? __eou__
conservatorship __eou__ What is a conservatorship, and how does it work? __eou__
definition day __eou__ What does day mean? __eou__ What does day mean in the Bible? __eou__
do i need a concerled gun permit in the state of iowa __eou__ What are the gun laws in Iowa? __eou__ What states honor Iowa concealed carry permit? __eou__
green tea benefits __eou__ What are the effects of drinking too much green tea? __eou__ What are some benefits of drinking green tea? __eou__ When to stop drinking green tea? __eou__ What

### Incorporating all system options

In [18]:
# Print each dialogue with the system's suggestions included: every user turn
# (the query, then each clicked question) is followed by the options shown for
# it, joined with " __comma__ ".
for sess_s in sessions.values():
    for sess in sess_s:
        conv = [sess['Query'], " __comma__ ".join(sess['PAA Questions'])]
        for act in sess['interactions']:
            conv.append(act['clicked_q'])
            conv.append(" __comma__ ".join(act['follow_ups']))
        # The terminator needs its leading space; without it the final
        # "__eou__" token is glued onto the last word of the dialogue.
        print(" __eou__ ".join(conv) + " __eou__")

how to clean eyeglasses that are cloudy __eou__ What is the best method for cleaning eyeglasses? __comma__ Why are my eyeglasses always cloudy? __comma__ How do you clean film off eyeglasses? __comma__ What is the best way to clean plastic eyeglass lenses? __eou__ Why are my eyeglasses always cloudy? __eou__ Why do my contact lenses seem kind of cloudy? __comma__ Why are my new glasses blurry on one side? __eou__ How do you clean film off eyeglasses? __eou__ How often should I clean my eyeglasses? __comma__ How to clean clear film from eye?__eou__
descriptions of lucifer __eou__ Does the word 'lucifer' refer to Satan? __comma__ Why is Satan sometimes called Lucifer? __comma__ Is "Lucifer" the Devil in Isaiah 14? __comma__ What does the Bible say about Lucifers role in Heaven? __eou__ Is "Lucifer" the Devil in Isaiah 14? __eou__ Why is 'Lucifer' referred to as 'son of the morning'?__eou__
cubital release __eou__ What to expect with cubital tunnel release surgery? __comma__ What does cub