In [1]:
import pandas as pd

# Custom
from processing import tag_utterances
from processing import load_sem_types
from processing import DataPipeline
# Let wide tables render up to 500 columns before pandas truncates the display.
pd.options.display.max_columns = 500

There are many ways the data could be formatted into training conversations:

* Option 1: All responses are equal
    * Treat every thread as a conversation
    * Treat every comment in the thread as a response to the original AskDocs post

## Option 1: All responses are equal

3 files to create:
* Thread conversations: Contains dialogue structure
* Thread lines: Contains actual text of each utterance
* Utterance metadata: Contains information about each utterance, such as whether the author was a clinician or moderator, data, score, etc.

In [2]:
# Point the pipeline at the raw comment and post dumps, then load the
# combined thread table that the rest of the notebook works on.
data_instance = DataPipeline(
    posts_path='../data/original_posts_under_askDocs_subreddit_id.gz',
    comments_path='../data/reddit_comments_askDocs_2014_to_2018_03.gz',
)
df = data_instance.load_full_thread()

# How many rows are flagged as the start of a thread (1.0) versus not (0.0).
print('Count of threads')
df.loc[:, 'is_thread_start'].value_counts()

Comments Table Shape: (557648, 24)
Posts table shape: (43615, 35)
30710
Final combined table shape: (139535, 28)
Count of threads


0.0    108825
1.0     30710
Name: is_thread_start, dtype: int64

In [32]:
# Stringify every author value element-wise (missing values become their str() form).
test = df['author'].map(str)

0

In [4]:
# Number of distinct thread ids; dropna=False keeps a missing id in the count,
# exactly as taking the length of .unique() would.
df['link_id_short'].nunique(dropna=False)

30711

In [56]:
# Load the posts table on its own and preview its first five rows.
posts = data_instance.load_posts()
posts.head(5)

Posts table shape: (43615, 34)


Unnamed: 0,domain,subreddit,selftext,saved,id,from_kind,gilded,from,stickied,title,num_comments,score,retrieved_on,over_18,thumbnail,subreddit_id,hide_score,link_flair_css_class,author_flair_css_class,downs,archived,is_self,from_id,permalink,name,created,url,author_flair_text,quarantine,author,created_utc,link_flair_text,ups,distinguished
0,self.AskDocs,AskDocs,\n Age: 28-32\n Sex: M\n Height: 6'\n...,False,3e50jf,,0,,False,Pain in pelvic floor / during arousal / urinat...,0,1,1440597166,False,self,t5_2xtuc,False,,default,0,False,True,,/r/AskDocs/comments/3e50jf/pain_in_pelvic_floo...,t3_3e50jf,1437528445,http://www.reddit.com/r/AskDocs/comments/3e50j...,This user has not yet been verified.,False,doctorplsrespond,1437524845,,1,
1,self.AskDocs,AskDocs,"27\nM\n6'0""\n170 lbs.\nWhite\n2+ weeks\nNorthe...",False,37x41u,,0,,False,What's wrong with me?,3,3,1440702925,False,self,t5_2xtuc,False,,default,0,False,True,,/r/AskDocs/comments/37x41u/whats_wrong_with_me/,t3_37x41u,1433045646,http://www.reddit.com/r/AskDocs/comments/37x41...,This user has not yet been verified.,False,dudeotd,1433042046,,3,
2,self.AskDocs,AskDocs,http://imgur.com/a/7g7qm\n\nI've had these war...,False,3j1td1,,0,,False,Warts on my fingers,1,1,1443153807,False,self,t5_2xtuc,False,,default,0,False,True,,/r/AskDocs/comments/3j1td1/warts_on_my_fingers/,t3_3j1td1,1441002958,https://www.reddit.com/r/AskDocs/comments/3j1t...,This user has not yet been verified.,False,SmellMyDirk,1440999358,,1,
3,self.AskDocs,AskDocs,"Hi /r/AskDocs\n\nI'm 24, female, 5'8, around 1...",False,2zalmm,,0,,False,Pulling my neck multiple times a day?,1,3,1440849814,False,self,t5_2xtuc,False,,default,0,False,True,,/r/AskDocs/comments/2zalmm/pulling_my_neck_mul...,t3_2zalmm,1426554355,http://www.reddit.com/r/AskDocs/comments/2zalm...,This user has not yet been verified.,False,DoDaMutt,1426550755,,3,
4,self.AskDocs,AskDocs,My SO is a 29 year old white male. No previous...,False,2xsrw1,,0,,False,"29/M Severe vomiting, diarrhea, and stomach cr...",1,1,1440875249,False,self,t5_2xtuc,False,,default,0,False,True,,/r/AskDocs/comments/2xsrw1/29m_severe_vomiting...,t3_2xsrw1,1425399776,http://www.reddit.com/r/AskDocs/comments/2xsrw...,This user has not yet been verified.,False,45MinutesOfRoadHead,1425399776,,1,


In [23]:
# Every distinct thread id in the combined table, in order of first appearance.
list_of_threads = df['link_id_short'].unique().tolist()


# Inspect a small sample (the first three threads only). For each thread print
# its id, the author of the opening post, the title slug taken from the thread
# URL, and the authors of the comments that reply directly to the post.
for thread in list_of_threads[:3]:
    print(thread)
    df_subset = df.loc[df['link_id_short'] == thread]

    # There must be exactly one opening post per thread. Count the flagged rows
    # directly: summing the *unique* flag values would pass for any number of
    # thread-start rows (and for NaN), so it would not enforce this.
    is_start = df_subset['is_thread_start'] == 1
    n_starts = int(is_start.sum())
    assert n_starts == 1, (
        'thread %s: expected exactly one thread-start row, found %d'
        % (thread, n_starts)
    )

    thread_author = str(df_subset.loc[is_start, 'author'].iloc[0]).strip()
    print('thread_author:', thread_author)

    # The title slug is the second-to-last path segment of the thread URL.
    # Use the first non-missing URL instead of a hard-coded position in the
    # unique() array, which depended on a missing value happening to come first.
    # NOTE(review): presumably only the opening post carries a URL - confirm.
    thread_urls = df_subset['url'].dropna().unique()
    thread_title = thread_urls[0].split('/')[-2]
    print('thread_title:', thread_title)

    # Authors of the top-level comments, i.e. rows whose parent is the thread itself.
    print(df_subset.loc[df_subset['parent_id_short'] == thread, 'author'].unique())
    # TODO: get the separate comments in that thread

    # Two blank lines between threads.
    print()
    print()

37o1az
thread_author: RissaWasTaken
thread_title: husband_deteriorating_before_my_eyes_doctors_at_a
(21, 28)
['kql' 'BrownIRL' 'Maysj18' 'fusepark' 'Bockabock' 'Medicine7' '[deleted]'
 'bigpandas' 'lilleboff' 'lurkERdoc' 'TuxPenguin1' 'ThoracicPark'
 'tetsugakusei' 'Ninnjawhisper' 'THE_WORST_CAT' 'fuckadownvote'
 'HolographicDonut' 'ifiwazatreeyouwldknw']

3exs68
thread_author: adenoma
thread_title: pleomorphic_adenoma_and_a_little_scared
(2, 28)
['KJTF' 'Senseismic']

399nb8
thread_author: apav
thread_title: chemical_sphincterotomy_botox_vs_lateral_internal
(1, 28)
['AtariBigby']



In [54]:
# Thread-start flags for the thread subset left over from the loop above.
df_subset.loc[:, 'is_thread_start']

1662      0.0
1663      0.0
63247     0.0
130437    0.0
231417    0.0
326683    0.0
326706    0.0
326708    0.0
370622    0.0
457032    0.0
457257    0.0
458748    0.0
459296    0.0
459301    0.0
459607    0.0
462764    0.0
464194    0.0
464227    0.0
464244    0.0
473529    0.0
476864    0.0
477759    0.0
478870    0.0
479824    0.0
480635    0.0
481682    0.0
483425    0.0
483931    0.0
485035    0.0
485071    0.0
         ... 
512055    0.0
512056    0.0
512057    0.0
512058    0.0
512059    0.0
512060    0.0
512061    0.0
512062    0.0
512063    0.0
512064    0.0
512065    0.0
512066    0.0
512067    0.0
512068    0.0
512069    0.0
512070    0.0
512071    0.0
512072    0.0
512326    0.0
513731    0.0
513732    0.0
515895    0.0
522742    0.0
522743    0.0
525109    0.0
530905    0.0
530907    0.0
530908    0.0
530916    0.0
530917    0.0
Name: is_thread_start, Length: 83, dtype: float64

In [None]:
# Frequency of each value in the `distinguished` column (missing values are not counted).
df['distinguished'].value_counts()