In [15]:
# Getting comments on posts where physicians have responded

import pandas as pd
import json

# Load the high-engagement posts from the selected-posts CSV (assumes it was saved earlier).
df_selected_posts = pd.read_csv("selected_posts.csv")

# Build the id lookup once. Set membership is O(1) per comment, whereas
# `x in Series.values` rescans the whole id array for every line of the dump.
selected_post_ids = set(df_selected_posts["id"])

# Collect every comment that (a) belongs to a selected post and (b) was written
# by an author whose flair is exactly 'Physician'.
# NOTE: despite the .csv extension, this file is parsed as one JSON object per line.
physician_comments = []
with open("AskDocs_comments.csv", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of handing them to json.loads (which would
            # raise). Iteration already stops at end of file, so no `break` is needed.
            continue
        comment = json.loads(line)

        # Cheap flair check first: most comments are not from physicians.
        if comment.get('author_flair_text') != 'Physician':
            continue

        # Reddit link_id format is 't3_<post_id>'; drop the prefix to get the bare
        # post id. Records without a well-formed link_id are ignored rather than
        # raising KeyError.
        link_id = comment.get('link_id') or ""
        if link_id.startswith("t3_") and link_id[3:] in selected_post_ids:
            physician_comments.append(comment)

# Convert the list of physician comments to a DataFrame (one row per comment).
df_physician_comments = pd.DataFrame(physician_comments)

# Persist the filtered comments so later steps do not need to rescan the dump.
df_physician_comments.to_csv("physician_comments.csv", index=False)

# Display the first few entries to verify
df_physician_comments.head()


Unnamed: 0,author,link_id,subreddit,distinguished,subreddit_id,author_flair_css_class,archived,created_utc,parent_id,retrieved_on,...,ups,score_hidden,edited,gilded,id,author_flair_text,body,downs,removal_reason,name
0,murpahurp,t3_242w3w,AskDocs,,t5_2xtuc,verified-doc,True,1398591343,t3_242w3w,1433541050,...,9,False,False,0,ch379tz,Physician,You have been having anxiety attacks and you a...,0,,t1_ch379tz
1,driconoclast,t3_247m3y,AskDocs,,t5_2xtuc,verified-doc,True,1398741985,t3_247m3y,1433567945,...,2,False,False,0,ch4r881,Physician,Psychologists are usually pretty good diagnost...,0,,t1_ch4r881
2,driconoclast,t3_247m3y,AskDocs,,t5_2xtuc,verified-doc,True,1398817521,t1_ch5453h,1433581378,...,1,False,False,0,ch5j65m,Physician,hyperventilation syndrome and peripheral verti...,0,,t1_ch5j65m
3,driconoclast,t3_24qmjy,AskDocs,,t5_2xtuc,verified-doc,True,1399281200,t3_24qmjy,1433657280,...,3,False,False,0,ch9xk1n,Physician,This sounds like something a rheumatologist wo...,0,,t1_ch9xk1n
4,murpahurp,t3_24rtml,AskDocs,,t5_2xtuc,verified-doc,True,1399296569,t3_24rtml,1433658836,...,4,False,False,0,cha0prk,Physician,"Shingles can occur at any age, though indeed h...",0,,t1_cha0prk


In [16]:
# List the column names available on the physician-comments frame.
df_physician_comments.columns

Index(['author', 'link_id', 'subreddit', 'distinguished', 'subreddit_id',
       'author_flair_css_class', 'archived', 'created_utc', 'parent_id',
       'retrieved_on', 'score', 'controversiality', 'ups', 'score_hidden',
       'edited', 'gilded', 'id', 'author_flair_text', 'body', 'downs',
       'removal_reason', 'name'],
      dtype='object')

In [17]:
# List the column names available on the selected-posts frame.
df_selected_posts.columns

Index(['link_flair_css_class', 'gilded', 'name', 'id', 'archived',
       'author_flair_css_class', 'num_comments', 'saved', 'over_18', 'is_self',
       'from', 'author', 'stickied', 'from_id', 'secure_media', 'url',
       'subreddit_id', 'created', 'from_kind', 'score', 'edited', 'quarantine',
       'domain', 'hide_score', 'created_utc', 'author_flair_text',
       'media_embed', 'subreddit', 'title', 'secure_media_embed', 'downs',
       'thumbnail', 'distinguished', 'link_flair_text', 'permalink', 'ups',
       'retrieved_on', 'selftext', 'media', 'selftext_html', 'user_reports',
       'mod_reports', 'banned_by', 'report_reasons'],
      dtype='object')

In [19]:
# Add a `clean_link_id` column: the comment's link_id with the literal "t3_"
# text removed (plain substring replacement, not a regex), so it can be
# compared against the bare post ids.
df_physician_comments["clean_link_id"] = (
    df_physician_comments["link_id"]
    .str.replace("t3_", "", regex=False)
)

In [20]:
# Join each physician comment to its post: the comment's stripped link id is
# matched against the post's `id`. Column names present in both frames are
# disambiguated with "_comment" / "_post" suffixes.
merged_df = pd.merge(
    df_physician_comments,
    df_selected_posts,
    how="inner",
    left_on="clean_link_id",
    right_on="id",
    suffixes=("_comment", "_post"),
)

In [25]:
# List the merged frame's columns, including the "_comment" / "_post" suffixed ones.
merged_df.columns

Index(['author_comment', 'link_id', 'subreddit_comment',
       'distinguished_comment', 'subreddit_id_comment',
       'author_flair_css_class_comment', 'archived_comment',
       'created_utc_comment', 'parent_id', 'retrieved_on_comment',
       'score_comment', 'controversiality', 'ups_comment', 'score_hidden',
       'edited_comment', 'gilded_comment', 'id_comment',
       'author_flair_text_comment', 'body', 'downs_comment', 'removal_reason',
       'name_comment', 'clean_link_id', 'link_flair_css_class', 'gilded_post',
       'name_post', 'id_post', 'archived_post', 'author_flair_css_class_post',
       'num_comments', 'saved', 'over_18', 'is_self', 'from', 'author_post',
       'stickied', 'from_id', 'secure_media', 'url', 'subreddit_id_post',
       'created', 'from_kind', 'score_post', 'edited_post', 'quarantine',
       'domain', 'hide_score', 'created_utc_post', 'author_flair_text_post',
       'media_embed', 'subreddit_post', 'title', 'secure_media_embed',
       'downs_pos

In [27]:
# Show the raw link_id values of the physician comments (these carry the "t3_" prefix).
df_physician_comments["link_id"]

0     t3_242w3w
1     t3_247m3y
2     t3_247m3y
3     t3_24qmjy
4     t3_24rtml
5     t3_24rtml
6     t3_2762v8
7     t3_27k4sw
8     t3_27k4sw
9     t3_27urbk
10    t3_283z7s
11    t3_283z7s
12    t3_29wbcs
13    t3_29wbcs
14    t3_29zkry
15    t3_2bwc65
16    t3_2bwc65
17    t3_2cxe62
18    t3_2d2bxp
19    t3_2cwqpo
20    t3_2cwqpo
21    t3_2cwqpo
22    t3_2e6rvt
23    t3_2ek0le
24    t3_2f332w
Name: link_id, dtype: object

In [28]:
# Show the `id` column of the selected posts (the key the comments are joined on).
df_selected_posts["id"]

0     2efzm2
1     23u7jp
2     27k4sw
3     1xqkls
4     2cwqpo
       ...  
95    2cqjob
96    2d0qbk
97    2d2bxp
98    2d6q35
99    2d9t7v
Name: id, Length: 100, dtype: object

In [33]:
# Side-by-side view of the join keys: the comment's own id, its stripped link id,
# and the id of the post it was matched to.
merged_df.loc[:, ["id_comment", "clean_link_id", "id_post"]]

Unnamed: 0,id_comment,clean_link_id,id_post
0,ch379tz,242w3w,242w3w
1,ch4r881,247m3y,247m3y
2,ch5j65m,247m3y,247m3y
3,ch9xk1n,24qmjy,24qmjy
4,cha0prk,24rtml,24rtml
5,cha11t8,24rtml,24rtml
6,chxzcx6,2762v8,2762v8
7,ci1l54o,27k4sw,27k4sw
8,ci1lcja,27k4sw,27k4sw
9,ci53jlo,27urbk,27urbk


In [31]:
# Inspect the physician comment rows whose link_id points at one example post.
df_physician_comments.loc[df_physician_comments["link_id"].eq('t3_242w3w')]

Unnamed: 0,author,link_id,subreddit,distinguished,subreddit_id,author_flair_css_class,archived,created_utc,parent_id,retrieved_on,...,score_hidden,edited,gilded,id,author_flair_text,body,downs,removal_reason,name,clean_link_id
0,murpahurp,t3_242w3w,AskDocs,,t5_2xtuc,verified-doc,True,1398591343,t3_242w3w,1433541050,...,False,False,0,ch379tz,Physician,You have been having anxiety attacks and you a...,0,,t1_ch379tz,242w3w


In [None]:
# TODO: figure out a more robust way to map responses to post ids. For now the
# join key is the post title, which is only reliable while titles are unique.


def _attach_post_ids(responses_path, title_to_post_id):
    """Add a `post_id` column to a prompt-response CSV and save it in place.

    Parameters
    ----------
    responses_path : str
        Path of the CSV to read and then overwrite. It must contain a `title` column.
    title_to_post_id : dict
        Mapping from post title to post identifier.

    Returns
    -------
    pandas.DataFrame
        The responses with the new `post_id` column. Titles missing from the
        mapping get NaN there.
    """
    responses = pd.read_csv(responses_path)
    responses['post_id'] = responses['title'].map(title_to_post_id)

    # Surface rows that could not be matched instead of silently writing NaN ids.
    unmatched = int(responses['post_id'].isna().sum())
    if unmatched:
        print(f"Warning: {unmatched} row(s) in {responses_path} have no matching post title.")

    responses.to_csv(responses_path, index=False)
    return responses


# Load the DataFrame containing the selected posts with their post IDs
df_selected_posts = pd.read_csv("selected_posts.csv")

# A dict can hold only one post per title: when titles repeat, the last post wins.
# Report that case so the ambiguity is visible.
duplicate_titles = int(df_selected_posts['title'].duplicated().sum())
if duplicate_titles:
    print(f"Warning: {duplicate_titles} selected post(s) repeat an earlier title; the last one is used.")

# Create a dictionary mapping from title to the post's `name` column
# (presumably the "t3_"-prefixed fullname -- confirm against selected_posts.csv).
title_to_post_id = df_selected_posts.set_index('title')['name'].to_dict()

# Map post_id by title in each prompt-response file and save the result in place
df_prompt1_responses = _attach_post_ids("prompt1_responses.csv", title_to_post_id)
df_prompt2_responses = _attach_post_ids("prompt2_responses.csv", title_to_post_id)