In [1]:
#import necessary libraries
import requests
import json
import numpy as np
import pandas as pd
from csv import reader
import re

## Reading in PMID List and Preparing Data

In [36]:
#read in list of PMIDs from sample
PMIDlist = pd.read_csv('pmids-journal.csv')
PMIDlist

Unnamed: 0,PMID,PMCID,DOI,Journal,Total_Posts,Tweeters,Media,Videos,Facebook,Google_Plus,Reddit,Blogs,Altmetric_Score,URL
0,33052356,PMC7553173,10.1101/2020.10.07.20208231,medrxiv,33,33,,,,,,,19.250,http://dx.doi.org/10.1101/2020.10.07.20208231
1,33052353,PMC7553167,10.1101/2020.10.08.20209650,medrxiv,329,243,29.0,,,,,6.0,396.800,http://dx.doi.org/10.1101/2020.10.08.20209650
2,33052363,PMC7553188,10.1101/2020.10.02.20204859,medrxiv,6,6,,,,,,,2.250,http://dx.doi.org/10.1101/2020.10.02.20204859
3,33052360,PMC7553180,10.1101/2020.10.05.20206953,medrxiv,4,4,,,,,,,1.750,http://dx.doi.org/10.1101/2020.10.05.20206953
4,33052359,PMC7553179,10.1101/2020.10.07.20208488,medrxiv,32,30,1.0,,,,,,25.550,http://dx.doi.org/10.1101/2020.10.07.20208488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,32511426,PMC7217033,10.1101/2020.03.06.20032417,medrxiv,21,20,,,,,,,12.950,http://dx.doi.org/10.1101/2020.03.06.20032417
996,32511424,PMC7216959,10.1101/2020.02.21.20026328,medrxiv,117,111,,,,,,,66.150,http://dx.doi.org/10.1101/2020.02.21.20026328
997,32511423,PMC7216908,10.1101/2020.02.09.20021261,medrxiv,235,205,9.0,,,,,2.0,234.980,http://dx.doi.org/10.1101/2020.02.09.20021261
998,32511422,PMC7216848,10.1101/2020.01.28.20019224,medrxiv,86,67,7.0,,,,,1.0,106.622,http://dx.doi.org/10.1101/2020.01.28.20019224


In [32]:
#collects preprint server for each pmid. doesn't return medrxiv, only biorxiv. so all blank results are medrxiv.
def getJournal(pmid):
    noComment = 0
    journal = []
    try:
        requestLink = 'https://api.altmetric.com/v1/pmid/' + str(pmid)
        response = requests.get(requestLink)
        data = response.json()
        journal = data['journal']
    except : pass
    return journal


PMIDlist['Journal'] = PMIDlist['PMID'].apply(getJournal)
PMIDlist['Journal'] = PMIDlist['Journal'].astype(str)

In [3]:
#put all journals in lowercase before creating urls
PMIDlist['Journal'] = PMIDlist['Journal'].str.lower()

In [37]:
#create url using dois and journals
PMIDlist['URL'] = 'https://www.' + PMIDlist['Journal'].astype('str') + '.org/content/' + PMIDlist['DOI'].astype('str') + 'v1'
PMIDlist['Journal'] = PMIDlist['Journal'].replace({'biorxiv' : 'biorxivstage'})

## Collecting and Storing Comments

In [5]:
#function for getting request given a forum and link
def getDisqusComments (forum, link) :
    requestLink = 'https://disqus.com/api/3.0/threads/listPosts.json?forum=' + forum +'&thread=link%3a'+ link +'&api_key='+'UpxKr5429gnFZBeZgy1lzFyjD5BYu2DnQEukxi5KpdNHzBSYY0g1dtfrrBAIib3r'
    response = requests.get(requestLink)
    data = response.json()
    return data

In [6]:
#collect comments. errorCount variable tests to make sure all comments are found
results_list = []
errorCount = 0

for index, row in PMIDlist.iterrows():
    d = row.to_dict()
    json = getDisqusComments(d['Journal'], d['URL'])
    if json['code'] == 0 :
        list_prep = json['response'].copy()
        results_list.append(list_prep)
    else :
        errorCount += 1

In [7]:
errorCount

0

In [8]:
#empty dataframe for storing results
data = pd.DataFrame(columns=['author', 'canVote', 'createdAt', 'dislikes', 'editableUnitl', 'forum', 'id', 'isApproved',
                           'isAtFlagLimit', 'isDeleted', 'isDeletedByAuthor', 'isEdited', 'isFlagged', 'isHighlighted', 'isSpam',
                           'likes', 'media', 'message', 'moderationLabels', 'numReports', 'parent', 'points', 'raw_message',
                           'sb', 'thread'])

In [9]:
#iterate through list to place comments into dataframe
for i in results_list:
    if bool(i) == True:
        data = data.append(i, sort=False)
    else :
        pass

In [23]:
#view comment dataframe
data

Unnamed: 0,author,canVote,createdAt,dislikes,editableUnitl,forum,id,isApproved,isAtFlagLimit,isDeleted,...,media,message,moderationLabels,numReports,parent,points,raw_message,sb,thread,editableUntil
0,"{'username': 'jennifergoldfarb', 'about': '', ...",False,2020-10-21T01:15:29,0,,biorxivstage,5118418045,True,False,False,...,[],<p>I am a graduate student at Johns Hopkins an...,[],0,,0,I am a graduate student at Johns Hopkins and i...,False,8227042830,2020-10-28T01:15:29
0,"{'username': 'glipsnort', 'about': 'Geneticist...",False,2020-09-24T19:07:43,0,,medrxiv,5083844729,True,False,False,...,[],<p>The paper reports that Rh-positive blood ty...,[],0,,0,The paper reports that Rh-positive blood type ...,False,8213393965,2020-10-01T19:07:43
1,"{'username': 'disqus_qe356HFV5Z', 'about': '',...",False,2020-09-24T14:55:10,0,,medrxiv,5083489431,True,False,False,...,[],"<p>Enjoyable reading, instructive work, I war...",[links],0,,1,"Enjoyable reading, instructive work, I warmly...",False,8213393965,2020-10-01T14:55:10
0,"{'username': 'disqus_NgAANDJMma', 'about': '',...",False,2020-10-05T12:42:45,0,,biorxivstage,5098043003,True,False,False,...,[],"<p><a href=""https://disq.us/url?url=https%3A%2...",[links],0,,0,https://www.immunology.ox.ac.uk/covid-19/covid...,False,8211385649,2020-10-12T12:42:45
0,"{'username': 'disqus_9yn7Y6RUwm', 'about': '',...",False,2020-11-03T04:48:03,0,,biorxivstage,5135634790,True,False,False,...,[],<p>We have recently published the structural w...,[links],0,,0,We have recently published the structural work...,False,8209714272,2020-11-10T04:48:03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,"{'username': 'coniinthegarden', 'about': '', '...",False,2020-07-16T23:25:03,0,,medrxiv,4994780081,True,False,False,...,[],<p>It is still important as means of slowing t...,[],0,4906210750.0,0,It is still important as means of slowing the ...,False,7977087794,2020-07-23T23:25:03
1,"{'username': 'disqus_3eLBJohjJJ', 'about': '',...",False,2020-06-06T00:06:57,0,,medrxiv,4942740436,True,False,False,...,[],<p>Although I agree with you that it's pointle...,[],0,4906210750.0,0,Although I agree with you that it's pointless ...,False,7977087794,2020-06-13T00:06:57
2,"{'username': 'steve_gelfand', 'about': '', 'na...",False,2020-05-10T03:27:47,0,,medrxiv,4908001814,True,False,False,...,[],<p>It’s all about doing what you can to reduce...,[],0,4906210750.0,1,It’s all about doing what you can to reduce vi...,False,7977087794,2020-05-17T03:27:47
3,"{'username': 'sunrydz', 'about': '', 'name': '...",False,2020-05-08T16:24:34,0,,medrxiv,4906210750,True,False,False,...,[],<p>If this virus can survive in the air for so...,[],0,,2,If this virus can survive in the air for so lo...,False,7977087794,2020-05-15T16:24:34


In [86]:
#save comments to local folder
data.to_csv('Disqus-comments.csv', index = False)

## Collecting PMIDs to Reconnect Comments to Articles

In [31]:
#function for getting request given a forum and link
def getlinks (forum, thread) :
    requestLink = 'https://disqus.com/api/3.0/threads/details.json?forum=' + forum + '&thread=' + thread +'&api_key='+'jlac3sVtiBaBstx1LfetUFTjYr1sEvhdlLD9To3LEV6lpuVTcCnxX4LX8gaQfNOn'
    response = requests.get(requestLink)
    data = response.json()
    link = data['response']['link']
    return link

In [32]:
#testing function on one article
getlinks('medrxiv', '8213393965')

'https://www.medrxiv.org/content/10.1101/2020.09.22.20199125v1'

In [34]:
#use function to collect article links
data['article_link'] = np.vectorize(getlinks)(data['forum'], data['thread'])

In [75]:
#get DOI from Disqus article links so that we can merge back with PMIDs
#getDOI function removes all link content prior to DOI and the concluding period
#replaceV1 removes v1 from the end of the link

def getDOI(x) :
    doi = re.search(r'(?<=content/).*[^. ]', x)
    return doi.group(0)

data['DOI'] = data['article_link'].apply(getDOI)
def replaceV1 (x) :
    return x.replace('v1', '')
data['DOI'] = data['DOI'].apply(replaceV1)
data['DOI']

In [80]:
#Full outer join to merge original PMID list with comments
final_df = pd.merge(PMIDlist, data, left_on='DOI', right_on='DOI', how='outer')

In [83]:
#Preview final dataframe
final_df

Unnamed: 0,PMID,PMCID,DOI,Journal,Total_Posts,Tweeters,Media,Videos,Facebook,Google_Plus,...,message,moderationLabels,numReports,parent,points,raw_message,sb,thread,editableUntil,article_link
0,33052356,PMC7553173,10.1101/2020.10.07.20208231,medrxiv,33,33,,,,,...,,,,,,,,,,
1,33052353,PMC7553167,10.1101/2020.10.08.20209650,medrxiv,329,243,29.0,,,,...,,,,,,,,,,
2,33052363,PMC7553188,10.1101/2020.10.02.20204859,medrxiv,6,6,,,,,...,,,,,,,,,,
3,33052360,PMC7553180,10.1101/2020.10.05.20206953,medrxiv,4,4,,,,,...,,,,,,,,,,
4,33052359,PMC7553179,10.1101/2020.10.07.20208488,medrxiv,32,30,1.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,32511426,PMC7217033,10.1101/2020.03.06.20032417,medrxiv,21,20,,,,,...,,,,,,,,,,
1284,32511424,PMC7216959,10.1101/2020.02.21.20026328,medrxiv,117,111,,,,,...,,,,,,,,,,
1285,32511423,PMC7216908,10.1101/2020.02.09.20021261,medrxiv,235,205,9.0,,,,...,,,,,,,,,,
1286,32511422,PMC7216848,10.1101/2020.01.28.20019224,medrxiv,86,67,7.0,,,,...,,,,,,,,,,


In [84]:
#remove superfluous comments unrelated to this project
final_df = final_df.drop(['Total_Posts', 'Tweeters', 'Media', 'Videos', 'Facebook', 'Google_Plus', 'Altmetric_Score', 'dislikes', 'Reddit', 'Blogs', 'canVote', 'likes', 'media', 'moderationLabels', 'numReports', 'points', 'sb', 'editableUntil'], axis=1)

In [85]:
final_df

Unnamed: 0,PMID,PMCID,DOI,Journal,URL,author,createdAt,editableUnitl,forum,id,...,isDeletedByAuthor,isEdited,isFlagged,isHighlighted,isSpam,message,parent,raw_message,thread,article_link
0,33052356,PMC7553173,10.1101/2020.10.07.20208231,medrxiv,https://www.medrxiv.org/content/10.1101/2020.1...,,,,,,...,,,,,,,,,,
1,33052353,PMC7553167,10.1101/2020.10.08.20209650,medrxiv,https://www.medrxiv.org/content/10.1101/2020.1...,,,,,,...,,,,,,,,,,
2,33052363,PMC7553188,10.1101/2020.10.02.20204859,medrxiv,https://www.medrxiv.org/content/10.1101/2020.1...,,,,,,...,,,,,,,,,,
3,33052360,PMC7553180,10.1101/2020.10.05.20206953,medrxiv,https://www.medrxiv.org/content/10.1101/2020.1...,,,,,,...,,,,,,,,,,
4,33052359,PMC7553179,10.1101/2020.10.07.20208488,medrxiv,https://www.medrxiv.org/content/10.1101/2020.1...,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,32511426,PMC7217033,10.1101/2020.03.06.20032417,medrxiv,https://www.medrxiv.org/content/10.1101/2020.0...,,,,,,...,,,,,,,,,,
1284,32511424,PMC7216959,10.1101/2020.02.21.20026328,medrxiv,https://www.medrxiv.org/content/10.1101/2020.0...,,,,,,...,,,,,,,,,,
1285,32511423,PMC7216908,10.1101/2020.02.09.20021261,medrxiv,https://www.medrxiv.org/content/10.1101/2020.0...,,,,,,...,,,,,,,,,,
1286,32511422,PMC7216848,10.1101/2020.01.28.20019224,medrxiv,https://www.medrxiv.org/content/10.1101/2020.0...,,,,,,...,,,,,,,,,,


In [87]:
#save a local copy of dataframe as csv
final_df.to_csv('Disqus-comments-all-articles.csv', index = False)