In [1]:
import pandas as pd
import datetime as dt
import time
import requests

In [2]:
url = "https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1"

In [3]:
res = requests.get(url)

In [4]:
res.status_code

200

In [5]:
json_data = res.json()
json_data['data'][0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'Tww_az',
 'author_flair_background_color': '#5c89c7',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': '41cc1230-ac7d-11e8-8473-0ed473d8dcfc',
 'author_flair_text': '[T1D 2019] [OmniPod] [Freestyle Libre]',
 'author_flair_text_color': 'light',
 'author_flair_type': 'text',
 'author_fullname': 't2_4slgmmy1',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1582131485,
 'domain': 'self.diabetes_t1',
 'full_link': 'https://www.reddit.com/r/diabetes_t1/comments/f6dklg/temp_basal_help/',
 'gildings': {},
 'id': 'f6dklg',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'li

In [6]:
def query_pushshift(subreddit, kind = 'submission', day_window = 1, n = 350):
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

In [8]:
results = query_pushshift('diabetes_t1')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=1d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=2d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=3d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=4d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=5d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=8d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=9d
Querying from: https://api.pushshift.io/reddit/search/s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=79d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=80d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=81d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=82d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=83d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=84d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=85d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=86d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=87d
Querying from: https://api.pushshift.io/reddit

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=156d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=157d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=158d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=159d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=160d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=161d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=162d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=163d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=164d
Querying from: https://api.pushshift.

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=233d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=234d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=235d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=236d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=237d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=238d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=239d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=240d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=241d
Querying from: https://api.pushshift.

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=310d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=311d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=312d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=313d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=314d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=315d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=316d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=317d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=diabetes_t1&size=500&after=318d
Querying from: https://api.pushshift.

In [9]:
results_diabetes_t1 = results

In [10]:
results_diabetes_t1.shape

(4463, 9)

In [11]:
results_diabetes_t1 = results.drop_duplicates()

In [12]:
results_diabetes_t1.shape

(4463, 9)

In [13]:
results_diabetes_t1.to_csv('diabetes_t1.csv', index = False)

In [14]:
t1 = pd.read_csv('diabetes_t1.csv')

In [15]:
t1.shape

(4463, 9)

In [16]:
t1.isnull().sum()

title             0
selftext        180
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [17]:
t1.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Parents with T1D kids on low carb diet. Any im...,Hi all. My son (7) is t1d and is on a low carb...,diabetes_t1,1582052715,midnightcaller,4,1,True,2020-02-18
1,HBA1C results (2 months since diagnosis),"Diagnosed with type 1 mid December 2019, admit...",diabetes_t1,1582054702,snazehhh,1,1,True,2020-02-18
2,Type 1 diabetes and mood swings,Does anyone here feel that their diabetes can ...,diabetes_t1,1582057153,b-lavender,5,1,True,2020-02-18
3,Bes countries to live in being diabetic,"hello, I wanted to know which are the best cou...",diabetes_t1,1582062986,tiagob94,0,1,True,2020-02-18
4,Best countries to live in being diabetic,"hello, I wanted to know which are the best cou...",diabetes_t1,1582063056,tiagob94,0,1,True,2020-02-18


In [18]:
all_text = []
selftexts = t1['selftext']

In [25]:
for text in selftexts:
    all_text.append(text)

In [31]:
all_text

["My son was just diagnosed with t1d and Its clear that supplies, and organization cost are going to be allot.\n\nWhen I had my own small sole proprietorship I was able to get things as a tax deduction or credit.\n\nI was wondering anyone had financial tricks or things to be on the look out for to save money.  From the day to day to the annual tax application.\n\nI have saved every recipet this far.\n\nI am planning to use my tax return and whatever I can this year to Max out my HSA in 2020.\n\nI figure buying alchol wipes in bulk, needles, and gause pads makes sense.  Is there like a super secret website that's amazing for that kinda stuff?",
 'So I\'ve recently had way to many set issues with Medtronic sets and my 670g.It\'s caused high blood sugars way more often than it should. Like A1C around 8.3 level highs every week. Ever since I took a break I\'ve gotten it down to 7.4. (6month interval) Should I even go back with numbers like that? The biggest issue I was having was the infam

In [30]:

wordlist = all_text

wordfreq = []
for w in wordlist:
    wordfreq.append(wordlist.count(w))

#print("String\n" + wordstring +"\n")
print("List\n" + str(wordlist) + "\n")
print("Frequencies\n" + str(wordfreq) + "\n")
print("Pairs\n" + str(list(zip(wordlist, wordfreq))))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

