# Reddit Data Extraction

## Import Libraries

In [4]:
import datetime
import json
import os
import pickle
import re #regex
import sys
import time
from collections import Counter

import ffn #stock data
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
#nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
import praw #reddit API enhanced library
import requests #APIs
import seaborn as sn
import tensorflow as tf
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras #lstm
from textblob import TextBlob

#!{sys.executable} -m pip install praw
#!{sys.executable} -m pip install textblob
#!{sys.executable} -m pip install ffn
#!{sys.executable} -m pip install tensorflow
# Source: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

## Collect Data from Reddit

### Pull Post Data from r/stocks and r/wallstreetbets

### Limited to Daily Discussion Posts

In [7]:
#function to get data from pushshift api
def getPushshiftData(query, after, before, sub, retries=3, timeout=30):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Parameters
    ----------
    query : str
        Text sent as the ``title`` search parameter (may be empty).
    after, before : int or str
        Epoch-second values sent as the ``after`` / ``before`` parameters.
    sub : str
        Subreddit name sent as the ``subreddit`` parameter.
    retries : int, optional
        How many extra attempts to make when the service answers with
        HTTP 429 or a 5xx status. Waits 1s, 2s, 4s, ... between attempts.
    timeout : float, optional
        Seconds to wait for the server before giving up on one attempt.

    Returns
    -------
    list of dict
        The ``data`` array of the JSON response (empty when nothing matches).

    Raises
    ------
    requests.HTTPError
        If the final attempt still returns an error status.
    json.JSONDecodeError
        If a successful response does not contain valid JSON.
    KeyError
        If the JSON document has no ``data`` field.
    """
    endpoint = 'https://api.pushshift.io/reddit/search/submission/'
    # Let requests build the query string so spaces and special characters
    # in the title query are URL-encoded correctly.
    params = {
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    }
    attempts = max(int(retries), 0) + 1
    for attempt in range(attempts):
        r = requests.get(endpoint, params=params, timeout=timeout)
        print(r.url)
        retryable = r.status_code == 429 or r.status_code >= 500
        if retryable and attempt < attempts - 1:
            # Back off before retrying instead of failing later on a
            # non-JSON error page.
            time.sleep(2 ** attempt)
            continue
        # Surface HTTP errors explicitly rather than as a JSON parse failure.
        r.raise_for_status()
        data = json.loads(r.text)
        return data['data']


#Source: https://medium.com/analytics-vidhya/sentiment-analysis-for-trading-with-reddit-text-data-73729c931d01

In [8]:
#get relevant data from data extracted using previous function
def collectSubData(subm):
    """Append one summary row for a submission to the global ``subStats`` list.

    The row is ``[id, title, url, created date, flair]``. The date is derived
    from ``created_utc`` with ``datetime.fromtimestamp``, i.e. in the local
    timezone of the machine running the notebook. When the submission has no
    ``link_flair_text`` key the flair is recorded as the string "NaN".
    """
    row = [
        subm['id'],
        subm['title'],
        subm['url'],
        datetime.datetime.fromtimestamp(subm['created_utc']).date(),
    ]
    # Flair is optional in the API payload; fall back to the placeholder.
    row.append(subm.get('link_flair_text', "NaN"))
    subStats.append(row)

### r/stocks

In [6]:
# Query configuration: "Daily Discussion" threads in r/stocks.
sub = 'stocks'
query = "Daily Discussion"  # sent as the title search term

# Search window as epoch-second strings.
# The start is the earliest date that Daily Discussions became consistent
# (matched all stocks trading days).
after = '1528610400'   #6/10/2018
before = '1618380000'  #4/14/2021

# Other windows tried while working with smaller datasets:
#   after  = '1262329200'  #1/1/2010
#   after  = '1577862000'  #1/1/2020
#   after  = "1498867200"  #7/1/2017
#   before = '1616220000'  #3/20/2021
#   before = '1580540400'  #2/1/2020
#   before = "1594339200"  #7/10/2020

# Accumulators filled while paging through the results.
subStats = []
subCount = 0

In [7]:
# Fetch only the first page of matching submissions.
# The paging loop in a later cell keeps requesting pages, moving `after`
# forward, until the window up to `before` is exhausted.
data = getPushshiftData(query, after, before, sub)

https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1528610400&before=1618380000&subreddit=stocks


In [8]:
data

[{'author': '_taeyeon',
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_text': None,
  'author_flair_type': 'text',
  'can_mod_post': False,
  'contest_mode': False,
  'created_utc': 1528722573,
  'domain': 'self.stocks',
  'full_link': 'https://www.reddit.com/r/stocks/comments/8q9egq/what_happened_to_the_daily_stock_discussion_thread/',
  'gilded': 0,
  'id': '8q9egq',
  'is_crosspostable': True,
  'is_original_content': False,
  'is_reddit_media_domain': False,
  'is_self': True,
  'is_video': False,
  'link_flair_richtext': [],
  'link_flair_text_color': 'dark',
  'link_flair_type': 'text',
  'locked': False,
  'media_only': False,
  'no_follow': True,
  'num_comments': 16,
  'num_crossposts': 0,
  'over_18': False,
  'parent_whitelist_status': 'all_ads',
  'permalink': '/r/stocks/comments/8q9egq/what_happened_to_the_daily_stock_discussion_thread/',
  'pinned': False,
  'pwls': 6,
  'retrieved_on': 1528722574,
  'rte_mode': 'markdown',
  'score': 55,

In [9]:
# Page through the results until the API returns an empty list.
while data:
    for submission in data:
        collectSubData(submission)
        subCount += 1
    # Progress report: page size and creation time of the last submission.
    print(len(data))
    print(datetime.datetime.fromtimestamp(data[-1]['created_utc']))
    # The next request starts from the created date of the last submission.
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before, sub)

#Source: https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563

99
2018-10-25 05:06:57
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1540465617&before=1618380000&subreddit=stocks
100
2019-03-15 06:09:10
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1552651750&before=1618380000&subreddit=stocks
100
2019-08-02 02:07:13
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1564733233&before=1618380000&subreddit=stocks
100
2019-12-24 01:06:44
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1577174804&before=1618380000&subreddit=stocks
100
2020-05-12 02:07:21
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1589270841&before=1618380000&subreddit=stocks
100
2020-09-30 02:06:17
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1601453177&before=1618380000&subreddit=stocks
100
2021-02-24 03:30:12
https://api.pushs

In [10]:
subStats

[['8q9egq',
  'What happened to the daily stock discussion thread?',
  'https://www.reddit.com/r/stocks/comments/8q9egq/what_happened_to_the_daily_stock_discussion_thread/',
  datetime.date(2018, 6, 11),
  'NaN'],
 ['8qio6f',
  'Daily discussion - r/Stocks Tuesday - Jun 12, 2018',
  'https://www.reddit.com/r/stocks/comments/8qio6f/daily_discussion_rstocks_tuesday_jun_12_2018/',
  datetime.date(2018, 6, 12),
  'NaN'],
 ['8qrkat',
  'Daily discussion - r/Stocks Wednesday - Jun 13, 2018',
  'https://www.reddit.com/r/stocks/comments/8qrkat/daily_discussion_rstocks_wednesday_jun_13_2018/',
  datetime.date(2018, 6, 13),
  'NaN'],
 ['8r0yzs',
  'Daily discussion - r/Stocks Thursday - Jun 14, 2018',
  'https://www.reddit.com/r/stocks/comments/8r0yzs/daily_discussion_rstocks_thursday_jun_14_2018/',
  datetime.date(2018, 6, 14),
  'NaN'],
 ['8ra3vn',
  'Daily discussion - r/Stocks Friday - Jun 15, 2018',
  'https://www.reddit.com/r/stocks/comments/8ra3vn/daily_discussion_rstocks_friday_jun_15_20

In [11]:
#organize data into dataframe
# Each row of subStats is [id, title, url, date, flair]; turn the rows into
# one list per column.
ids = [stat[0] for stat in subStats]
titles = [stat[1] for stat in subStats]
urls = [stat[2] for stat in subStats]
dates = [stat[3] for stat in subStats]
flairs = [stat[4] for stat in subStats]

# Column name -> column values, ready to hand to pd.DataFrame.
df = {
    'id': ids,
    'title': titles,
    'url': urls,
    'date': dates,
    'flair': flairs,
}

In [12]:
# Build the r/stocks Daily Discussion frame from the column lists gathered in
# the `df` dict (keys: id, title, url, date, flair).
df_stocks=pd.DataFrame(df)

### r/stocks ALL

In [13]:
# Query configuration: every submission in r/stocks (no title filter).
sub = 'stocks'
query = ''  # empty title search term

# Search window as epoch-second strings.
# The start is the earliest date that Daily Discussions became consistent
# (matched all stocks trading days).
after = '1528610400'   #6/10/2018
before = '1618380000'  #4/14/2021

# Other windows tried while working with smaller datasets:
#   after  = '1262329200'  #1/1/2010
#   after  = '1577862000'  #1/1/2020
#   after  = "1498867200"  #7/1/2017
#   before = '1616220000'  #3/20/2021
#   before = '1580540400'  #2/1/2020
#   before = "1594339200"  #7/10/2020

# Accumulators filled while paging through the results.
subStats = []
subCount = 0

In [14]:
# Fetch only the first page of submissions for the unfiltered query.
# The paging loop in a later cell keeps requesting pages, moving `after`
# forward, until the window up to `before` is exhausted.
data = getPushshiftData(query, after, before, sub)

https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1528610400&before=1618380000&subreddit=stocks


In [15]:
data

[{'author': 'thisaintausernamebro',
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_text': None,
  'author_flair_type': 'text',
  'can_mod_post': False,
  'contest_mode': False,
  'created_utc': 1528610983,
  'domain': 'self.stocks',
  'full_link': 'https://www.reddit.com/r/stocks/comments/8pz01z/kodak_stocks/',
  'gilded': 0,
  'id': '8pz01z',
  'is_crosspostable': True,
  'is_original_content': False,
  'is_reddit_media_domain': False,
  'is_self': True,
  'is_video': False,
  'link_flair_richtext': [],
  'link_flair_text_color': 'dark',
  'link_flair_type': 'text',
  'locked': False,
  'media_only': False,
  'no_follow': True,
  'num_comments': 15,
  'num_crossposts': 0,
  'over_18': False,
  'parent_whitelist_status': 'all_ads',
  'permalink': '/r/stocks/comments/8pz01z/kodak_stocks/',
  'pinned': False,
  'pwls': 6,
  'retrieved_on': 1528610983,
  'rte_mode': 'richtext',
  'score': 4,
  'selftext': "I bandwagoned onto Kodak stocks when the crypto h

In [16]:
# Page through the results until the API returns an empty list.
while data:
    for submission in data:
        collectSubData(submission)
        subCount += 1
    # Progress report: page size and creation time of the last submission.
    print(len(data))
    print(datetime.datetime.fromtimestamp(data[-1]['created_utc']))
    # The next request starts from the created date of the last submission.
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before, sub)

#Source: https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563

100
2018-06-12 11:42:11
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1528825331&before=1618380000&subreddit=stocks
100
2018-06-13 20:29:26
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1528943366&before=1618380000&subreddit=stocks
100
2018-06-15 07:12:14
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1529068334&before=1618380000&subreddit=stocks
100
2018-06-17 15:55:06
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1529272506&before=1618380000&subreddit=stocks
100
2018-06-19 08:41:16
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1529419276&before=1618380000&subreddit=stocks
100
2018-06-20 12:13:20
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1529518400&before=1618380000&subreddit=stocks
100
2018-06-22 08:04:04
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1529676244&before=1618380000&subreddit

100
2018-09-21 01:52:39
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1537516359&before=1618380000&subreddit=stocks
100
2018-09-23 07:38:52
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1537709932&before=1618380000&subreddit=stocks
100
2018-09-25 08:39:59
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1537886399&before=1618380000&subreddit=stocks
100
2018-09-26 12:07:16
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1537985236&before=1618380000&subreddit=stocks
100
2018-09-27 15:15:54
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1538082954&before=1618380000&subreddit=stocks
100
2018-09-29 07:47:35
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1538228855&before=1618380000&subreddit=stocks
100
2018-10-01 10:25:22
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1538411122&before=1618380000&subreddit

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
subStats

In [None]:
#organize data into dataframe
# Each row of subStats is [id, title, url, date, flair]; turn the rows into
# one list per column.
ids = [stat[0] for stat in subStats]
titles = [stat[1] for stat in subStats]
urls = [stat[2] for stat in subStats]
dates = [stat[3] for stat in subStats]
flairs = [stat[4] for stat in subStats]

# Column name -> column values, ready to hand to pd.DataFrame.
df = {
    'id': ids,
    'title': titles,
    'url': urls,
    'date': dates,
    'flair': flairs,
}

In [None]:
#Convert to Pandas dataframe
# Stored as df_stocks_all so the all-posts frame does not overwrite the Daily
# Discussion frame df_stocks; the comment-collection and export cells for the
# full r/stocks data read df_stocks_all.
df_stocks_all=pd.DataFrame(df)

### r/wallstreetbets

In [13]:
# Query configuration: "Daily Discussion" threads in r/wallstreetbets.
sub = 'wallstreetbets'
query = "Daily Discussion"  # sent as the title search term

# Search window as epoch-second strings.
# The start is the earliest date that Daily Discussions became consistent
# (matched all stocks trading days).
after = '1512889200'   #12/10/2017
before = '1618380000'  #4/14/2021

# Other windows tried while working with smaller datasets:
#   after  = '1262329200'  #1/1/2010
#   after  = '1577862000'  #1/1/2020
#   after  = "1498867200"  #7/1/2017
#   before = '1616220000'  #3/20/2021
#   before = '1580540400'  #2/1/2020
#   before = "1594339200"  #7/10/2020

# Accumulators filled while paging through the results.
subStats = []
subCount = 0



In [14]:
# Fetch only the first page of matching r/wallstreetbets submissions.
# The paging loop in a later cell keeps requesting pages, moving `after`
# forward, until the window up to `before` is exhausted.
data = getPushshiftData(query, after, before, sub)

https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1512889200&before=1618380000&subreddit=wallstreetbets


In [15]:
data

[{'author': 'AutoModerator',
  'author_flair_css_class': None,
  'author_flair_text': "I've come to terminate your stupid asses",
  'brand_safe': False,
  'can_mod_post': False,
  'contest_mode': False,
  'created_utc': 1512936319,
  'domain': 'self.wallstreetbets',
  'full_link': 'https://www.reddit.com/r/wallstreetbets/comments/7iwcux/daily_discussion_thread_december_10_2017/',
  'id': '7iwcux',
  'is_crosspostable': True,
  'is_reddit_media_domain': False,
  'is_self': True,
  'is_video': False,
  'locked': False,
  'num_comments': 30,
  'num_crossposts': 0,
  'over_18': False,
  'parent_whitelist_status': 'promo_specified',
  'permalink': '/r/wallstreetbets/comments/7iwcux/daily_discussion_thread_december_10_2017/',
  'pinned': False,
  'retrieved_on': 1512952490,
  'score': 1,
  'selftext': 'Trading discussion only. No memeing or shitposting.',
  'spoiler': False,
  'stickied': False,
  'subreddit': 'wallstreetbets',
  'subreddit_id': 't5_2th52',
  'subreddit_type': 'public',
  't

In [16]:
# Page through the results until the API returns an empty list.
while data:
    for submission in data:
        collectSubData(submission)
        subCount += 1
    # Progress report: page size and creation time of the last submission.
    print(len(data))
    print(datetime.datetime.fromtimestamp(data[-1]['created_utc']))
    # The next request starts from the created date of the last submission.
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before, sub)

#Source: https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563

100
2018-04-26 06:07:27
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1524744447&before=1618380000&subreddit=wallstreetbets
99
2018-09-04 05:10:41
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1536059441&before=1618380000&subreddit=wallstreetbets
100
2019-01-10 04:05:56
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1547118356&before=1618380000&subreddit=wallstreetbets
100
2019-05-29 05:05:46
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1559127946&before=1618380000&subreddit=wallstreetbets
100
2019-09-23 05:05:49
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1569236749&before=1618380000&subreddit=wallstreetbets
100
2020-01-29 04:05:40
https://api.pushshift.io/reddit/search/submission/?title=Daily Discussion&size=1000&after=1580295940&before=1618380000&subreddit=wallstre

In [17]:
data

[]

In [18]:
subStats

[['7iwcux',
  'Daily Discussion Thread - December 10, 2017',
  'https://www.reddit.com/r/wallstreetbets/comments/7iwcux/daily_discussion_thread_december_10_2017/',
  datetime.date(2017, 12, 10),
  'NaN'],
 ['7j4u75',
  'Daily Discussion Thread - December 11, 2017',
  'https://www.reddit.com/r/wallstreetbets/comments/7j4u75/daily_discussion_thread_december_11_2017/',
  datetime.date(2017, 12, 11),
  'NaN'],
 ['7jjwwy',
  'Daily discussion - Dec 13, 2017',
  'https://www.reddit.com/r/wallstreetbets/comments/7jjwwy/daily_discussion_dec_13_2017/',
  datetime.date(2017, 12, 13),
  'NaN'],
 ['7jr634',
  'Daily Discussion Thread - December 14, 2017',
  'https://www.reddit.com/r/wallstreetbets/comments/7jr634/daily_discussion_thread_december_14_2017/',
  datetime.date(2017, 12, 14),
  'NaN'],
 ['7kktzz',
  'Daily Discussion Thread - December 18, 2017',
  'https://www.reddit.com/r/wallstreetbets/comments/7kktzz/daily_discussion_thread_december_18_2017/',
  datetime.date(2017, 12, 18),
  'NaN'],

In [19]:
#organize data into dataframe
# Each row of subStats is [id, title, url, date, flair]; turn the rows into
# one list per column.
ids = [stat[0] for stat in subStats]
titles = [stat[1] for stat in subStats]
urls = [stat[2] for stat in subStats]
dates = [stat[3] for stat in subStats]
flairs = [stat[4] for stat in subStats]

# Column name -> column values, ready to hand to pd.DataFrame.
df = {
    'id': ids,
    'title': titles,
    'url': urls,
    'date': dates,
    'flair': flairs,
}


In [20]:
df

{'id': ['7iwcux',
  '7j4u75',
  '7jjwwy',
  '7jr634',
  '7kktzz',
  '7ksv7x',
  '7l0zzr',
  '7l91u9',
  '7lh1wl',
  '7m13b4',
  '7m7fzl',
  '7menz4',
  '7mm8cc',
  '7mtyqi',
  '7nev10',
  '7nm292',
  '7nu8sj',
  '7o2kg6',
  '7oatdl',
  '7oy4eb',
  '7p6oki',
  '7pf6cm',
  '7pnnli',
  '7pw6fg',
  '7qjasf',
  '7qrw4i',
  '7r0jl2',
  '7r99l7',
  '7rhw7t',
  '7s5cax',
  '7sdy8o',
  '7smn81',
  '7svn7a',
  '7t43ou',
  '7trvhx',
  '7u0kcw',
  '7u9g26',
  '7ubuvm',
  '7uiawd',
  '7uqvjt',
  '7vecr1',
  '7vmweu',
  '7vvnod',
  '7w4c5z',
  '7wd0fy',
  '7x0bc3',
  '7x8y1b',
  '7xhjtc',
  '7xpxiq',
  '7xyb5q',
  '7yft0v',
  '7ym7z5',
  '7yvejt',
  '7z52oy',
  '7zef7c',
  '7znl4m',
  '80ckxk',
  '80lxr9',
  '80vjd2',
  '814sc9',
  '81dr8h',
  '825hoy',
  '82ekl0',
  '82nsa8',
  '82wzxk',
  '8367pf',
  '843mgr',
  '84cv0g',
  '84lxvg',
  '84uxbz',
  '85iyz5',
  '85s66u',
  '861jva',
  '86azav',
  '86k42c',
  '878lox',
  '87hva7',
  '87rd4d',
  '880q0n',
  '889utl',
  '88ytky',
  '89d1al',
  '89ovjd'

In [21]:
# Build the r/wallstreetbets Daily Discussion frame from the column lists
# gathered in the `df` dict (keys: id, title, url, date, flair).
df_wsb=pd.DataFrame(df)
# Disabled flair filter, kept for reference. NOTE(review): `df` is a plain dict
# here, so this line would have to operate on df_wsb; early threads also carry
# the placeholder flair "NaN" and would be dropped by it.
#df=df[df['flair']=='Daily Discussion']

### Pull Comments Data

In [24]:
#connect to reddit api
# Credentials are read from the environment so they are never stored in the
# notebook or its version history. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError here rather than failing later on the first API call.
# NOTE(review): any client id/secret that was previously committed in this
# notebook should be revoked and regenerated.
reddit_api = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                         client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                         user_agent='sentiment_analysis')


In [25]:
# Keep one submission per calendar date; drop_duplicates keeps the first row
# it encounters for each date.
# NOTE(review): the first row for a date is not necessarily the official daily
# thread (any title matching the query is collected) — confirm this is intended.
df_stocks = df_stocks.drop_duplicates(subset=['date'])

In [26]:
df_stocks

Unnamed: 0,id,title,url,date,flair
0,8q9egq,What happened to the daily stock discussion th...,https://www.reddit.com/r/stocks/comments/8q9eg...,2018-06-11,
1,8qio6f,"Daily discussion - r/Stocks Tuesday - Jun 12, ...",https://www.reddit.com/r/stocks/comments/8qio6...,2018-06-12,
2,8qrkat,Daily discussion - r/Stocks Wednesday - Jun 13...,https://www.reddit.com/r/stocks/comments/8qrka...,2018-06-13,
3,8r0yzs,"Daily discussion - r/Stocks Thursday - Jun 14,...",https://www.reddit.com/r/stocks/comments/8r0yz...,2018-06-14,
4,8ra3vn,"Daily discussion - r/Stocks Friday - Jun 15, 2018",https://www.reddit.com/r/stocks/comments/8ra3v...,2018-06-15,
...,...,...,...,...,...
716,mkgpbr,"r/Stocks Daily Discussion Monday - Apr 05, 2021",https://www.reddit.com/r/stocks/comments/mkgpb...,2021-04-05,
717,ml7rbq,r/Stocks Daily Discussion &amp; Technicals Tue...,https://www.reddit.com/r/stocks/comments/ml7rb...,2021-04-06,
718,mlycbf,"r/Stocks Daily Discussion Wednesday - Apr 07, ...",https://www.reddit.com/r/stocks/comments/mlycb...,2021-04-07,
719,mmo2go,r/Stocks Daily Discussion &amp; Options Tradin...,https://www.reddit.com/r/stocks/comments/mmo2g...,2021-04-08,


In [42]:
#collect stocks comments using praw
# One inner list of comment bodies per thread, in the row order of df_stocks.
comments_by_day_stocks = []
for url in df_stocks['url'].tolist():
    submission = reddit_api.submission(url=url)
    # limit=0 removes the "load more comments" placeholders without
    # fetching what they stand for.
    submission.comments.replace_more(limit=0)
    # Iterating the forest directly visits the top-level comments.
    comments = [comment.body for comment in submission.comments]
    comments_by_day_stocks.append(comments)

In [43]:
comments_by_day_stocks

[['Well that was u/bigbear0083 post.\n\nI was thinking of dropping a daily discussion thread every morning at like 8am, auto scheduled.\n\nThe mods discussed it awhile ago, we went with a quarterly thread.  But if users here want a serious stock focused daily discussion instead of the ones they get on other subs, then yeah let me know.\n\nEDIT OK [first one here.](https://www.reddit.com/r/stocks/comments/8qio6f/daily_discussion_rstocks_tuesday_jun_12_2018/)',
  "Maybe not exactly what you're looking for, but wallstreetbets does a daily discussion thread that has a good amount of content, and if you don't mind some jokes here and there it is a good way to gauge market sentiments on a day-to-day basis. I think this sub is too long-term focused for a daily thread to have quality content every day of the week.",
  "oh man! i'm very late to see this thread...didn't even know about it if not for the mentions by /u/lykosen11! thank you sir! :D \n\nthx so much for the kudos and shout outs in h

In [46]:
# Persist the nested list of comment bodies (one inner list per thread) with
# pickle, which round-trips Python lists without a custom text format.
# Only load this file back from a trusted location: unpickling can run code.
with open('listfile_stocks.data', 'wb') as filehandle:
    # pickle writes bytes, hence the binary ('wb') mode above
    pickle.dump(comments_by_day_stocks, filehandle)

In [47]:
# Keep one submission per calendar date; drop_duplicates keeps the first row
# it encounters for each date. The surviving rows keep their original index
# labels, so the index is no longer contiguous afterwards.
df_wsb = df_wsb.drop_duplicates(subset=['date'])

In [48]:
#collect wsb comments using praw
# One entry per thread, in the row order of df_wsb: a list of top-level
# comment bodies, or None when the thread could not be fetched.
comments_by_day_wsb=[]
for url in df_wsb['url'].tolist():
    try:
        submission = reddit_api.submission(url=url)
        # limit=0 removes the "load more comments" placeholders without
        # fetching what they stand for.
        submission.comments.replace_more(limit=0)
        comments = [comment.body for comment in submission.comments]
    except Exception as exc:
        # Best effort: keep going, but report which thread failed and why.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel
        # interrupt still stop this long-running loop.
        print('Could not fetch comments for', url, '-', repr(exc))
        comments = None
    comments_by_day_wsb.append(comments)



In [49]:
comments_by_day_wsb[1:10]

[['Is the clock on this bot fucked up, or what?',
  'JD WAS GREEN MOTHAFUCKAS!!!',
  "BA raises dividend 20% to $1.71 in after hours news.\n\nOpening a 285/290 bull call spread on this morning's dip will be looking good tomorrow.",
  'Why the hell did this sub go private last night?',
  '[deleted]',
  'yeah guys for real, this is serious conversation time',
  'Overstock portfolio up 101%'],
 ["Why has the day come where I'm baggholding NVDA and AMD is green",
  'Did you guys hear how robinhood has free options trading?! Oh boy, this sub is about to be in A LOT of trouble.',
  "What's the deal with the days where MU is up like 3% at the beginning of the day and then slowly falls?",
  "JD i knew you'd come through for me",
  '6 days until $MU ER and RH announces free options trading, hope my family is cool w/ margin calls for Christmas. \n\nBut in all seriousness, I’m trying to figure out how to play this and it’s giving me an aneurism ',
  'Anyone else bagholding AMD from 13ish and want

In [52]:
# Persist the per-thread comment lists with pickle. Entries are lists of
# comment bodies, or None for threads whose fetch failed.
# Only load this file back from a trusted location: unpickling can run code.
with open('listfile_wsb.data', 'wb') as filehandle:
    # pickle writes bytes, hence the binary ('wb') mode above
    pickle.dump(comments_by_day_wsb, filehandle)

In [53]:
df_wsb

Unnamed: 0,id,title,url,date,flair
0,7iwcux,"Daily Discussion Thread - December 10, 2017",https://www.reddit.com/r/wallstreetbets/commen...,2017-12-10,
1,7j4u75,"Daily Discussion Thread - December 11, 2017",https://www.reddit.com/r/wallstreetbets/commen...,2017-12-11,
2,7jjwwy,"Daily discussion - Dec 13, 2017",https://www.reddit.com/r/wallstreetbets/commen...,2017-12-13,
3,7jr634,"Daily Discussion Thread - December 14, 2017",https://www.reddit.com/r/wallstreetbets/commen...,2017-12-14,
4,7kktzz,"Daily Discussion Thread - December 18, 2017",https://www.reddit.com/r/wallstreetbets/commen...,2017-12-18,
...,...,...,...,...,...
1082,mkh463,"Daily Discussion Thread for April 05, 2021",https://www.reddit.com/r/wallstreetbets/commen...,2021-04-05,Daily Discussion
1084,ml86d3,"Daily Discussion Thread for April 06, 2021",https://www.reddit.com/r/wallstreetbets/commen...,2021-04-06,Daily Discussion
1086,mlyq1y,"Unpinned Daily Discussion Thread for April 07,...",https://www.reddit.com/r/wallstreetbets/commen...,2021-04-07,Discussion
1088,mmog61,"Unpinned Daily Discussion Thread for April 08,...",https://www.reddit.com/r/wallstreetbets/commen...,2021-04-08,Discussion


In [54]:
# Save the de-duplicated r/wallstreetbets thread list. With the default
# settings the index is written too, as the first (unnamed) CSV column.
df_wsb.to_csv('daily_disc_raw_wsb.csv')

In [55]:
df_stocks

Unnamed: 0,id,title,url,date,flair
0,8q9egq,What happened to the daily stock discussion th...,https://www.reddit.com/r/stocks/comments/8q9eg...,2018-06-11,
1,8qio6f,"Daily discussion - r/Stocks Tuesday - Jun 12, ...",https://www.reddit.com/r/stocks/comments/8qio6...,2018-06-12,
2,8qrkat,Daily discussion - r/Stocks Wednesday - Jun 13...,https://www.reddit.com/r/stocks/comments/8qrka...,2018-06-13,
3,8r0yzs,"Daily discussion - r/Stocks Thursday - Jun 14,...",https://www.reddit.com/r/stocks/comments/8r0yz...,2018-06-14,
4,8ra3vn,"Daily discussion - r/Stocks Friday - Jun 15, 2018",https://www.reddit.com/r/stocks/comments/8ra3v...,2018-06-15,
...,...,...,...,...,...
716,mkgpbr,"r/Stocks Daily Discussion Monday - Apr 05, 2021",https://www.reddit.com/r/stocks/comments/mkgpb...,2021-04-05,
717,ml7rbq,r/Stocks Daily Discussion &amp; Technicals Tue...,https://www.reddit.com/r/stocks/comments/ml7rb...,2021-04-06,
718,mlycbf,"r/Stocks Daily Discussion Wednesday - Apr 07, ...",https://www.reddit.com/r/stocks/comments/mlycb...,2021-04-07,
719,mmo2go,r/Stocks Daily Discussion &amp; Options Tradin...,https://www.reddit.com/r/stocks/comments/mmo2g...,2021-04-08,


In [56]:
# Save the de-duplicated r/stocks thread list. With the default settings the
# index is written too, as the first (unnamed) CSV column.
df_stocks.to_csv('daily_disc_raw_stocks.csv')

In [None]:
#collect comments using praw
# De-duplicate by date before fetching. The frame is only de-duplicated in a
# later cell, so iterating the raw frame here would produce a comment list
# that no longer lines up row-for-row with the frame that gets saved.
# drop_duplicates is deterministic (keeps the first row per date), so the rows
# visited here are the rows that later de-duplication keeps.
comments_by_day_stocks_all=[]
for url in df_stocks_all.drop_duplicates(subset=['date'])['url'].tolist():
    submission = reddit_api.submission(url=url)
    # limit=0 removes the "load more comments" placeholders without
    # fetching what they stand for.
    submission.comments.replace_more(limit=0)
    comments = [comment.body for comment in submission.comments]
    comments_by_day_stocks_all.append(comments)

In [None]:
# Keep one submission per calendar date; drop_duplicates keeps the first row
# it encounters for each date.
# NOTE(review): with the unfiltered query this retains a single arbitrary post
# per day — confirm that is the intended sampling.
df_stocks_all = df_stocks_all.drop_duplicates(subset=['date'])

In [None]:
df_stocks_all

In [None]:
# Save the de-duplicated all-posts r/stocks list. With the default settings
# the index is written too, as the first (unnamed) CSV column.
df_stocks_all.to_csv('daily_disc_raw_stocks_all.csv')

In [None]:
# Persist the nested list of comment bodies (one inner list per thread) with
# pickle, which round-trips Python lists without a custom text format.
# Only load this file back from a trusted location: unpickling can run code.
with open('listfile_stocks_all.data', 'wb') as filehandle:
    # pickle writes bytes, hence the binary ('wb') mode above
    pickle.dump(comments_by_day_stocks_all, filehandle)

Sources: 

https://www.reddit.com/dev/api/

https://praw.readthedocs.io/en/latest/index.html

https://medium.com/analytics-vidhya/sentiment-analysis-for-trading-with-reddit-text-data-73729c931d01

https://stackabuse.com/reading-and-writing-lists-to-a-file-in-python/

https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe