In [32]:
import requests
import urllib.request
import time, json, os, traceback
from json import JSONDecodeError
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
from collections import deque

class StockTwitsAPIScraper:
    """Scrape a StockTwits symbol stream backward in time, one page per file.

    Each request asks the symbol stream for messages at or below ``maxId``;
    the records built from that page are written to
    ``stocks/<symbol>/<last post_id>.json`` and ``maxId`` is moved to the last
    record of the page. Scraping stops when a message older than the target
    date is seen, when a page comes back empty, or when the cursor stops
    advancing.

    NOTE(review): if the API treats ``max`` as inclusive, the last tweet of a
    page is repeated as the first tweet of the next page -- confirm against
    the StockTwits API documentation and de-duplicate downstream if needed.
    """

    # Seconds to wait for an HTTP response before requests raises a timeout.
    REQUEST_TIMEOUT = 30
    # Seconds to pause before retrying after a "Bad Gateway" reply.
    BAD_GATEWAY_WAIT = 60

    def __init__(self, symbol, date, maxId):
        """
        symbol: ticker used both in the API URL and in the output directory.
        date:   datetime; messages created before it end the scrape.
        maxId:  ID of the newest message to start from.
        """
        self.symbol = symbol
        self.link = "https://api.stocktwits.com/api/2/streams/symbol/{}.json?".format(symbol)
        self.targetDate = date
        self.tweets = []
        # Timestamps of recent requests (attribute name kept as-is, typo
        # included, so existing references keep working).
        self.reqeustQueue = deque()
        self.maxId = maxId
        # Default to the documented cap of 200 requests per hour so that
        # requestManager() works even if setLimits() is never called.
        self.setLimits(200, 3600)
        self.initDir()

    def setLimits(self, size, duration):
        """Allow at most `size` requests per `duration` seconds."""
        self.size = size
        self.duration = duration
        # Ceiling division: spread the requests evenly over the window.
        self.requestInterval = -(-duration // size)

    def initDir(self):
        """Create stocks/<symbol>/ if it does not exist yet."""
        os.makedirs("stocks/{}".format(self.symbol), exist_ok=True)

    def writeJson(self):
        """Dump the buffered records and remember the last ID as the new cursor.

        Does nothing when the buffer is empty. The file is named after the
        last record's post_id so a crashed run can be resumed from it.
        """
        if not self.tweets:
            return
        self.maxId = self.tweets[-1]["post_id"]
        fileName = "stocks/{}/{}.json".format(self.symbol, self.maxId)
        with open(fileName, "w") as f:
            json.dump(self.tweets, f)

    def getCurrentUrl(self):
        """Return the stream URL for the page at the current cursor."""
        return self.link + "max={}".format(self.maxId)

    def requestManager(self):
        """Block until sending one more request stays within the rate limit."""
        while self.reqeustQueue and len(self.reqeustQueue) >= self.size:
            oldest = self.reqeustQueue.popleft()
            windowEnd = oldest + timedelta(seconds=self.duration)
            remaining = (windowEnd - datetime.now()).total_seconds()
            if remaining > 0:
                waitTime = remaining + 1  # one extra second of safety margin
                print("Reach request limit, wait for {} seconds.".format(waitTime))
                sleep(waitTime)

    def _fetch(self, url):
        """Perform the HTTP GET; isolated so it can be stubbed or overridden."""
        return requests.get(url, timeout=self.REQUEST_TIMEOUT)

    @staticmethod
    def _toRecord(m):
        """Flatten one API message dict into the record stored on disk."""
        user = m["user"]
        # Some messages carry no sentiment (or no "entities" at all).
        sentiment = (m.get("entities") or {}).get("sentiment")
        return {
            "post_id": m["id"],
            "text": m["body"],
            "post_time": m["created_at"],
            "sentiment": sentiment["basic"] if sentiment else "N/A",
            "user_id": user["id"],
            "user_name": user["username"],
            "user_identity": user["identity"],
            "num_followers": user["followers"],
            "num_following": user["following"],
            "official_acct": user["official"],
            "join_date": user["join_date"],
            "num_likes": user["like_count"],
            "num_tweets": user["ideas"],
        }

    def getMessages(self, url):
        """Fetch one page and append its records to self.tweets.

        Returns True when scraping should continue and False when it should
        stop (a message older than the target date was seen, or the page held
        no messages). Raises Exception when the body is not JSON and is not a
        "Bad Gateway" page.
        """
        self.requestManager()

        response = self._fetch(url)
        self.reqeustQueue.append(datetime.now())
        try:
            data = json.loads(response.text)
        except JSONDecodeError:
            if "Bad Gateway" in response.text:
                print("Just a Bad Gateway, wait for 1 minute.")
                sleep(self.BAD_GATEWAY_WAIT)
                return True
            print(len(self.reqeustQueue))
            print(self.reqeustQueue[0], datetime.now())
            print(url)
            print(response.text)
            print(traceback.format_exc())
            raise Exception("Something worong with the response.")

        status = None
        if isinstance(data, dict):
            status = (data.get("response") or {}).get("status")
        if status != 200:
            # Best effort: show what came back and let the caller retry.
            print(response.text)
            return True

        messages = data.get("messages") or []
        if not messages:
            # Nothing older is left; retrying would request this page forever.
            print("No more messages returned, stop scraping.")
            return False

        for m in messages:
            createdAt = datetime.strptime(m["created_at"], "%Y-%m-%dT%H:%M:%SZ")
            if createdAt < self.targetDate:
                return False
            self.tweets.append(self._toRecord(m))
        return True

    def getTweetsAndWriteToFile(self):
        """Scrape one page, persist it, and report whether to keep going."""
        previousMaxId = self.maxId
        keepGoing = self.getMessages(self.getCurrentUrl())
        gotTweets = bool(self.tweets)

        # Persist before deciding to stop, so the records collected from the
        # final page (before the target date was hit) are not lost.
        self.writeJson()
        print("Scrap {} tweets starting from {}.".format(len(self.tweets), self.maxId))
        self.tweets.clear()

        if not keepGoing:
            return False
        if gotTweets and self.maxId == previousMaxId:
            # The cursor did not move, so the next request would be identical.
            print("Cursor did not advance, stop scraping.")
            return False
        sleep(self.requestInterval)
        return True

    def scrapTweets(self):
        """Scrape pages until a stop condition; print the traceback on error."""
        try:
            while self.getTweetsAndWriteToFile():
                pass
        except Exception:
            print(traceback.format_exc())

# Interactive driver: collect the run parameters, then start scraping.
symbol = input("Enter stock symbol: ")
print(
    "This scraper scraps tweets backward.\n"
    "The ID you put in belongs the most recent tweet you're goint go scrap.\n"
    "And the scraper will keep going backward to scrap older tweets."
)
# ID of the newest tweet to start from.
maxId = input("Enter the starting tweet ID: ")
# Earliest date to scrape back to, typed as mmddyyyy.
targetDate = input("Enter the earlest date (mmddyyyy): ")
print("You can only send 200 requests to StockTwits in an hour.")
requestLimit = input("Enter the limit of number of requests within an hour: ")

# The raw strings are parsed here; the rate-limit window is one hour (3600 s).
scraper = StockTwitsAPIScraper(
    symbol,
    date=datetime.strptime(targetDate, "%m%d%Y"),
    maxId=int(maxId),
)
scraper.setLimits(size=int(requestLimit), duration=3600)
scraper.scrapTweets()

Enter stock symbol: JPM
This scraper scraps tweets backward.
The ID you put in belongs the most recent tweet you're goint go scrap.
And the scraper will keep going backward to scrap older tweets.
Enter the starting tweet ID: 457437399
Enter the earlest date (mmddyyyy): 05052022
You can only send 200 requests to StockTwits in an hour.
Enter the limit of number of requests within an hour: 200
Scrap 30 tweets starting from 457190287.
Scrap 30 tweets starting from 457052349.
Scrap 30 tweets starting from 456945481.


In [28]:
# Sample message dict using the field names the scraper reads
# (id, body, created_at, user). It has no "entities" key, and the
# 'false' / 'null' values are plain strings, not Python False / None.
m = {
    "id": 456666183,
    "body": "$DIS $WMT $AAL $JPM $MSFT Disney, Walmart, big companies silent on high court’s abortion draft\n\nhttps://news.alertsandnews.com/disney-walmart-big-companies-silent-on-high-courts-abortion-draft/",
    "created_at": "2022-05-04T03:08:42Z",
    "user": {
        "id": 689912,
        "username": "AlertsAndNews",
        "name": "AlertsAndNews",
        "avatar_url": "https://avatars.stocktwits.com/production/689912/thumb-1646712583.png",
        "avatar_url_ssl": "https://avatars.stocktwits.com/production/689912/thumb-1646712583.png",
        "join_date": "2016-02-17",
        "official": "false",
        "identity": "User",
        "classification": [],
        "followers": 17227,
        "following": 137,
        "ideas": 22815,
        "watchlist_stocks_count": 1,
        "like_count": 2299,
        "plus_tier": "",
        "premium_room": "",
        "trade_app": "false",
        "trade_status": "null",
    },
}

In [33]:
import pandas as pd

# Read one JSON batch file from the JPM output folder into a DataFrame,
# then leave the frame as the cell's last expression so it is displayed.
jpm_df = pd.read_json(path_or_buf='stocks/JPM/457190287.json')
jpm_df

Unnamed: 0,post_id,text,post_time,sentiment,user_id,user_name,user_identity,num_followers,num_following,official_acct,join_date,num_likes,num_tweets
0,457426082,Liquidation time 🩸🩸🩸🩸🚨🚨🚨🚨 $gs $bac $jpm huge s...,2022-05-06 18:12:22+00:00,Bearish,6573190,WSKING911,User,2,0,False,2022-04-25,0,343
1,457425093,Plummet mode activated $gs $bac $jpm huge sell...,2022-05-06 18:09:28+00:00,Bearish,6573190,WSKING911,User,2,0,False,2022-04-25,0,343
2,457422558,"$JPM $25,000 a day keeps the 9 to 5 away; For ...",2022-05-06 18:02:04+00:00,Bullish,6254587,StuartSparks,User,0,0,False,2022-01-14,0,15
3,457421905,$gs $bac $jpm huge sell off signal detected by...,2022-05-06 18:00:02+00:00,Bearish,6573190,WSKING911,User,2,0,False,2022-04-25,0,343
4,457413127,How does this affect your portfolio? $JPM in U...,2022-05-06 17:29:39+00:00,,967528,tickeron,Official,10081,108,True,2017-02-23,0,896812
5,457393277,$JPM Form 424B2 (prospectus [rule 424(b)(2)]) ...,2022-05-06 16:24:10+00:00,,2762379,Newsfilter,User,13272,17,False,2019-09-01,540,1643788
6,457389006,@SweepCast Unusual Options Observed : $JPM wit...,2022-05-06 16:10:50+00:00,,3434084,SweepCast,User,13323,28,False,2020-05-10,314,485145
7,457384354,$gs $bac $jpm gonna tank big today 🚨🚨🚨🚨🩸🩸🩸,2022-05-06 15:56:45+00:00,Bearish,6573190,WSKING911,User,2,0,False,2022-04-25,0,343
8,457382287,$gs $bac $jpm huge sell off signal detected by...,2022-05-06 15:50:16+00:00,Bearish,6573190,WSKING911,User,2,0,False,2022-04-25,0,327
9,457374276,$JPM Form 424B2 (prospectus [rule 424(b)(2)]) ...,2022-05-06 15:26:16+00:00,,2762379,Newsfilter,User,13272,17,False,2019-09-01,540,1643655
