In [38]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import requests
import bs4
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from utils import twitter_auth, symbols
import json
import numpy as np
import twitter
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prereqs
- Make sure to pip install python-twitter if using 'twitter' module

## Twitter scraping

- [Searching tweets](https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets)
    - [filters and such](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/guides/standard-operators)
- [Consider using tweepy](https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25)
- [python-twitter module (currently being used)](https://python-twitter.readthedocs.io/en/latest/_modules/twitter/api.html?highlight=getsearch)

# Sentiment Analysis - Twitter Pipeline
- Note before I forget: also want to pull from r/WallStreetBets (seriously)
- Map all tickers to their names
- Decide what parts of json to keep
- Temporal organization
    - Need to know how often to space queries
- Special chars? e.g. \xa0
- Built in sentiment with API: ':(' or ':)' search terms

- Clustering users based on their background and interests

# Filtering "noise"
- Create a list of important users and their relationship to whatever company
    - CEOs and news outlets
- Weighting:
   - for users (by follower count, whether they are in above list, avg retweet, sentiment score?)
   - for words/terms (emojis probably low weighting)

# Some case studies

- How to know some tweets are irrelevant to price?
    - e.g. @elonmusk: 'Thanks to everyone who worked so hard to make Tesla successful. My heart goes out to you.' Has positive sentiment but is irrelevant

## Other resources
- Tweet scraping libraries: https://towardsdatascience.com/how-to-scrape-tweets-from-twitter-59287e20f0f1
    - https://medium.com/@jayeshsrivastava470/how-to-extract-tweets-from-twitter-in-python-47dd07f4e8e7
- Sentiment: 
    - https://monkeylearn.com/blog/sentiment-analysis-of-twitter/#:~:text=Try%20MonkeyLearn-,Twitter%20Sentiment%20Analysis%20with%20Machine%20Learning,are%20talking%20about%20their%20brand.
    -  https://pythonprogramming.net/sentiment-analysis-module-nltk-tutorial/
    - Applying the above [here](https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386)

## initializations + functions 

In [64]:
# Legacy OAuth1 + requests setup. tweet_query_old() reads the globals `auth_` and
# `url_rest_`, so this line must be uncommented before that function is called.
#auth_, url_rest_ = twitter_auth.connect_old()
# Client object used by tweet_query(), which calls api.GetSearch(...).
api = twitter_auth.connect()

def tweet_query_old(query, filters = ['retweets', 'replies'], count = 100, lang = 'en', result_type = 'popular'):
    """Search via `requests`, using the module-level `url_rest_` and `auth_`.

    Every entry of `filters` is appended to the search terms as a
    ``-filter:<entry>`` token; pass ``filters=None`` to send `query` as-is.
    `count`, `lang` and `result_type` are forwarded as request parameters.

    Returns the decoded JSON body of the response.
    """
    search_terms = query
    if filters is not None:
        # One leading space per token gives the same string as building with
        # trailing spaces and trimming the last character.
        search_terms += ''.join(' -filter:%s' % name for name in filters)

    response = requests.get(
        url_rest_,
        params={'q': search_terms, 'count': count, 'lang': lang, 'result_type': result_type},
        auth=auth_,
    )
    return response.json()

#usage:
#tweets = tweet_query_old('TSLA')
#messages = [BeautifulSoup(tweet['text'], 'html5lib').get_text() for tweet in tweets['statuses']]

# Fields copied from each tweet, and from its nested 'user' object, into the
# dictionaries returned by tweet_query(). Defined once instead of per tweet.
_TWEET_FIELDS = ('id', 'full_text', 'created_at', 'favorite_count', 'retweet_count')
_USER_FIELDS = ('id', 'name', 'description', 'followers_count', 'statuses_count', 'verified', 'created_at')


def _build_search_term(query, filters):
    """Append one ' -filter:<name>' token to `query` per entry of `filters` (None = no change)."""
    if filters is None:
        return query
    return query + ''.join(' -filter:%s' % name for name in filters)


def _extract_tweet_fields(raw):
    """Project one decoded tweet dict onto the fixed schema; absent keys become None."""
    # `or {}` also covers an explicit null 'user', which would otherwise fail on indexing.
    user = raw.get('user') or {}
    record = {key: raw.get(key) for key in _TWEET_FIELDS}
    record['user'] = {key: user.get(key) for key in _USER_FIELDS}
    return record


def tweet_query(query, filters = ['retweets', 'replies'], count = 100, lang = 'en', result_type = 'popular', max_id = None):
    """Search with the module-level `api` client and return trimmed tweet dicts.

    Parameters
    ----------
    query : str
        The search terms.
    filters : list of str or None
        Each entry is appended to the query as a ``-filter:<entry>`` token.
        ``None`` sends `query` unchanged. The default list is never mutated.
    count, lang, result_type, max_id
        Forwarded unchanged to ``api.GetSearch``.

    Returns
    -------
    list of dict
        One dict per result with the keys 'id', 'full_text', 'created_at',
        'favorite_count', 'retweet_count' and a nested 'user' dict ('id',
        'name', 'description', 'followers_count', 'statuses_count',
        'verified', 'created_at'). Any key missing from the decoded result is
        set to None.
    """
    term = _build_search_term(query, filters)
    results = api.GetSearch(term=term, result_type=result_type, max_id=max_id, count=count, lang=lang)
    # Each result is decoded from its string form, as before. Some keys are absent
    # when empty (e.g. no profile description) -- presumably python-twitter drops
    # them, so a None here may stand for an empty/zero value; TODO confirm.
    return [_extract_tweet_fields(json.loads(str(twt))) for twt in results]

In [72]:
# Single example query: search 'tesla' with result_type='recent' and no -filter: tokens appended.
tq = tweet_query('tesla', filters = None, result_type = 'recent')

## Mass querying
For now, just querying 7 day data for each ticker
- To go back in time, find minimum status id, then use that as the next max_id

Rate limit: 450 requests per 15 minutes = 30 per minute, so maybe issue 30 queries (this takes ~0.535 × 30 ≈ 16 seconds) and then call `time.sleep(45)`

Ticker data of format
`{Ticker: {name: '', management: [], sector: ''}}`

result_type: recent, popular, or mixed? <br>
When querying for name in a tweet just use first two words

In [96]:
#Getting tweets up until 7 day limit
# Page backwards through the search results: every page is bounded by the
# smallest status id seen so far, in batches sized to stay under the rate limit.
QUERIES_PER_BATCH = 30    # 450 requests / 15 min = 30 requests per minute
BATCH_PAUSE_SECONDS = 45  # pause after each full batch

query = 'tesla'
max_id = None
megalist = []  # one list of tweet dicts per page
go = True
while go:
    try:
        for i in range(QUERIES_PER_BATCH):
            q_curr = tweet_query(query, filters = None, result_type = 'popular', max_id = max_id)
            if not q_curr:
                # Nothing older is available; min() on an empty page would raise ValueError.
                go = False
                break
            megalist.append(q_curr)
            # max_id is inclusive, so subtract 1 to avoid fetching the oldest
            # tweet of this page again (and looping on it forever).
            max_id = min(q['id'] for q in q_curr) - 1
        if go:
            # Sleep once per completed batch rather than after every query.
            time.sleep(BATCH_PAUSE_SECONDS)
    except twitter.TwitterError: # raised by the client, e.g. when the rate limit is hit
        go = False
# Where to resume from: pass this as max_id to continue with older tweets.
last_max_id = max_id
        

In [None]:
#doing the same but for every ticker
tickers = symbols.gen_symbol_dict()
for t in tickers:
    # Search phrase = first two space-separated words of the company name.
    _name = tickers[t]['name'].split(' ', 2)[:2]
    name = ' '.join(_name)
    # TODO: run the search for this ticker, e.g.
    #tweet_query(t OR  name)

In [8]:
# NOTE(review): `tweets` is not assigned in any cell visible here; it looks like the
# result of the commented-out `tweet_query_old('TSLA')` usage -- TODO confirm.
tweets

{'statuses': [{'created_at': 'Sun Dec 20 16:24:15 +0000 2020',
   'id': 1340694515074691077,
   'id_str': '1340694515074691077',
   'text': 'When @elonmusk puts the $TSLA balance sheet into #Bitcoin, we\'ll have to change the BTC rallying cry from "to the moon!" to "to Mars!"',
   'truncated': False,
   'entities': {'hashtags': [{'text': 'Bitcoin', 'indices': [49, 57]}],
    'symbols': [{'text': 'TSLA', 'indices': [24, 29]}],
    'user_mentions': [{'screen_name': 'elonmusk',
      'name': 'Elon Musk',
      'id': 44196397,
      'id_str': '44196397',
      'indices': [5, 14]}],
    'urls': []},
   'metadata': {'result_type': 'popular', 'iso_language_code': 'en'},
   'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
   'in_reply_to_status_id': None,
   'in_reply_to_status_id_str': None,
   'in_reply_to_user_id': None,
   'in_reply_to_user_id_str': None,
   'in_reply_to_screen_name': None,
   'user': {'id': 24222556,
    'id_str': '24222556',
    'name'