In [52]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import requests
import bs4
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from utils import twitter_auth
import json
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prereqs
- Run `pip install python-twitter` before using the `twitter` module

## Twitter scraping

- [Searching tweets](https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets)
    - [filters and such](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/guides/standard-operators)
- [Consider using tweepy](https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25)

# Sentiment Analysis - Twitter Pipeline
- Map all tickers to their names
- Decide what parts of json to keep
- Temporal organization
    - Need to know how often to space queries
- Special chars? e.g. \xa0
- Built in sentiment with API: ':(' or ':)' search terms
- Weighting:
   - for users (by follower count, avg retweet, sentiment score?)
   - for words/terms (emojis probably low weighting)
- Clustering users based on their background and interests

# Some case studies

- How do we tell when a tweet is irrelevant to price?
    - e.g. @elonmusk: 'Thanks to everyone who worked so hard to make Tesla successful. My heart goes out to you.' Has positive sentiment but is irrelevant

## Other resources
- Tweet scraping libraries: https://towardsdatascience.com/how-to-scrape-tweets-from-twitter-59287e20f0f1
    - https://medium.com/@jayeshsrivastava470/how-to-extract-tweets-from-twitter-in-python-47dd07f4e8e7
- Sentiment: 
    - https://monkeylearn.com/blog/sentiment-analysis-of-twitter/#:~:text=Try%20MonkeyLearn-,Twitter%20Sentiment%20Analysis%20with%20Machine%20Learning,are%20talking%20about%20their%20brand.
    -  https://pythonprogramming.net/sentiment-analysis-module-nltk-tutorial/
    - Applying the above [here](https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386)

## initializations + functions 

In [123]:
# Legacy OAuth1 + requests setup. tweet_query_old reads the globals auth_ and
# url_rest_, so this line has to be uncommented before that function is called.
#auth_, url_rest_ = twitter_auth.connect_old()
# Client object used by tweet_query and the ad-hoc cells via api.GetSearch(...).
api = twitter_auth.connect()

def tweet_query_old(query, filters = ['retweets', 'replies'], count = 100, lang = 'en', result_type = 'popular', url = None, auth = None, timeout = 10):
    """Search tweets with a plain ``requests.get`` call (legacy OAuth1 path).

    Parameters
    ----------
    query : str
        The search terms.
    filters : list of str or None
        Each entry is appended to the query as ``-filter:<entry>``.
        ``None`` or an empty list leaves the query untouched.
    count, lang, result_type
        Sent as the ``count``, ``lang`` and ``result_type`` query parameters.
    url : str, optional
        Endpoint to query. Defaults to the notebook-level ``url_rest_``.
    auth : object, optional
        Passed to ``requests.get(auth=...)``. Defaults to the notebook-level
        ``auth_``.
    timeout : float or None
        Passed to ``requests.get(timeout=...)`` so a stalled connection
        cannot block the kernel indefinitely. ``None`` disables the limit.

    Returns
    -------
    The decoded JSON body of the response (``response.json()``).

    Raises
    ------
    NameError
        If ``url``/``auth`` are omitted and the notebook-level ``url_rest_``/
        ``auth_`` have not been defined.
    """
    if filters:
        # Produces e.g. 'TSLA -filter:retweets -filter:replies'.
        query = ' '.join([query] + ['-filter:%s' % f for f in filters])

    # Fall back to the notebook globals for existing callers. In this notebook
    # they are only created by the (currently commented-out)
    # `auth_, url_rest_ = twitter_auth.connect_old()` line.
    if url is None:
        url = url_rest_
    if auth is None:
        auth = auth_

    params = {'q': query, 'count': count, 'lang': lang,  'result_type': result_type}
    response = requests.get(url, params=params, auth=auth, timeout=timeout)
    return response.json()

#usage:
#tweets = tweet_query_old('TSLA')
#messages = [BeautifulSoup(tweet['text'], 'html5lib').get_text() for tweet in tweets['statuses']]

def tweet_query(query, filters = ['retweets', 'replies'], count = 100, lang = 'en', result_type = 'popular', client = None):
    """Search tweets through ``GetSearch`` and keep a trimmed record per tweet.

    Parameters
    ----------
    query : str
        The search terms.
    filters : list of str or None
        Each entry is appended to the query as ``-filter:<entry>``.
        ``None`` or an empty list leaves the query untouched.
    count, lang, result_type
        Forwarded unchanged to ``GetSearch``.
    client : object, optional
        Any object exposing
        ``GetSearch(term=..., result_type=..., count=..., lang=...)``.
        Defaults to the notebook-level ``api``.

    Returns
    -------
    list of dict
        One dict per result with the keys ``id_str``, ``full_text``,
        ``created_at``, ``favorite_count``, ``retweet_count`` and a nested
        ``user`` dict (``id_str``, ``name``, ``description``,
        ``followers_count``, ``statuses_count``, ``verified``,
        ``created_at``). Any field absent from a tweet's JSON is ``None``.
    """
    # Fields to keep; defined once instead of being rebuilt for every tweet.
    tweet_keys = ('id_str', 'full_text', 'created_at', 'favorite_count', 'retweet_count')
    user_keys = ('id_str', 'name', 'description', 'followers_count', 'statuses_count', 'verified', 'created_at')

    if filters:
        # Produces e.g. 'tesla -filter:retweets -filter:replies'.
        query = ' '.join([query] + ['-filter:%s' % f for f in filters])

    if client is None:
        client = api  # notebook-level client

    results = client.GetSearch(term=query, result_type = result_type, count=count, lang=lang)

    tweets = [] #list of dictionaries
    for twt in results:
        curr_json = json.loads(str(twt))
        # Empty fields (e.g. a blank profile description) can be missing from
        # the JSON altogether -- apparently a python-twitter quirk -- so every
        # lookup defaults to None rather than raising.
        user_json = curr_json.get('user')
        if not isinstance(user_json, dict):
            # Missing or null user: still emit the full set of user keys.
            user_json = {}
        record = {key: curr_json.get(key) for key in tweet_keys}
        record['user'] = {key: user_json.get(key) for key in user_keys}
        tweets.append(record)

    return tweets

## Tickers and salient details
Planned mapping (draft): `{Ticker: {name: ~, management: ~, ...}}`

In [None]:
# result_type='mixed' overrides tweet_query's default of 'popular'.
x = tweet_query('tesla', result_type = 'mixed')

In [126]:
# Display the trimmed tweet records returned by tweet_query.
x

[{'id_str': '1341006575650140161',
  'full_text': 'Thanks to everyone who worked so hard to make Tesla successful. My heart goes out to you.',
  'created_at': 'Mon Dec 21 13:04:16 +0000 2020',
  'favorite_count': 326012,
  'retweet_count': 11227,
  'user': {'id_str': '44196397',
   'name': 'Elon Musk',
   'description': None,
   'followers_count': 41028320,
   'statuses_count': 13154,
   'verified': True,
   'created_at': 'Tue Jun 02 20:12:29 +0000 2009'}},
 {'id_str': '1341053729529720833',
  'full_text': 'Send us a all a free Tesla 😂😂 https://t.co/kLRngnj4e1',
  'created_at': 'Mon Dec 21 16:11:39 +0000 2020',
  'favorite_count': 8543,
  'retweet_count': 392,
  'user': {'id_str': '83738272',
   'name': 'Ty Hill',
   'description': 'Proud Father🖤✌🏿 Cheetah Speed ⁉️⁉️⁉️ Coffee Washer SF/PF #77 Part -Time Delta Employee',
   'followers_count': 522919,
   'statuses_count': 12081,
   'verified': True,
   'created_at': 'Tue Oct 20 01:23:36 +0000 2009'}},
 {'id_str': '1340846145929670656',
 

In [63]:
# Ad-hoc check: collect only the text of each popular "tesla" result.
# NOTE(review): the language restriction is embedded in the search term here,
# whereas tweet_query passes lang= to GetSearch -- confirm which form is honoured.
results = api.GetSearch(term="tesla lang:fr ", result_type = 'popular', count=100)
tweets = []
for twt in results:
    # str(twt) is parsed with json.loads below, i.e. it is treated as JSON text.
    tempTweet = (str(twt))
    tweets.append(json.loads(tempTweet)['full_text'])

# tempTweet keeps the last tweet's JSON text after the loop; a later cell
# parses it again with json.loads(tempTweet).
tweets

["@leroidesrats Tranquille j'ai pris une semaine pour avoir appuyé sur le bouton selfie de mon tel en Tesla je comprends.",
 'Tesla revenue:\n\n2025: $95.1 billion*\n2024: $81.6 billion*\n2023: $71.8 billion*\n2022: $60.6 billion*\n2021: $46.2 billion*\n2020: $30.9 billion*\n2019: $24.6 billion\n2018: $21.5 billion\n2017: $11.8 billion\n2016: \xa0$7.0 billion\n2015: \xa0$4.1 billion\n2014: $3.2 billion\n\n*analyst estimates',
 'Tesla fait son entrée au S&amp;P 500: retour sur la succès story du constructeur de voitures électriques https://t.co/eNRvMnGWX7',
 '3 des grands gagnants de 2020\n\nTesla : + 684%\nZoom : + 490%\nBitcoin : + 220%\n\n#coronavirus',
 "L'iPhone est un téléphone augmenté, la Tesla est une voiture augmentée, le bitcoin est une monnaie augmentée. #Tech"]

In [49]:
# Full parsed payload of tempTweet (the last tweet seen by the search loop),
# presumably to see which fields are available.
json.loads(tempTweet)

{'created_at': 'Mon Dec 14 20:39:20 +0000 2020',
 'favorite_count': 191,
 'full_text': "L'iPhone est un téléphone augmenté, la Tesla est une voiture augmentée, le bitcoin est une monnaie augmentée. #Tech",
 'hashtags': [{'text': 'Tech'}],
 'id': 1338584381846413324,
 'id_str': '1338584381846413324',
 'lang': 'fr',
 'retweet_count': 23,
 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
 'urls': [],
 'user': {'created_at': 'Tue Aug 30 10:10:18 +0000 2011',
  'description': 'Journaliste @LEXPRESS / #Banques #Tech #Bitcoin ✉️ = rbloch@lexpress.fr',
  'favourites_count': 3821,
  'followers_count': 14632,
  'friends_count': 989,
  'id': 364784696,
  'id_str': '364784696',
  'listed_count': 241,
  'name': 'Raphaël Bloch',
  'profile_background_color': 'FFFFFF',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_backgroun

In [8]:
# NOTE(review): the output shown for this cell is a {'statuses': [...]} dict,
# the shape the tweet_query_old usage comment indexes with tweets['statuses'],
# not the list of strings built by the GetSearch loop. It looks like a stale
# run (In [8]); re-run on a fresh kernel to confirm.
tweets

{'statuses': [{'created_at': 'Sun Dec 20 16:24:15 +0000 2020',
   'id': 1340694515074691077,
   'id_str': '1340694515074691077',
   'text': 'When @elonmusk puts the $TSLA balance sheet into #Bitcoin, we\'ll have to change the BTC rallying cry from "to the moon!" to "to Mars!"',
   'truncated': False,
   'entities': {'hashtags': [{'text': 'Bitcoin', 'indices': [49, 57]}],
    'symbols': [{'text': 'TSLA', 'indices': [24, 29]}],
    'user_mentions': [{'screen_name': 'elonmusk',
      'name': 'Elon Musk',
      'id': 44196397,
      'id_str': '44196397',
      'indices': [5, 14]}],
    'urls': []},
   'metadata': {'result_type': 'popular', 'iso_language_code': 'en'},
   'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
   'in_reply_to_status_id': None,
   'in_reply_to_status_id_str': None,
   'in_reply_to_user_id': None,
   'in_reply_to_user_id_str': None,
   'in_reply_to_screen_name': None,
   'user': {'id': 24222556,
    'id_str': '24222556',
    'name'