In [ ]:
# !pip install tweepy
# !pip install seaborn
# !pip install plotly
# !pip install jsonpickle
# !pip install textblob
# !pip install gensim
# !pip install bokeh

In [ ]:
import tweepy
import credentials
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import jsonpickle
from textblob import TextBlob
import os
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import ngrams, FreqDist
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import string

# Lift pandas' display limits so DataFrames are shown with every row and
# column instead of being truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [ ]:
# Build an OAuth handler from the keys stored in the local `credentials`
# module, then attach the access token pair to it.
auth = tweepy.OAuthHandler(credentials.API_KEY, credentials.API_SECRET_KEY)
auth.set_access_token(credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
# Module-level API client used by the cells below. The two rate-limit flags
# presumably make Tweepy wait (and report it) when the rate limit is hit.
# NOTE(review): `wait_on_rate_limit_notify` looks specific to Tweepy 3.x --
# confirm against the installed Tweepy version.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Helper Functions

In [ ]:
class Tweet():
    """Small record holding a tweet id together with the URL built from it."""

    def __init__(self, id):
        # Store the id as given and derive the URL once, at construction time.
        self.id = id
        self.url = self.get_tweet_url(id)

    def get_tweet_url(self, id):
        """Return the status URL for ``id`` (converted with ``str``)."""
        base = "https://twitter.com/ex/status/"
        return "".join((base, str(id)))


# Example: build a Tweet and display its URL.
t = Tweet(1252383850086318084)
t.url

'https://twitter.com/ex/status/1252383850086318084'

In [ ]:
def get_tweet_url(id):
    """Return the status URL for tweet ``id`` (converted with ``str``)."""
    base = "https://twitter.com/ex/status/"
    return "".join((base, str(id)))

#get_tweet_url(1252748633767895041)

In [ ]:
def deEmojify(text):
    """
    Strip emojis from ``text`` by dropping every non-ASCII character.

    Note that this removes *all* non-ASCII characters (accented letters,
    curly quotes, ...), not only emojis.

    :param text: str or None

    :returns: the ASCII-only string; ``""`` for an empty or emoji-only
        string; ``None`` when ``text`` is ``None`` (or another falsy
        non-string value)
    """
    if not text:
        # An empty string stays a string so that string inputs always yield
        # strings (an emoji-only tweet already returns ""); None stays None.
        return "" if isinstance(text, str) else None
    # Encoding with errors="ignore" silently drops whatever ASCII cannot hold.
    return text.encode("ascii", "ignore").decode("ascii")

### Set Up Search Query

* Tweet mode `extended` lets us access the full, untruncated text of tweets that contain more than 140 characters

In [ ]:
# Keyword to look for; the query built from it also filters out retweets.
TRACK_WORD = "bermuda"

# Parameters forwarded to the search request.
SEARCH_QUERY = " ".join((TRACK_WORD, "-filter:retweets"))
SEARCH_SINCE, SEARCH_UNTIL = "2020-04-20", "2020-04-21"
SEARCH_LANG = "en"
# Number of tweets asked for per request (original note: "100 per request",
# presumably the API maximum -- TODO confirm).
BATCH_SIZE = 50
SEARCH_TWEET_MODE = "extended"

# File the downloaded tweets are cached in.
FILE_NAME = "data/tweets.txt"

# Shared plot styling.
FONT_STYLES = {"family": "Oswald", "color": "#eeeeee"}

# Colour names: https://developer.mozilla.org/en-US/docs/Web/CSS/color_value
COLORS = ["mediumturquoise"]

In [ ]:
def load_tweets(new_api_request=False):
    """
    Load tweets either via a new API request or from file.

    When ``new_api_request`` is true, one search request is sent through the
    module-level ``api`` client and the results are written to ``FILE_NAME``
    as one JSON document per line. The cache file is only rewritten after the
    request has succeeded, so a Tweepy error leaves previously saved tweets
    untouched. In every case the function then loads whatever ``FILE_NAME``
    contains.

    :param new_api_request: bool, request fresh tweets before loading

    :returns: Dataframe with one row per cached tweet; an empty Dataframe if
        the cache file is missing or holds no tweets
    """

    if new_api_request:
        print("Making an API request...")
        try:
            new_tweets = api.search(
                q=SEARCH_QUERY,
                since=SEARCH_SINCE,
                until=SEARCH_UNTIL,
                count=BATCH_SIZE,
                lang=SEARCH_LANG,
                tweet_mode=SEARCH_TWEET_MODE,
            )
        except tweepy.TweepError as e:
            # Report the failure and fall through to the cached file, which
            # has deliberately not been opened (and truncated) yet.
            print("Tweepy error: " + str(e))
        else:
            # Create the cache directory on first use so open() cannot fail
            # just because "data/" does not exist yet.
            cache_dir = os.path.dirname(FILE_NAME)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)

            with open(FILE_NAME, 'w', encoding='utf-8') as f:
                for tweet in new_tweets or []:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')

            if not new_tweets:
                print("No tweets found.")
            else:
                # Report what was actually received, not what was asked for.
                print("Downloaded {} tweets.".format(len(new_tweets)))

    # A missing or empty cache both mean "no tweets"; skip the JSON parser.
    if not os.path.exists(FILE_NAME) or os.path.getsize(FILE_NAME) == 0:
        return pd.DataFrame()

    print("Loading from file...")
    return pd.read_json(path_or_buf=FILE_NAME, lines=True)

In [ ]:
# Load the tweets (load_tweets() makes no new API request by default) and
# preview the first rows.
tweets_ = load_tweets()
tweets_.head()

Loading from file...

Converting input from bool to <class 'numpy.uint8'> for compatibility.



Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status
0,2020-04-20 23:57:40+00:00,1252385978813001729,1252385978813001728,@Shansdoe Need to take a trip to Bermuda next ...,False,"[10, 50]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",1.252279e+18,1.252279e+18,1642051000.0,1642051000.0,Shansdoe,"{'id': 1170596048, 'id_str': '1170596048', 'na...",,,,,False,0,0,False,False,en,,,,,
1,2020-04-20 23:54:03+00:00,1252385071610257415,1252385071610257408,4/20: 28 days after seeding!! Decided pull som...,False,"[0, 265]","{'hashtags': [{'text': 'weeds', 'indices': [94...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 1077614027847729152, 'id_str': '1077614...",,,,,False,0,3,False,False,en,"{'media': [{'id': 1252385054682046464, 'id_str...",0.0,,,
2,2020-04-20 23:53:13+00:00,1252384859265232901,1252384859265232896,"“Christianize me if you may, but don’t try to ...",False,"[0, 104]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 111545013, 'id_str': '111545013', 'name...",,,,,False,1,3,False,False,en,,,,,
3,2020-04-20 23:49:12+00:00,1252383850086318084,1252383850086318080,these are so fcking annoying I barely got any ...,False,"[0, 73]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 1714566750, 'id_str': '1714566750', 'na...",,,,,True,0,0,False,False,en,,0.0,1.252054e+18,1.252054e+18,{'created_at': 'Mon Apr 20 01:59:23 +0000 2020...
4,2020-04-20 23:45:11+00:00,1252382838885777411,1252382838885777408,"@Bibiana1Krall @wordpressdotcom Sounds great, ...",False,"[32, 120]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.252347e+18,1.252347e+18,2921017000.0,2921017000.0,Bibiana1Krall,"{'id': 3012717386, 'id_str': '3012717386', 'na...",,,,,False,0,1,False,False,en,"{'media': [{'id': 1252382832917229571, 'id_str...",0.0,,,


In [ ]:
# Reading Directly into a DF

# results = api.search(
#                    q = SEARCH_QUERY,
#                    since = SEARCH_SINCE,
#                    until = SEARCH_UNTIL,
#                    count = BATCH_SIZE,
#                    lang = SEARCH_LANG,
#                    tweet_mode = SEARCH_TWEET_MODE)

# json_data = [r._json for r in results]
# df = pd.json_normalize(json_data)

### Data Dictionary

* `created_at`: datetime of tweet creation
* `id` and `id_str`: unique identifiers that can be used to generate a URL pointing to the tweet
* `full_text`: full text content of the tweet (retrieved via `extended` tweet mode)
* `truncated`: not relevant for extended tweet payload as no tweets will have been truncated
* `display_text_range`: delineates sections of tweet text (ex: where does a mention end and the body begin)
* `entities`: a group of fields containing all sub-elements of a tweet and their positions (ex: hashtags used and their start/end indices)
* `source`: origin of tweet in href format (ex: iphone, mobile web, web)
* `in_reply_to_*`: group of fields relating to the tweet/user being replied to
* `user.x`: group of fields relating to the user who created the tweet

### Cleaning Tweets

* Remove emojis from text
* Convert `created_at` to datetime

In [ ]:
# Work on a copy so the raw download in `tweets_` is left as loaded, then
# replace the two columns that need cleaning:
#   - full_text:  run every tweet body through deEmojify
#   - created_at: pass every value through pd.to_datetime
tweets_clean = tweets_.copy().assign(
    full_text=lambda frame: frame["full_text"].apply(deEmojify),
    created_at=lambda frame: frame["created_at"].apply(pd.to_datetime),
)

### Apply TextBlob for basic sentiment analysis

In [ ]:
# Analyse each tweet with TextBlob once and reuse the result for both
# columns (previously every tweet was analysed twice, once per column).
sentiments = [TextBlob(text).sentiment for text in tweets_clean["full_text"]]

# The sentiment result is indexable: [0] is polarity, [1] is subjectivity.
tweets_clean["textblob_polarity"] = [sentiment[0] for sentiment in sentiments]
tweets_clean["textblob_subjectivity"] = [sentiment[1] for sentiment in sentiments]

In [ ]:
# Report the time span covered by the collected tweets.
earliest_tweet = tweets_clean["created_at"].min()
latest_tweet = tweets_clean["created_at"].max()
print(
    "Earliest tweet: {} \nMost recent tweet: {}\nTime Elapsed: {}".format(
        earliest_tweet,
        latest_tweet,
        latest_tweet - earliest_tweet,
    )
)

Earliest tweet: 2020-04-20 22:14:27+00:00 
Most recent tweet: 2020-04-20 23:57:40+00:00
Time Elapsed: 0 days 01:43:13


### Time series visualization

In [ ]:
# Width of each time bucket used to count tweets.
TIME_WINDOW = "2min"

# Count tweet ids per TIME_WINDOW bucket of `created_at`, then flatten the
# result into two plain columns: `created_at` and `count`.
time_series = (
    tweets_clean[["id", "created_at"]]
    .groupby(pd.Grouper(key="created_at", freq=TIME_WINDOW))
    .count()
    .reset_index()
    .rename(columns={"id": "count"})
)

# Line chart of tweets per bucket.
fig = px.line(
    time_series,
    x="created_at",
    y="count",
    color_discrete_sequence=['greenyellow'],
)
fig.update_layout(
    title="Tweet Frequency",
    xaxis_title="Time",
    yaxis_title="Count",
    template="plotly_dark",
    font=FONT_STYLES,
    # One y-axis tick per whole tweet, starting at zero.
    yaxis={"tickmode": 'linear', "tick0": 0, "dtick": 1},
)

fig.show()

### Basic sentiment analysis with TextBlob

In [ ]:
# Two side-by-side histograms: TextBlob polarity (left) and subjectivity
# (right), sharing a single "Count" y-axis title.
fig = make_subplots(
        rows=1, cols=2,
        column_widths=[1, 1],
        row_heights=[2],
        y_title= "Count"
        )

# Polarity: bins of width 0.2 spanning the observed polarity range.
fig.add_trace(go.Histogram(x=tweets_clean["textblob_polarity"],
    name="Polarity", marker=dict(color = 'aquamarine'), xbins=dict(start=np.min(tweets_clean["textblob_polarity"]),
    size=0.2, end=np.max(tweets_clean["textblob_polarity"]))
    ), row=1, col=1)

# Subjectivity: bins of width 0.1 spanning the observed *subjectivity* range.
# (The bin limits were previously taken from the polarity column, which
# misaligned the bins with the data being plotted.)
fig.add_trace(go.Histogram(x=tweets_clean["textblob_subjectivity"],
    name="Subjectivity", marker=dict(color = 'lightsalmon'), xbins=dict(start=np.min(tweets_clean["textblob_subjectivity"]),
    size=0.1, end=np.max(tweets_clean["textblob_subjectivity"]))
    ), row=1, col=2)

fig.update_xaxes(title_text="Polarity", row=1, col=1)
fig.update_xaxes(title_text="Subjectivity", row=1, col=2)

fig.update_layout(
    title_text = "Tweet Polarity & Subjectivity Using TextBlob Library",
    template="plotly_dark",
    font=FONT_STYLES
)
fig.show()