In [1]:
'''Download and analyze Twitter data stored in MongoDB
'''

# standard library
import configparser
from datetime import date, timedelta
import os

# non-standard library
import numpy as np
import pandas as pd
from pymongo import MongoClient
import pytz

## Mongo Functions

### Two ways to make sure the connection is closed:
1. `try...finally`
2. Context manager

In [2]:
# Without using a context manager
def download_tweets_by_hashtag_nonpythonic(hashtag):
    '''Connect to MongoDB, download tweets with param hashtags

    Uses an explicit ``try...finally`` instead of a context manager to make
    sure the client is closed. Connection settings are read from the
    module-level ``mlab_uri`` and ``collection`` names.

    Args:
        * hashtag - text hashtag (without the leading '#')

    Returns:
        * list of Tweets containing hashtag

    Raises:
        Any error raised while connecting or querying is propagated to the
        caller (the client is still closed first).
    '''

    # Create the client *before* the try block: if this call fails there is
    # nothing to close, and `client` would be unbound inside `finally`.
    client = MongoClient(mlab_uri)
    try:
        db = client.get_default_database()
        coll = db[collection]
        # find() returns a lazy cursor; materialize it while the client is
        # still open so the caller gets a real list, as documented.
        return list(coll.find({"entities.hashtags.text":f"{hashtag}"}))
    finally:
        # No `return` here: returning from `finally` would silently discard
        # any exception raised in the try block.
        client.close()

In [3]:
class MongoCollection(object):
    '''Connect to mongodb and return collection within context manager

    The client is created when the object is constructed; entering the
    ``with`` block yields the collection and leaving it closes the client.

    http://book.pythontips.com/en/latest/context_managers.html

    Args:
        * uri - MongoDB connection URI (passed to MongoClient)
        * collection - name of the collection in the URI's default database
    '''

    def __init__(self, uri, collection):
        self.client = MongoClient(uri)
        try:
            self.db = self.client.get_default_database()
            self.collection = self.db[collection]
        except Exception:
            # __exit__ is never called when __init__ fails, so the client
            # has to be released here or it would leak.
            self.client.close()
            raise

    def __enter__(self):
        '''Return the collection for use inside the ``with`` block.'''
        return self.collection

    def __exit__(self, exc_type, exc_value, exc_traceback):
        '''Close the client; returns None so exceptions propagate.'''
        self.client.close()

In [4]:
def download_tweets_by_hashtag(hashtag):
    '''Connect to MongoDB, download tweets with param hashtags

    Connection settings are read from the module-level ``mlab_uri`` and
    ``collection`` names.

    Args:
        * hashtag - text hashtag (without the leading '#')

    Returns:
        * list of Tweets containing hashtag
    '''
    with MongoCollection(mlab_uri, collection) as coll:
        # find() only builds a lazy cursor; documents are fetched when it is
        # iterated. Consume it inside the `with` block, while the client is
        # still open -- iterating after the block exits would use a client
        # that has already been closed.
        return list(coll.find({'entities.hashtags.text':f'{hashtag}'}))

## Get data from Mongo

In [5]:
# get mongodb params (using configparser)
config = configparser.ConfigParser()
config_file = 'settings.cfg'

# Use read_file() rather than read(): read() silently skips files it cannot
# open, which leaves an empty parser and defers the failure to the first
# config.get() below. Opening the file ourselves fails immediately with a
# clear error when settings.cfg is missing.
with open(config_file) as config_fh:
    config.read_file(config_fh)

# connection URI and collection name used by the Mongo helper functions
mlab_uri = config.get('MongoDB', 'mlab_uri')
collection = config.get('MongoDB', 'mlab_collection')

In [6]:
# Download every stored tweet tagged with 'codeeveryday'. The resulting
# list is reused by the inspection and cleaning cells below.
tweets = download_tweets_by_hashtag('codeeveryday')

In [7]:
# Print the text of each downloaded tweet as a quick sanity check.
for tweet in tweets:
    print(tweet['text'])

Spent around 30 minutes working on my side project this morning. Not sure if I'll start a commit streak, but will #codeeveryday #programming
Finished an analysis of historical Premier League results. Blog post coming shortly #codeeveryday
Got into the office early to work on the side project. Starbucks after work to finish off today's task #codeeveryday
Early to bed, early to rise. Refactored my analysis to use a #postgres backend, set up as a script. And done! Next project! #codeeveryday
Little late in the day, but was able to fit some time in for the side project. #codeeveryday
Early morning bash scripting. #!/bin/bash/codeeveryday #codeeveryday #programming #unix #linux #macos #chmod #chmod755
Finished up the project I started yesterday. Productive weekend. #codeeveryday
Finished a blog draft. Does Markdown count? I say yes. Any kind of progress on a side project, blogs included. #codeeveryday
Another day working on the blog. #codeeveryday
Playing around on #Bluemix, trying to figur

In [8]:
# Display the first raw tweet document to see which fields are available.
tweets[0]

{'_id': ObjectId('58d92c3e598402000101e989'),
 'contributors': None,
 'coordinates': None,
 'created_at': 'Mon Mar 27 13:07:10 +0000 2017',
 'entities': {'hashtags': [{'indices': [114, 127], 'text': 'codeeveryday'},
   {'indices': [128, 140], 'text': 'programming'}],
  'symbols': [],
  'urls': [],
  'user_mentions': []},
 'favorite_count': 1,
 'favorited': False,
 'geo': None,
 'id': 846347868734722049,
 'id_str': '846347868734722049',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'retweet_count': 0,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'text': "Spent around 30 minutes working on my side project this morning. Not sure if I'll start a commit streak, but will #codeeveryday #programming",
 'truncated': False,
 'user': {'contributors

## Clean data and throw into Pandas DataFrame

In [9]:
# Timezone that tweet timestamps are converted to in the cleaning cell below.
central = pytz.timezone('US/Central')

In [10]:
cleaned_tweets = []

# go through tweets, extract (and transform) fields into useable format
for tweet in tweets:
    # Parse explicitly as UTC. Depending on the pandas version, to_datetime
    # returns either a naive or a tz-aware value for strings that carry a
    # "+0000" offset, and pytz's localize() rejects tz-aware input.
    # utc=True gives a tz-aware UTC Timestamp in both cases, which is then
    # converted to US/Central.
    central_ts = pd.to_datetime(tweet['created_at'], utc=True).tz_convert(central)
    central_dt = central_ts.to_pydatetime()

    # drop the timezone as it is not relevant anymore: from here on every
    # value is Central wall-clock time
    tweet_time = central_dt.replace(tzinfo=None)
    # midnight of the same day (microsecond zeroed too, so the difference
    # below is exactly the time elapsed since midnight)
    tweet_date = tweet_time.replace(hour=0, minute=0, second=0, microsecond=0)

    item = {
        'text': tweet['text'],
        'created_at': tweet_time,
        'date': central_dt.date(),
        'time': central_dt.time(),
        'timedelta': tweet_time - tweet_date,  # time of day as an offset from midnight
        'count': 1,  # one per tweet; summed per day in the stats cell
    }

    cleaned_tweets.append(item)

In [11]:
# Build the DataFrame in one go from the list of cleaned records, ordered by
# the 'date' column.
tweets_df = pd.DataFrame(cleaned_tweets).sort_values('date')

In [12]:
# Preview the first four rows of the cleaned data.
tweets_df.head(4)

Unnamed: 0,count,created_at,date,text,time,timedelta
0,1,2017-03-27 08:07:10,2017-03-27,Spent around 30 minutes working on my side pro...,08:07:10,08:07:10
1,1,2017-03-28 07:51:32,2017-03-28,Finished an analysis of historical Premier Lea...,07:51:32,07:51:32
2,1,2017-03-29 08:47:37,2017-03-29,Got into the office early to work on the side ...,08:47:37,08:47:37
3,1,2017-03-30 06:11:29,2017-03-30,"Early to bed, early to rise. Refactored my ana...",06:11:29,06:11:29


## Calculate Stats

In [13]:
def on_streak(df, date_to_check):
    '''Check to see if we tweeted hashtag on that day

    Args:
        * df - DataFrame indexed by day with an 'indicator' column
          (1 = tweeted that day, 0 = did not)
        * date_to_check - day to check (datetime.date, datetime or Timestamp)

    Returns:
        bool - True if the day is present in the index and its indicator
        is 1, False otherwise (including when the day is not in the index)
    '''

    # Try the key as given first, then its Timestamp form. A plain
    # datetime.date is not matched against a DatetimeIndex by recent pandas
    # versions, so without the normalized key the lookup would always miss.
    candidates = [date_to_check]
    try:
        candidates.append(pd.Timestamp(date_to_check))
    except (TypeError, ValueError):
        # not convertible to a timestamp; only the raw key can match
        pass

    for key in candidates:
        if key in df.index:
            # bool() so callers get a plain Python bool, not numpy.bool_
            return bool(df.loc[key, 'indicator'] == 1)

    return False

In [14]:
# get counts for each day, set date as index
# (the former bare `counts_df.index.to_datetime` line was removed: it never
# called anything, and the attribute does not exist on current pandas)
counts_df = tweets_df[['created_at', 'count']].set_index('created_at')

# sum up each day's count and fill days with no activity with 0
counts_df = counts_df.resample('D').sum().fillna(0)

# collapse the daily counts to a 0/1 "tweeted that day" indicator
counts_df = (counts_df['count'] > 0).astype(int).to_frame('indicator')

# find streaks: a new block starts whenever the indicator differs from the
# previous day's, so the running sum of those changes labels each run of
# identical values with its own block id
indicator_changed = counts_df['indicator'] != counts_df['indicator'].shift(1)
counts_df['block'] = indicator_changed.astype(int).cumsum()

# get length of streaks: number of days in each (indicator, block) run
streak_lengths = (counts_df
                      .reset_index()
                      .groupby(['indicator', 'block'])['created_at']
                      .size())

# pull the level where indicator=1 (runs of days with at least one tweet)
tweet_streaks = streak_lengths.loc[1]
longest_streak = tweet_streaks.max()
num_days_with_tweet = tweet_streaks.sum()
total_tweets = len(tweets_df)

In [15]:
# current streak
last_day_of_streak = None
tweet_to_continue_streak = False

# Take "today" once, in the same timezone the tweet dates were converted to
# (US/Central), so the comparison does not depend on the host's local
# timezone or on the clock rolling past midnight between calls.
today = pd.Timestamp.now(tz=central).date()
yesterday = today - timedelta(days=1)

# Days are handed to on_streak as Timestamps so they match counts_df's
# datetime index regardless of pandas version.

# check if we are currently in the middle of a streak
if on_streak(counts_df, pd.Timestamp(today)):
    last_day_of_streak = today

# since tweets are downloaded every 12 hours, we might not have today's info
# check if we were on a streak yesterday
elif on_streak(counts_df, pd.Timestamp(yesterday)):
    last_day_of_streak = yesterday
    tweet_to_continue_streak = True  # remind user to tweet to continue streak!

# if we are currently streaking, count backwards one day at a time until we
# reach a day without a tweet
curr_streak = 0
if last_day_of_streak is not None:
    day_to_check = last_day_of_streak
    while on_streak(counts_df, pd.Timestamp(day_to_check)):
        curr_streak += 1
        day_to_check -= timedelta(days=1) # go back a day

In [16]:
# Summary statistics for the time of day of each tweet: the 'timedelta'
# column holds each tweet's offset from midnight.
print('Tweet Time Stats')
print('----------------')
tweets_df['timedelta'].describe()

Tweet Time Stats
----------------


count                        36
mean     0 days 16:57:48.583333
std      0 days 06:08:23.534578
min             0 days 06:11:29
25%      0 days 10:14:08.500000
50%      0 days 20:37:23.500000
75%             0 days 21:53:44
max             0 days 23:47:08
Name: timedelta, dtype: object

In [17]:
# Report the date-based statistics computed in the stats and streak cells.
print('Tweet Date Results')
print('------------------')
print(f'Total tweets referencing hashtag: {total_tweets}')
print(f'Number of days hashtag was used: {num_days_with_tweet}')
# label typo fixed: "hastag" -> "hashtag"
print(f'Most consecutive days using hashtag: {longest_streak}')
print(f'Current Streak: {curr_streak}')
print(f'Need to tweet to continue streak? {tweet_to_continue_streak}')

Tweet Date Results
------------------
Total tweets referencing hashtag: 36
Number of days hashtag was used: 36
Most consecutive days using hastag: 18
Current Streak: 18
Need to tweet to continue streak? True


In [18]:
## TODO:
# Use itertools to count the longest streak (fun exercise). See:
# http://stackoverflow.com/questions/24342047/count-consecutive-occurences-of-values-varying-in-length-in-a-numpy-array

# Use a for loop walking back from today/yesterday to get the current streak.