In [None]:
!pip install python-twitter python-dotenv

In [158]:
import os
import gzip
import copy
import json
import time
import urllib
import twitter
import itertools
import datetime
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

# Twitter API client used by every cell below. The four credentials are read
# from environment variables; tweet_mode='extended' is requested and the rest
# of the notebook reads each status's `full_text`.
t = twitter.Api(
    tweet_mode='extended',
    consumer_key=os.getenv('CONSUMER_KEY'),
    consumer_secret=os.getenv('CONSUMER_SECRET'),
    access_token_key=os.getenv('ACCESS_TOKEN'),
    access_token_secret=os.getenv('ACCESS_SECRET'),
)

def remove_media(tw):
    """Return a shallow copy of ``tw`` with its media links cut out of the text.

    When the status has no media, the copy is returned with ``full_text``
    untouched. Otherwise the text is stripped, the first occurrence of every
    media item's ``url`` is removed, and the result is stripped again.
    The status passed in is never modified.
    """
    cleaned = copy.copy(tw)
    if not cleaned.media:
        return cleaned

    body = cleaned.full_text.strip()
    for item in cleaned.media:
        # Only the first occurrence of each media link is dropped.
        body = body.replace(item.url, '', 1)

    cleaned.full_text = body.strip()
    return cleaned

def remove_starting_mentions(tw):
    """Return a shallow copy of ``tw`` without the @-mentions that open its text.

    Only handles at the very beginning of the (stripped) text are removed, and
    only when they belong to one of the status's ``user_mentions``. Matching is
    case-insensitive and on whole handles, so ``@bob`` never eats the front of
    ``@bobby`` and a mention in the middle of a sentence is left in place.

    Parameters
    ----------
    tw : status-like object exposing ``full_text`` and ``user_mentions``
        (each mention exposing ``screen_name``).

    Returns
    -------
    A copy of ``tw`` with ``full_text`` replaced; the input is not modified.
    """
    tw = copy.copy(tw)
    mentioned = {u.screen_name.lower() for u in (tw.user_mentions or [])}

    text = tw.full_text.strip()
    while text.startswith('@'):
        # Find the end of the handle: letters, digits and underscores.
        end = 1
        while end < len(text) and (text[end].isalnum() or text[end] == '_'):
            end += 1
        if text[1:end].lower() not in mentioned:
            # Leading token is not one of this status's mentions: keep it.
            break
        text = text[end:].lstrip()

    tw.full_text = text.strip()
    return tw

def format_media(m):
    """Reduce a media entity to the small dict stored in the timeline.

    * ``photo`` -> ``{'type': 'image', 'url': <media_url_https>}``
    * ``video`` / ``animated_gif`` -> ``{'type': 'video', 'url', 'content-type'}``
      taken from the variant with the highest ``bitrate`` (variants without a
      bitrate count as 0).
    * any other type -> ``{}``
    """
    kind = m.type

    if kind == 'photo':
        return {
            'type': 'image',
            'url': m.media_url_https,
        }

    if kind not in ('video', 'animated_gif'):
        return {}

    def bitrate_of(variant):
        # Variants lacking a 'bitrate' key rank lowest.
        return variant.get('bitrate', 0)

    best = max(m.video_info['variants'], key=bitrate_of)
    return {
        'type': 'video',
        'url': best['url'],
        'content-type': best['content_type'],
    }

def parse_date(dt):
    """Parse a ``created_at`` string such as ``'Thu Mar 19 12:00:00 +0000 2020'``.

    Returns a timezone-aware ``datetime.datetime``.
    """
    created_at_format = '%a %b %d %H:%M:%S %z %Y'
    return datetime.datetime.strptime(dt, created_at_format)

In [2]:
tweets = {}

# https://twitter.com/ohbmequinox/status/1240818088376823809?s=21

first_tweet = 1240818088376823809
params      = {
    'include_rts': False,
    # NOTE(review): the user-timeline endpoint is documented to return at most
    # 200 statuses per request, so this value is presumably clamped — confirm.
    'count': 5000,
    'since_id': first_tweet-1,
}

# Page backwards through the account's timeline until `first_tweet` shows up
# or a page comes back empty.
fetching= True
while fetching:
    fetching = t.GetUserTimeline(screen_name='ohbmequinox', **params)
    tweets.update({tt.id: tt for tt in fetching})
    if not fetching or first_tweet in [tt.id for tt in fetching]:
        break
    # Only move the upper bound: the retweet filter, page size and `since_id`
    # must stay in force on every page. `max_id` is inclusive, so go one below
    # the oldest id already received to avoid re-fetching it (and to guarantee
    # progress on every iteration).
    params['max_id'] = min(tt.id for tt in fetching) - 1

# Manual correction: fetch this status (posted from another account), drop its
# '@OHBMequinoX ' text and first user mention, re-attribute it to the author of
# the first fetched tweet, and store it in place of 1240854713781764101.
# https://twitter.com/AskDrJeg/status/1240855036873211904
wrong_status = t.GetStatus(1240855036873211904)
wrong_status.full_text = wrong_status.full_text.replace('@OHBMequinoX ', '', 1)
wrong_status.user_mentions = wrong_status.user_mentions[1:]
wrong_status.user = list(tweets.values())[0].user
tweets.update({wrong_status.id: wrong_status})
del tweets[1240854713781764101]

# Manual correction: this status is excluded from the dataset.
# https://twitter.com/OHBMequinoX/status/1240823872976379904
del tweets[1240823872976379904]

In [149]:
# Statuses that are not retweets.
originals = {
    status_id: status
    for status_id, status in tweets.items()
    if status.retweeted_status is None
}

# Every original that is not a reply opens one timeline entry, keyed by its id.
timeline = {
    status_id: dict(
        date=parse_date(status.created_at),
        text=status.full_text.strip(),
        ids=[status_id],
        mentions=[mention.id for mention in status.user_mentions],
        media=[format_media(item) for item in (status.media or [])],
    )
    for status_id, status in originals.items()
    if status.in_reply_to_status_id is None
}

# Originals that reply to the 'ohbmequinox' account; these are merged into the
# timeline entries in the next cell.
originals_replies = {
    status_id: status
    for status_id, status in originals.items()
    if status.in_reply_to_status_id
    and status.in_reply_to_screen_name.lower() == 'ohbmequinox'
}

In [150]:
# Maps every tweet id already placed in the timeline to the id of the timeline
# entry (thread root) it belongs to. Roots map to themselves.
reply_to = {twid: twid for twid in timeline}

# Fold each self-reply into the entry of the thread it continues. A reply can
# only be merged once the tweet it answers has been placed, so the pending
# replies are swept repeatedly until none are left.
while originals_replies:

    merged_any = False

    for twid, tw in list(originals_replies.items()):

        twid_reply = tw.in_reply_to_status_id

        # Parent not placed yet: retry on a later sweep.
        if twid_reply not in timeline and twid_reply not in reply_to:
            continue

        # Resolve the parent to the root entry of its thread.
        if twid_reply in reply_to:
            twid_reply = reply_to[twid_reply]

        reply_to[twid] = twid_reply

        original_text = timeline[twid_reply]['text']
        tw = remove_starting_mentions(remove_media(tw))
        text = tw.full_text

        # Join "first part..." + "...second part" without the continuation dots.
        for ellipsis in ['...', '…']:
            if text.startswith(ellipsis):
                text = text.replace(ellipsis, '', 1)
            if original_text.endswith(ellipsis):
                original_text = ellipsis.join(original_text.split(ellipsis)[:-1])

        timeline[twid_reply]['text'] = f'{original_text} {text}'

        # This one thread numbers its parts instead; drop the counters.
        if twid_reply == 1240888314137202689:
            timeline[twid_reply]['text'] = timeline[twid_reply]['text'].replace('1/2', '').replace('2/2', '')

        timeline[twid_reply]['ids'] += [twid]
        timeline[twid_reply]['mentions'] += [u.id for u in tw.user_mentions]
        timeline[twid_reply]['media'] += [format_media(m) for m in (tw.media or [])]

        del originals_replies[twid]
        merged_any = True

    if not merged_any:
        # Every remaining reply answers a tweet that is not in the timeline
        # (e.g. removed above or never fetched). Without this guard the
        # enclosing `while` would spin forever.
        print(f'{len(originals_replies)} replies could not be attached to a thread: {sorted(originals_replies)}')
        break

In [151]:
# One row per timeline entry whose text starts with the '#OHBMx-' prefix; the
# first line of the text is kept separately as the session label.
presentation_rows = [
    {
        'id': entry_id,
        'date': entry['date'],
        'text': entry['text'],
        'session': entry['text'].split('\n')[0],
    }
    for entry_id, entry in timeline.items()
    if entry['text'].startswith('#OHBMx-')
]
df = pd.DataFrame(presentation_rows).sort_values(by='date')

In [152]:
# Export the presentation table as a JSON array with one object per row.
df.to_json(path_or_buf='presentations.json', orient='records')

In [153]:
# Rows whose session label contains '#keynote' or '#talk'.
is_keynote = df.session.str.contains('#keynote')
is_talk = df.session.str.contains('#talk')
threads = df[is_keynote | is_talk]

In [126]:
def get_replies(tweets, mentions):
    """Yield replies to ``tweets`` written by users those tweets mention.

    Searches for tweets addressed to the authors of ``tweets``, keeps the ones
    that directly answer one of ``tweets`` and whose author id appears in the
    mention list of the answered tweet, then recurses into each kept reply to
    follow the conversation further down.

    Parameters
    ----------
    tweets : list of statuses to look for replies to.
    mentions : dict mapping tweet id -> list of allowed user ids. It is updated
        in place: every yielded reply inherits the list of the tweet it answers.

    Yields
    ------
    Each matching reply, immediately followed by the replies found beneath it.

    Raises
    ------
    twitter.error.TwitterError
        If the search keeps failing ``max_failures`` times in a row.
    """
    if not tweets:
        # Nothing to search for; avoids min() on an empty collection below.
        return

    page_size = 100    # results requested per search call
    max_failures = 30  # consecutive failed calls tolerated before giving up

    users = {tw.user.screen_name for tw in tweets}
    tweet_ids = {tweet.id for tweet in tweets}

    # Replies to this status are addressed to a second account as well.
    if 1240855036873211904 in tweet_ids:
        users.add('AskDrJeganathan')

    term = " OR ".join([f'to:@{user}' for user in users])
    since_id = min(tweet_ids)

    max_id = None
    failures = 0
    while True:
        try:
            replies = t.GetSearch(
                term=term,
                since_id=since_id,
                max_id=max_id,
                count=page_size,
            )
        except twitter.error.TwitterError:
            failures += 1
            if failures >= max_failures:
                # Persistent failure: surface the error instead of retrying
                # forever.
                raise
            print("Waiting")
            time.sleep(60)
            continue
        failures = 0

        for reply in replies:

            if reply.id in tweet_ids:
                continue

            if reply.in_reply_to_status_id not in tweet_ids:
                continue

            if reply.user.id not in mentions[reply.in_reply_to_status_id]:
                continue

            # The reply inherits the allowed-user list of the tweet it answers,
            # so the recursive call can filter its own replies.
            mentions[reply.id] = mentions[reply.in_reply_to_status_id]

            yield reply
            yield from get_replies([reply], mentions)

        if len(replies) != page_size:
            break

        # `max_id` is inclusive: continue strictly below the oldest result so
        # the boundary tweet is not returned (and yielded) a second time.
        max_id = min(reply.id for reply in replies) - 1

In [127]:
# Collect every tweet id that belongs to a keynote/talk thread, and give each
# of those ids the mention list of its thread.
all_tweets_from_threads = []
mentions_for_tweets_from_threads = {}
for twid in threads.id.tolist():
    thread_entry = timeline[twid]
    all_tweets_from_threads.extend(thread_entry['ids'])
    # All ids of a thread share the very same list object.
    mentions_for_tweets_from_threads.update(
        dict.fromkeys(thread_entry['ids'], thread_entry['mentions'])
    )

thread_statuses = [tweets[twid] for twid in all_tweets_from_threads]
replies = list(get_replies(thread_statuses, mentions_for_tweets_from_threads))

Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


In [154]:
def unthread(tweets, original_tweets, replies):
    """Yield the replies hanging below ``original_tweets``, one level at a time.

    First come the items of ``replies`` that directly answer one of the ids in
    ``original_tweets`` (in ``replies`` order), then the items answering those,
    and so on until a level is empty. ``tweets`` is accepted but not used.
    """
    frontier = original_tweets
    while True:
        level = [
            candidate for candidate in replies
            if candidate.in_reply_to_status_id in frontier
            and candidate.id not in frontier
        ]
        if not level:
            return
        yield from level
        # The next level answers the replies just emitted.
        frontier = [found.id for found in level]
        
# Authors of thread replies, keyed by screen name (filled in below).
users = {}

for twid, entry in timeline.items():
    if twid in all_tweets_from_threads:
        # Attach the chronologically ordered replies to this thread entry.
        entry['thread'] = []
        ordered_replies = sorted(
            unthread(tweets, entry['ids'], replies),
            key=lambda tw: parse_date(tw.created_at),
        )
        for reply in ordered_replies:
            users[reply.user.screen_name] = reply.user
            entry['thread'].append({
                'id': str(reply.id),
                'user': reply.user.screen_name,
                'date': parse_date(reply.created_at),
                'text': remove_starting_mentions(remove_media(reply)).full_text,
                'media': [format_media(m) for m in (reply.media or [])],
            })

In [155]:
# All fetched statuses plus the collected replies, keyed by id; a reply
# overrides a fetched status with the same id.
data = dict(tweets)
data.update({r.id: r for r in replies})

In [164]:
# Dump every raw status (as a plain dict) sorted by creation date.
with gzip.open('data.json.gz', 'wt', encoding="ascii") as f:
    status_dicts = [status.AsDict() for status in data.values()]
    status_dicts.sort(key=lambda status: parse_date(status['created_at']))
    json.dump({'tweets': status_dicts}, f)

In [165]:
def dtconverter(o):
    """``default`` hook for ``json.dump``: serialise datetimes as strings.

    Parameters
    ----------
    o : object the JSON encoder could not serialise natively.

    Returns
    -------
    ``str(o)`` when ``o`` is a ``datetime.datetime``.

    Raises
    ------
    TypeError
        For any other type. Falling through (returning ``None``) would make
        the encoder write ``null`` silently in place of the unsupported value.
    """
    if isinstance(o, datetime.datetime):
        return str(o)
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')
 
# Write the public timeline file: the authors of thread replies plus every
# timeline entry ordered by date. Ids are written as strings; datetimes go
# through `dtconverter`.
with open('timeline.json', 'w') as f:
    users_payload = {
        user.screen_name: {
            'id': str(user.id),
            'screen_name': user.screen_name,
            'name': user.name,
            'avatar': user.profile_image_url_https,
            'description': user.description
        }
        for user in users.values()
    }

    timeline_payload = sorted(
        (
            {
                'date': entry['date'],
                'text': entry['text'],
                'media': entry['media'],
                'ids': [str(twid) for twid in entry['ids']],
                # Entries outside keynote/talk threads have no 'thread' key.
                'thread': entry.get('thread', [])
            }
            for entry in timeline.values()
        ),
        key=lambda item: item['date'],
    )

    json.dump(
        {'users': users_payload, 'timeline': timeline_payload},
        f,
        default=dtconverter
    )