In [1]:
from datetime import datetime
import json
import os

import pandas as pd
import numpy as np
import requests
import plotly
import plotly.graph_objs as go
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from joblib import Parallel, delayed


os.environ["http_proxy"] = "http://192.168.199.10:11233"
os.environ["https_proxy"] = "http://192.168.199.10:11233"

%matplotlib inline
plotly.offline.init_notebook_mode(connected=True)

In [2]:
# Record the run timestamp, interpreter/platform details and the versions of
# the main packages so the outputs below can be tied to an environment.
%load_ext watermark
%watermark
%watermark -p plotly,tqdm,joblib,pandas,numpy,requests

2018-07-27T16:44:25+08:00

CPython 3.6.5
IPython 6.1.0

compiler   : GCC 7.2.0
system     : Linux
release    : 4.10.0-32-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
plotly 2.5.1
tqdm 4.19.8
joblib 0.11
pandas 0.22.0
numpy 1.14.3
requests 2.18.4


## Read in the JSON File

In [3]:
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("../tweets.json", encoding="utf-8") as f:
    tweets = json.load(f)
tweets[:2]

[{'date': '2018-06-21',
  'headline': 'Generative design',
  'no_conversation': False,
  'sub-headline': '',
  'tags': [],
  'tid': '1009921622801776640'},
 {'date': '2018-06-21',
  'headline': 'Visualization',
  'no_conversation': False,
  'sub-headline': 'ImageNet Class Hierarchy',
  'tags': ['dataviz'],
  'tid': '1009875555234140160'}]

In [4]:
# Convert the "YYYY-MM-DD" strings into datetime.date objects in place.
# The isinstance guard keeps the cell re-runnable: entries converted by a
# previous execution are left alone instead of raising a TypeError in strptime.
for t in tweets:
    if isinstance(t["date"], str):
        t["date"] = datetime.strptime(t["date"], "%Y-%m-%d").date()
tweets[:2]

[{'date': datetime.date(2018, 6, 21),
  'headline': 'Generative design',
  'no_conversation': False,
  'sub-headline': '',
  'tags': [],
  'tid': '1009921622801776640'},
 {'date': datetime.date(2018, 6, 21),
  'headline': 'Visualization',
  'no_conversation': False,
  'sub-headline': 'ImageNet Class Hierarchy',
  'tags': ['dataviz'],
  'tid': '1009875555234140160'}]

In [5]:
# Print the row count before and after dropping repeated tweet ids, then
# index the frame by tid.
tweets_frame = pd.DataFrame(tweets)
print(len(tweets_frame))
tweets_unique = tweets_frame.drop_duplicates("tid")
print(len(tweets_unique))
df_tweets = tweets_unique.set_index("tid")
df_tweets.head()

1425
1417


Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1009921622801776640,2018-06-21,Generative design,False,,[]
1009875555234140160,2018-06-21,Visualization,False,ImageNet Class Hierarchy,[dataviz]
1009865599344754688,2018-06-21,Visualization,False,What Makes People Happy,[dataviz]
1009935836345917441,2018-06-21,Notable Research,False,,[research]
1009791778264936448,2018-06-21,Notable Research,False,,[research]


In [6]:
# Peek at the tweets flagged with no_conversation.
is_no_conversation = df_tweets["no_conversation"].eq(True)
df_tweets[is_no_conversation].head()

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001224559595409409,2018-05-28,On Data,True,,[]
1001232275005095936,2018-05-28,On Data,True,,[]
1001232275697123328,2018-05-28,On Data,True,,[]
1001232276569522177,2018-05-28,On Data,True,,[]
1019087310196039685,2018-07-16,Miscellaneous,True,Science and Engineering,[misc]


## Assign Topic when Appropriate

### With Only Headline 

In [7]:
# Tweets carrying any of these tags are excluded from the headline-only set.
excluded_tags = {"misc", "learning", "tool", "dataviz", "research"}

no_sub_headline = df_tweets["sub-headline"].str.len() == 0
no_excluded_tag = df_tweets["tags"].apply(lambda tags: excluded_tags.isdisjoint(tags))
df_only_headline = df_tweets[no_sub_headline & no_excluded_tag]

# Count tweets per (headline, date) and show the groups with more than one tweet.
headline_date_count = df_only_headline.groupby(["headline", "date"]).size()
headline_date_count[headline_date_count > 1]

headline                                           date      
"truly-ism"                                        2018-06-02    2
#CraftyDataViz                                     2018-06-01    2
10k Layer Vanilla CNN                              2018-06-14    2
2018 Developer Survey                              2018-05-30    2
2nd Youtube 8M dataset competition                 2018-05-23    2
AI Tulips                                          2018-06-01    3
AI and Compute                                     2018-05-27    4
Academics Writing Code                             2018-07-15    2
Add AI generated sound to AI generated paintings   2018-05-21    3
All-star panel @NAACLHLT                           2018-06-05    2
Amazon Selling Face-recognition tech to Police     2018-05-22    3
AutoAugment                                        2018-06-04    3
Avoiding "Clique Culture"                          2018-07-03    3
Bernoulli data                                     2018-05-25    3


In [8]:
# Headline-only tweets tagged "nlp", counted per (headline, date).
has_nlp_tag = df_only_headline["tags"].apply(lambda tags: "nlp" in tags)
df_only_headline[has_nlp_tag].groupby(["headline", "date"]).size()

headline                                  date      
NLP                                       2018-05-21    2
NLP Decathlon                             2018-06-20    4
NLP News                                  2018-06-11    1
NLP Newsletter                            2018-06-25    1
NLP's ImageNet Moment                     2018-07-08    2
Pre-trained Transformer in PyTorch (NLP)  2018-06-14    1
dtype: int64

In [9]:
# Headline-only tweets tagged "rstats", counted per (headline, date).
has_rstats_tag = df_only_headline["tags"].apply(lambda tags: "rstats" in tags)
df_only_headline[has_rstats_tag].groupby(["headline", "date"]).size()

headline  date      
rstats    2018-06-20    3
dtype: int64

In [10]:
# Number of (headline, date) groups per headline; values above 1 mean the same
# headline was used on more than one date.
headline_unique_date_count = headline_date_count.groupby(level="headline").size()
headline_unique_date_count[headline_unique_date_count.gt(1)]

headline
Ethics       2
Notables     7
Resources    2
dtype: int64

In [11]:
# Headlines that repeat within a day but are excluded from parent assignment.
blacklist = {"NLP", "Notables", "rstats"}
repeated_headlines = headline_date_count[headline_date_count > 1]
headlines = list(set(repeated_headlines.index.get_level_values(0)).difference(blacklist))
for headline in headlines:
    # NOTE(review): this matches on headline alone, so tweets from other dates
    # or with a sub-headline are linked too -- confirm that is intended.
    tweet_idx = df_tweets.loc[df_tweets["headline"] == headline].index
    assert len(tweet_idx) > 1
    # The first matching tweet becomes the parent of all the others.
    df_tweets.loc[tweet_idx[1:], "parent_tid"] = tweet_idx[0]
df_tweets[df_tweets["parent_tid"].notnull()].head()

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags,parent_tid
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1009979751430270976,2018-06-21,Resources,False,,[],1009923776350474241
1009947919531855872,2018-06-21,Resources,False,,[],1009923776350474241
1009445101788114945,2018-06-21,Resources,False,Gensim Doc2Vec Bug Fixed,[],1009923776350474241
1009832281274322944,2018-06-21,Resources,False,Microsft Research Open Data,[research],1009923776350474241
1010037304642031616,2018-06-21,Miscellanous,False,,[],1009905083918712833


### Sub-headline

In [11]:
# Example of one headline whose tweets are split across several sub-headlines.
df_tweets.loc[df_tweets["headline"].eq("#UseR2018")]

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1017750890583711744,2018-07-12,#UseR2018,False,R for Psychological Science?,[]
1017579549435912194,2018-07-12,#UseR2018,False,Keynote,[]
1018377414852767744,2018-07-12,#UseR2018,False,Keynote,[]
1017842613641269250,2018-07-12,#UseR2018,False,Keynote,[]
1017518220645986305,2018-07-12,#UseR2018,False,fasster,[]


In [12]:
# Count tweets per (headline, sub-headline, date) among tweets that have a
# sub-headline, then list the sub-headlines of groups with more than one tweet.
has_sub_headline = df_tweets["sub-headline"].str.len() > 0
df_sub_headline = df_tweets[has_sub_headline]
sub_headline_date_count = df_sub_headline.groupby(["headline", "sub-headline", "date"]).size()
repeated_sub_headlines = sub_headline_date_count[sub_headline_date_count > 1]
repeated_sub_headlines.index.get_level_values(1)

Index(['Keynote', 'Bias', 'Gender', 'Crypto.ai', 'Ramen dishes',
       '"Director of AI"', 'Automatic Essay Grading', 'Autopsy of a DL Paper',
       'Ethics-related', 'GCP Slashing GPU Prices', 'Github and Open-source',
       'Lebron Memes', 'Privacy Issues', 'Python 3.7',
       'Science and Engineering', 'Scientists on Twitter', 'Thoughts',
       'Thoughts', 'Thoughts on Facial Recognition', '“Soft Skills”', 'AI',
       'Transfer Learning', 'OpenAI', 'Similarity Between NN Representations',
       'Causal Models', 'GANs', 'Fair Classification', 'Duckietown',
       'NIPS2018 Pommerman Competition', 'Reproducing papers',
       'Compare GAN Code', 'CoordConv', 'DARTS',
       'Differentiable Dynamic Programming', 'Dota (OpenAI)',
       'Evaluating Feature Importance Estimates',
       'Gradient Acceleration in Activation Functions',
       'Guided Evolutionary Strategies', 'Meseauring Abstract Reasoning',
       'ResNet with One-neuron Hidden Layers', 'SwitchNorm',
       'Synth

In [14]:
# Sub-headlines excluded from parent assignment.
blacklist = ["#rstats", "rstats", "Python", "Datasets", "GANs", "AI", "Causal Models", "Thoughts"]
sub_headline_level = sub_headline_date_count.index.get_level_values(1)
is_repeated = sub_headline_date_count > 1
not_blacklisted = ~sub_headline_level.isin(blacklist)
# Keep groups with more than one tweet whose sub-headline is not blacklisted.
sub_headline_date_count = sub_headline_date_count[is_repeated & not_blacklisted]
sub_headline_date_count

headline                 sub-headline                                   date      
#UseR2018                Keynote                                        2018-07-12    3
Ethics                   Bias                                           2018-06-13    3
                         Gender                                         2018-06-13    4
GAN                      Crypto.ai                                      2018-05-21    3
                         Ramen dishes                                   2018-05-21    2
Miscellaneous            "Director of AI"                               2018-05-29    3
                         Automatic Essay Grading                        2018-07-01    2
                         Autopsy of a DL Paper                          2018-07-16    4
                         Ethics-related                                 2018-06-27    6
                         GCP Slashing GPU Prices                        2018-06-18    2
                         Github and O

In [15]:
# For every remaining (headline, sub-headline) group, make the first matching
# tweet the parent of the others.
headlines = sub_headline_date_count.index.get_level_values(0).tolist()
sub_headlines = sub_headline_date_count.index.get_level_values(1).tolist()
# The loop variable is deliberately named `sub_headline` (singular): reusing
# `sub_headlines` would rebind the list above to a single string.
for headline, sub_headline in zip(headlines, sub_headlines):
    tweet_idx = df_tweets[
        (df_tweets.headline == headline) & (df_tweets["sub-headline"] == sub_headline)].index
    assert tweet_idx.shape[0] > 1
    df_tweets.loc[tweet_idx[1:], "parent_tid"] = tweet_idx[0]
df_tweets[~df_tweets.parent_tid.isnull() & (df_tweets["sub-headline"].str.len() > 0)].head()

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags,parent_tid
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1009853749446172672,2018-06-21,Notable Research,False,Similarity Between NN Representations,[research],1009848060963368964
1009445101788114945,2018-06-21,Resources,False,Gensim Doc2Vec Bug Fixed,[],1009923776350474241
1009832281274322944,2018-06-21,Resources,False,Microsft Research Open Data,[research],1009923776350474241
1009965708443291648,2018-06-21,Miscellanous,False,AI,[],1009905083918712833
1009869167850209280,2018-06-21,Miscellanous,False,AI,[],1009905083918712833


In [16]:
# Percentage of tweets that have been assigned a parent.
n_with_parent = len(df_tweets.loc[df_tweets["parent_tid"].notnull()])
n_with_parent / len(df_tweets) * 100

18.913196894848273

In [17]:
# Total number of tweets in the frame.
len(df_tweets)

1417

### Dump Results

In [18]:
# Save with tid as a regular column rather than as the index.
tweets_to_save = df_tweets.reset_index()
tweets_to_save.to_pickle("../tweets_extended.pkl")

## Visualize

In [19]:
# Bar colours: dates whose weekday() is above 4 (Saturday/Sunday) are
# highlighted in red, all other days are grey.
WEEKEND_COLOR = 'rgba(222,45,38,0.8)'
WEEKDAY_COLOR = 'rgba(204,204,204,1)'

tweets_by_date = df_tweets.groupby("date").size()
dates = tweets_by_date.index.tolist()
data = [
    go.Bar(
        x=dates,
        y=tweets_by_date.values,
        marker=dict(
            color=[WEEKEND_COLOR if d.weekday() > 4 else WEEKDAY_COLOR for d in dates]
        ),
        name='Tweets'
    )
]
layout = go.Layout(
    title='# of Collected Tweets per Day',
    autosize=False,
    width=900,
    height=300,
    # A plain dict replaces go.Margin, which newer plotly releases deprecate;
    # only the bottom and top margins are overridden.
    margin=dict(b=50, t=50)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

## Collect oembed

In [20]:
# oEmbed endpoint template; {tid} is filled in with a tweet id.
endpoint = "https://api.twitter.com/1.1/statuses/oembed.json?id={tid}&omit_script=0&maxwidth=500"
# Try the first tweet. The timeout stops a stalled connection (e.g. through
# the proxy) from blocking the kernel indefinitely.
res = requests.get(endpoint.format(tid=df_tweets.index[0]), timeout=30)
res

<Response [200]>

In [21]:
# Decode the JSON body. Note that this rebinds `res` from the Response object
# to the parsed dict, so the cell cannot be re-run without re-running the request.
res = res.json()
res

{'author_name': 'Nando de Freitas',
 'author_url': 'https://twitter.com/NandoDF',
 'cache_age': '3153600000',
 'height': None,
 'html': '<blockquote class="twitter-tweet" data-width="500"><p lang="en" dir="ltr">Generative design —- this could be promising, and certainly fun <a href="https://t.co/mFoUT1LA1P">https://t.co/mFoUT1LA1P</a></p>&mdash; Nando de Freitas (@NandoDF) <a href="https://twitter.com/NandoDF/status/1009921622801776640?ref_src=twsrc%5Etfw">June 21, 2018</a></blockquote>\n<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>\n',
 'provider_name': 'Twitter',
 'provider_url': 'https://twitter.com',
 'type': 'rich',
 'url': 'https://twitter.com/NandoDF/status/1009921622801776640',
 'version': '1.0',
 'width': 500}

In [22]:
# Keep only the inner content of the <blockquote>, dropping the wrapper tag
# and the trailing <script> element.
bs = BeautifulSoup(res["html"], "html.parser")
blockquote = bs.find("blockquote")
" ".join(str(child) for child in blockquote.children)

'<p dir="ltr" lang="en">Generative design —- this could be promising, and certainly fun <a href="https://t.co/mFoUT1LA1P">https://t.co/mFoUT1LA1P</a></p> — Nando de Freitas (@NandoDF)  <a href="https://twitter.com/NandoDF/status/1009921622801776640?ref_src=twsrc%5Etfw">June 21, 2018</a>'

In [23]:
def collect_oembed(tid, timeout=30):
    """Fetch a tweet's oEmbed HTML and return the inner content of its blockquote.

    Args:
        tid: Tweet id substituted into the module-level `endpoint` template.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The children of the ``<blockquote>`` element joined with single spaces,
        or ``None`` when the tweet could not be retrieved or parsed. Every
        failure is printed together with the tweet id.
    """
    try:
        res = requests.get(endpoint.format(tid=tid), timeout=timeout).json()
    except (requests.RequestException, ValueError) as err:
        # A network error or a non-JSON body for one tweet is reported and
        # skipped so it does not abort the whole batch.
        print(tid, repr(err))
        return None
    if "html" not in res:
        # The API answered with an error payload (e.g. the tweet was deleted).
        print(tid, res)
        return None
    blockquote = BeautifulSoup(res["html"], "html.parser").find("blockquote")
    if blockquote is None:
        # find() returns None when the element is missing; avoid an AttributeError.
        print(tid, "no <blockquote> in oembed html")
        return None
    return " ".join(str(child) for child in blockquote.children)

In [24]:
# Fetch the oEmbed HTML for every tweet using 8 parallel jobs; results come
# back in the same order as the index.
tids = df_tweets.index.tolist()
fetch_jobs = (delayed(collect_oembed)(tid) for tid in tqdm_notebook(tids))
oembeds = Parallel(n_jobs=8)(fetch_jobs)

HBox(children=(IntProgress(value=0, max=1417), HTML(value='')))

1006227429315076098 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
988796688402759683 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1012328835293929472 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1008697843979255808 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1012063547130597376 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1004142020363997185 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1003411204402864128 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1002890192372551685 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1014169647103475712 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1003820989443026944 {'errors': [{'message': 'Sorry, that page does not exist', 'code': 34}]}
1010048143403208704 {'errors': [{'message': 'Sorry, that page does not 

In [25]:
# Attach the fetched HTML (None where the fetch failed) and show a random row.
df_tweets = df_tweets.assign(oembed=oembeds)
df_tweets.sample()

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags,parent_tid,oembed
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1006916255926845440,2018-06-13,Ethics,False,Gender,[ethics],1006951012299608066,"<p dir=""ltr"" lang=""en"">this is by far the best..."


In [26]:
# Tweets for which no oEmbed HTML could be collected.
missing_oembed = df_tweets["oembed"].isnull()
df_tweets.loc[missing_oembed]

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags,parent_tid,oembed
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1006227429315076098,2018-06-11,Tutorials and Resources,False,,"[learning, tutorial]",,
988796688402759683,2018-05-27,Miscellaneous,False,,[misc],,
1012328835293929472,2018-06-28,Miscellaneous,False,,[misc],,
1008697843979255808,2018-06-18,Miscellaneous,False,,[misc],,
1012063547130597376,2018-06-26,Resources,False,[dataviz] Slopegraph,[dataviz],1.0116323228720742e+18,
1004142020363997185,2018-06-05,[ethics] Facebook Sharing Data,False,,[ethics],1.0041802312997806e+18,
1003411204402864128,2018-06-03,Microsoft Buys Github,False,,[],4.308712678816727e+17,
1002890192372551685,2018-06-02,Tutorials and Resources,False,,"[learning, tutorial]",,
1014169647103475712,2018-07-04,Research,False,Adversarial Reprogramming of NN,[research],,
1003820989443026944,2018-06-04,Miscellaneous,False,,[misc],,


In [27]:
## Dump Results
# tid is turned back into a regular column before pickling.
tweets_with_oembed = df_tweets.reset_index()
tweets_with_oembed.to_pickle("../tweets_extended.pkl")

## Collect More Information

In [39]:
# Reload the saved frame and restore tid as the index.
tweets_saved = pd.read_pickle("../tweets_extended.pkl")
df_tweets = tweets_saved.set_index("tid")

In [46]:
# Import the four credential names explicitly instead of `import *`, so it is
# clear where they come from and nothing else leaks into the notebook namespace.
from credentials import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
import twitter

# Use the SOCKS proxy only when the LOCAL environment variable is set.
# Any non-empty value (including "0") counts as set.
if os.environ.get("LOCAL", False):
    PROXIES = dict(http='socks5://192.168.199.10:12133',
                   https='socks5://192.168.199.10:12133')
else:
    PROXIES = None

# Twitter API client that requests extended tweets and sleeps when rate-limited.
API = twitter.Api(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    access_token_key=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    tweet_mode="extended",
    proxies=PROXIES,
    sleep_on_rate_limit=True
)

In [47]:
# Initialise the client's rate-limit information before querying it below.
API.InitializeRateLimit()

In [48]:
# Show limit / remaining / reset for the single-status lookup endpoint.
API.rate_limit.get_limit("/statuses/show/:id")

EndpointRateLimit(limit=900, remaining=616, reset=1532361210)

In [49]:
# Fetch the first tweet's status; the following cells inspect its fields.
tmp = API.GetStatus(df_tweets.index[0])

In [25]:
# Text of the sample status.
tmp.full_text

'Generative design —- this could be promising, and certainly fun https://t.co/mFoUT1LA1P'

In [26]:
# Creation time of the sample status as an integer (presumably Unix epoch seconds).
tmp.created_at_in_seconds

1529619052

In [27]:
# Author of the sample status (user id and screen name).
tmp.user

User(ID=29843511, ScreenName=NandoDF)

In [53]:
# Create the target columns up front, in a fixed order.
# The text/id columns start as object dtype: letting `.loc` create
# `reply_to_tid` on the fly yields a float64 column, and 64-bit tweet ids
# do not survive a round-trip through float64.
for col, fill_value in (("author", None), ("timestamp", np.nan),
                        ("reply_to_tid", None), ("reply_to_sname", None)):
    if col not in df_tweets.columns:
        df_tweets[col] = fill_value

oembed_null = df_tweets.oembed.isnull()
author_null = df_tweets.author.isnull()

for i in tqdm_notebook(df_tweets.index.tolist()):
    # Skip tweets whose oEmbed lookup failed, and tweets already filled in by
    # an earlier (possibly interrupted) run of this cell.
    if oembed_null[i] or not author_null[i]:
        continue
    status = API.GetStatus(i)
    reply_to_tid = status.in_reply_to_status_id
    df_tweets.loc[i, "author"] = status.user.screen_name
    df_tweets.loc[i, "timestamp"] = status.created_at_in_seconds
    # Store the id as a string so every digit is kept.
    df_tweets.loc[i, "reply_to_tid"] = None if reply_to_tid is None else str(reply_to_tid)
    df_tweets.loc[i, "reply_to_sname"] = status.in_reply_to_screen_name

HBox(children=(IntProgress(value=0, max=1417), HTML(value='')))




Exception in thread Thread-9:
Traceback (most recent call last):
  File "/home/ceshine/miniconda3/envs/deep/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/ceshine/miniconda3/envs/deep/lib/python3.6/site-packages/tqdm/_monitor.py", line 63, in run
    for instance in self.tqdm_cls._instances:
  File "/home/ceshine/miniconda3/envs/deep/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [54]:
# Print the number of distinct tids and display the row count; the two should match.
n_unique_tids = len(df_tweets.reset_index().drop_duplicates("tid"))
print(n_unique_tids)
len(df_tweets)

1417


1417

In [56]:
# Display ten random rows to check the newly added columns.
df_tweets.sample(n=10)

Unnamed: 0_level_0,date,headline,no_conversation,sub-headline,tags,parent_tid,oembed,author,timestamp,reply_to_tid,reply_to_sname
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1007900787140366336,2018-06-16,Miscellaneous,False,,[misc],,"<p dir=""ltr"" lang=""en"">Artificial intelligence...",NandoDF,1529137000.0,,
1000741704457072641,2018-05-30,Notables,False,,[],,"<p dir=""ltr"" lang=""en"">This paper by <a href=""...",RogerGrosse,1527430000.0,,
999699350669352965,2018-05-25,Notables,False,,[],,"<p dir=""ltr"" lang=""en"">My paper on stochastic ...",cboettig,1527182000.0,,
1003692923106549760,2018-06-04,Overfitting?,False,,[],1.0035351673639116e+18,"<p dir=""ltr"" lang=""en"">“Our sense of progress ...",hardmaru,1528134000.0,,
1002259952336064512,2018-05-31,Data Visualization,False,,[dataviz],,"<p dir=""ltr"" lang=""en"">ICYMI, 📊short-course &a...",dataandme,1527792000.0,,
998564220341022722,2018-05-21,Visualization,False,,[dataviz],,"<p dir=""ltr"" lang=""en"">Beautiful and Powerful ...",Rbloggers,1526911000.0,,
1000488820372996096,2018-05-26,Miscellaneous,False,,[misc],,"<p dir=""ltr"" lang=""en"">not a huge fan of the t...",karpathy,1527370000.0,,
1007755817435553792,2018-06-17,Miscellaneous,False,,[misc],,"<p dir=""ltr"" lang=""en"">When I first began lear...",pranaygp,1529103000.0,,
1006238450213883905,2018-06-11,Miscellaneous,False,,[misc],,"<p dir=""ltr"" lang=""en"">'Luck is statistics tak...",DataSciFact,1528741000.0,,
1012971830947274752,2018-06-30,Tutorials / Reviews,False,Explaining Model Prediction using SHAP,"[review, learning, tutorial]",,"<p dir=""ltr"" lang=""en"">How to tell what your t...",radekosmulski,1530346000.0,,


In [57]:
## Dump Results
# Write the enriched frame, with tid as a regular column, over the earlier pickle.
tweets_enriched = df_tweets.reset_index()
tweets_enriched.to_pickle("../tweets_extended.pkl")