In [5]:
import numpy as np
import pandas as pd
from collections import defaultdict

## Acquiring NeurIPS tweets, 2017-2020:

### Getting tweet URLs with snscrape:

In [2]:
%%bash
# snscrape --max-results 5000 --since 2020-05-27 twitter-search neurips > neurips_search_5000.txt
# Takes ~30 s on a fast connection. Only run if this is your first time through the notebook!
# May 27 is the first paper submission deadline. Because snscrape does not offer "until," I cannot
# use snscrape to timebox other years; hence I use a hashtag.

In [3]:
%%bash
# snscrape --max-results 5000 twitter-hashtag neurips2020 > neurips2020_tag_5000.txt

In [4]:
%%bash
# snscrape --max-results 5000 twitter-hashtag neurips2019 > neurips2019_tag_5000.txt
# snscrape --max-results 5000 twitter-hashtag neurips2018 > neurips2018_tag_5000.txt
# snscrape --max-results 5000 twitter-hashtag nips2018 > nips2018_tag_5000.txt
# snscrape --max-results 5000 twitter-hashtag nips2017 > nips2017_tag_5000.txt

Let's see who was most active on Twitter in 2019:

In [9]:
tweet_list = open("neurips2019_tag_5000.txt").readlines()

username_dict = defaultdict(int)
for line in tweet_list:
    username = line.replace("https://twitter.com/", "").split('/')[0]
    username_dict[username]+=1

user_df = pd.DataFrame([[k, v] for k, v in username_dict.items()], 
                       columns=['id', 'count'])

In [10]:
user_df.sort_values("count", ascending=False).head()

Unnamed: 0,id,count
544,bgalbraith,161
557,JFutoma,123
328,RahelJhirad,79
95,MeetVancouver,67
708,BioNLProc,45


In [11]:
tweet_url = pd.read_csv("neurips2019_tag_5000.txt", index_col= None, header = None, names = ["links"])
af = lambda x: x["links"].split("/")[-1]
tweet_url['id'] = tweet_url.apply(af, axis=1)
tweet_url.head()

Unnamed: 0,links,id
0,https://twitter.com/wolfgangb33r/status/132586...,1325867864264478721
1,https://twitter.com/PhRMA/status/1309601567646...,1309601567646060544
2,https://twitter.com/adityakusupati/status/1318...,1318751035779739649
3,https://twitter.com/ab_kimiya/status/131609629...,1316096294520266755
4,https://twitter.com/shinyML/status/13125621781...,1312562178143248386


### Retrieving actual tweets using tweepy and the Twitter API:

In [12]:
import tweepy

consumer_key = "FRVGuMgEuhJNQxcMUgLT3MMao" 
consumer_secret = "rFWAW4A4yys6jvPcGdBY4K5wNfFA9Iy2BIQMqr9yhS2I9tYxn8" 
access_token = "939281442621743104-MgHDnMelzUZHeh2Sui0nkJ1oTYfcWHy" 
access_token_secret = "sjQkY0dye7IPJB4MmkSRqatN49gA7n7bNpgdR7liUeLOf"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [40]:
ids = tweet_url['id'].tolist()
total_count = len(ids)
chunks = (total_count - 1) // 50 + 1

def fetch_tw(ids):
    list_of_tw_status = api.statuses_lookup(ids, tweet_mode= "extended")
    empty_data = pd.DataFrame()
    for status in list_of_tw_status:
        # print(status.id)
        tweet_elem = {"tweet_id": int(status.id),
                    "screen_name": status.user.screen_name,
                    "tweet":status.full_text,
                    "date":status.created_at}
        # print(tweet_elem["tweet_id"])
        empty_data = empty_data.append(tweet_elem, ignore_index = True)
    empty_data['tweet_id'] = empty_data['tweet_id'].astype(int)
    empty_data.to_csv("neurips2019_tweets.csv", mode="a")

# Only run once!

for i in range(chunks):
    batch = ids[i*50:(i+1)*50]
    result = fetch_tw(batch)

Let's move those tweets into Pandas for analysis:

In [48]:
df_2019 = pd.read_csv("neurips2019_tweets.csv", index_col=0).reset_index(drop = True)
df_2019['date'] = pd.to_datetime(df_2019['date'], infer_datetime_format=True, errors='coerce')
df_2019 = df_2019[df_2019.tweet_id != 'tweet_id'] # removes rows with missing tweet_id
df_2019['tweet_id'] = df_2019['tweet_id'].astype(int)
df_2019.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id
0,2020-06-15 18:04:04,goyucel,Dünya #YapayZeka #AI Yetenek Takibi olarak \n@...,1272590715344879616
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872


In [50]:
df_2019.dtypes

date           datetime64[ns]
screen_name            object
tweet                  object
tweet_id                int64
dtype: object

In [51]:
df_2019.shape

(5000, 4)

In [53]:
df_2019.to_pickle("./df_2019.pkl")