In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
df_2019 = pd.read_pickle("./df_2019.pkl")

In [3]:
df_2019.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id
0,2020-06-15 18:04:04,goyucel,Dünya #YapayZeka #AI Yetenek Takibi olarak \n@...,1272590715344879616
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872


In [4]:
from langdetect import detect

# Reduce to only English tweets:
df_2019['lang'] = df_2019['tweet'].apply(lambda row: detect(row))
df_2019 = df_2019[df_2019['lang']=='en']
df_2019.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id,lang
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384,en
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872,en
3,2020-06-30 14:14:21,arXiv__ml,RT @Xingyu2017: How should we combine multiple...,1277968719969259520,en


## Text Cleaning:

### Option 1: Clean with regex and fit with NLPpipe

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect
import pickle
from NLPPipe import NLPPipe

In [16]:
import re
import string

# Look for all unicode emoji ranges:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

# Set other cleaning lambdas, in order:
breaks = lambda x: re.sub("\n", " ", x)
links = lambda x: re.sub(r"http\S+", "", x)
hashtag = lambda x: re.sub(r'\B#\w*[a-zA-Z]+\w*','', x)
alphanumeric = lambda x: re.sub("\w*\d\w*", "", x)
punc_lower = lambda x: re.sub("[%s]" % re.escape(string.punctuation), "", x.lower())

df_2019["cleaned_tweet"] = (df_2019["tweet"].apply(breaks)
                                            .apply(links)
                                            .apply(hashtag)
                                            .apply(alphanumeric)
                                            .apply(punc_lower)
                                            .apply(deEmojify)
                            )
df_2019.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id,lang,cleaned_tweet
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384,en,rl for chip design everyone’s fave game
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872,en,analysis of papers by themes —————————— ...
3,2020-06-30 14:14:21,arXiv__ml,RT @Xingyu2017: How should we combine multiple...,1277968719969259520,en,rt how should we combine multiple auxiliary t...


In [17]:
df_2019.to_pickle("./2019_cleaned.pkl")

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from sklearn.decomposition import NMF

def clean_text(text, tokenizer, stemmer):    
    cleaned_text = []
    for post in text:
        post = re.sub("\n", " ", post) # remove newlines
        post = re.sub(r"http\S+", "", post) # remove links
        post = re.sub(r'\B#\w*[a-zA-Z]+\w*','', post) # remove hashtags
        post = re.sub("\w*\d\w*", "", post)
        post = re.sub("[%s]" % re.escape(string.punctuation), "", post.lower())
        cleaned_words = []
        for word in tokenizer(post):
            low_word = word.lower()
            if stemmer:
                low_word = stemmer.stem(low_word)
            cleaned_words.append(low_word)
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text

nlp = NLPPipe(vectorizer=CountVectorizer(), 
              cleaning_function=clean_text,
              tokenizer=TreebankWordTokenizer().tokenize, 
              stemmer=PorterStemmer())

corpus = df_2019['tweet'].tolist()
nlp.fit(corpus)
nlp.transform(corpus).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
nlp.vectorizer.vocabulary_

{'rl': 6926,
 'for': 3053,
 'chip': 1413,
 'design': 2160,
 'everyone': 2740,
 'fave': 2891,
 'game': 3191,
 'analysi': 399,
 'of': 5766,
 'paper': 6007,
 'by': 1183,
 'theme': 8095,
 'rt': 6997,
 'how': 3739,
 'should': 7349,
 'we': 8785,
 'combin': 1589,
 'multipl': 5412,
 'auxiliari': 710,
 'task': 7979,
 'to': 8208,
 'acceler': 58,
 'check': 1378,
 'out': 5914,
 'our': 5912,
 'that': 8077,
 'provid': 6479,
 'principl': 6398,
 'method': 5124,
 'in': 3920,
 'thi': 8119,
 'direct': 2239,
 'code': 1538,
 'davheld': 2012,
 'harjatin': 3533,
 'min': 5193,
 'video': 8667,
 'about': 34,
 'dichotom': 2207,
 'and': 409,
 'gener': 3237,
 'pacbayesian': 5976,
 'binari': 944,
 'activ': 90,
 'deep': 2074,
 'neural': 5542,
 'network': 5536,
 'with': 8908,
 'gaël': 3221,
 'letart': 4658,
 'mlpager': 5283,
 'françoi': 3102,
 'laviolett': 4580,
 'inrialil': 4002,
 'aiucl': 233,
 'universitelav': 8490,
 'bibliometr': 930,
 'top': 8239,
 'most': 5363,
 'cite': 1458,
 'as': 591,
 'today': 8211,
 'pytor

I thought about lemmatizing here, but we've already tokenized, so let's move on.

In [9]:
stop_words = ENGLISH_STOP_WORDS.union(['neruips','neurips2019'])
tfidf = TfidfVectorizer(stop_words=stop_words)
tweet_word_matrix = tfidf.fit_transform(df_2019['cleaned_tweet'])

vocab = tfidf.get_feature_names()
vocab

['aa',
 'aaaaaaand',
 'aaaaand',
 'aaai',
 'aabhas',
 'aaltopml',
 'aaltouniversity',
 'aampm',
 'aanalysis',
 'aanet',
 'aaron',
 'aaroth',
 'aaroths',
 'aarón',
 'aaspuruguzik',
 'aaspuruguziks',
 'ab',
 'abadie',
 'abarbu',
 'abaykov',
 'abbati',
 'abbeel',
 'abbreviation',
 'abd',
 'abdullah',
 'abe',
 'abeba',
 'abebab',
 'abedi',
 'abel',
 'abhinaha',
 'abhinav',
 'abilities',
 'ability',
 'able',
 'abnormalities',
 'aboard',
 'aboutml',
 'abrahamai',
 'abrinc',
 'abs',
 'absolute',
 'absolutely',
 'absorb',
 'absorptive',
 'abstract',
 'abstraction',
 'abstractive',
 'abstracts',
 'absurd',
 'abt',
 'abtec',
 'abtin',
 'abuja',
 'abujaai',
 'abundance',
 'abuse',
 'abusers',
 'abut',
 'academia',
 'academic',
 'academicchatter',
 'academics',
 'academy',
 'acarderera',
 'accelerate',
 'accelerated',
 'accelerating',
 'acceleration',
 'accelerator',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accesible',
 'access',
 'accessed',
 'accessibility',
 'accessibl

In [15]:
nlp.__dict__.keys()

dict_keys(['stemmer', 'tokenizer', 'model', 'cleaning_function', 'vectorizer', '_is_fit'])

### NMF

In [125]:
nmf = NMF(n_components=5)
nmf.fit(tweet_word_matrix)

NMF(n_components=5)

#### Tweet/Topic Matrix

In [None]:
tweet_topic_matrix = nmf.transform(tweet_word_matrix)
tweet_topic_matrix_df = pd.DataFrame(tweet_topic_matrix).add_prefix('topic_')

tweet_topic_matrix_df[['raw_tweets', 'clean_tweets']] = df_2019[['tweet', 'cleaned_tweet']]
tweet_topic_matrix_df.head()

tweet_topic_matrix_df.to_pickle("./2019_tweet_topic.pkl")

#### Word/Topic Matrix

In [132]:
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
aa,0.000402,0.0,0.0,0.000511,0.0
aaaaaaand,0.0,0.0,0.0,0.000282,0.0
aaaaand,0.0,0.0,0.0,0.001592,0.0
aaai,0.000404,0.0,0.0,0.002505,0.0
aabhas,0.001047,0.004558,0.0,0.0,0.0


## Topic Interpretation

In [133]:
for tweet in tweet_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)['raw_tweets'].values:
    print(tweet)
    print()

word_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(10)

Towards Compositionality in Deep Reinforcement Learning https://t.co/KT6kafVdOC #ai #machinelearning #artificialintelligence #NeurIPS2019 #Spotlight via @NandoDF

We’ll be presenting our work “Deep Active Learning with a Neural Architecture Search” at #NeurIPS2019 on Thursday 10:45. https://t.co/9TtAwEXZnq

nan

Qualcomm #AI Research is at #NeurIPS2019 in booth 110. Drop by to meet our researchers, see our demos, and learn more about the breakthrough research we’re presenting this year. https://t.co/zLMP5YpBNh https://t.co/L18sUCIAVK

@alex_ander @nicholdav My student Iris has a poster at #NeurIPS2019 covering this analysis (also w/Eric Shea-Brown).  We look at two questions.  3/n

1. Prime GPT-2 with a list of #NeurIPS2019 poster titles.
2. Use it to complete Sesame Street episode titles.
Results: https://t.co/3AGXQSs8Bn https://t.co/QhobwsXil5

“We named it the Electome because, frankly, we always wanted to name something” @KafuiDzirasa digging into the analysis of the depression-ind

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4
poster,1.824167,0.0,0.0,0.0,0.0
session,0.895198,0.0,0.0,0.0,0.0
come,0.660934,0.0,0.0,0.081317,0.0
hall,0.58728,0.003009,0.0,0.0,0.0
today,0.519363,0.0,0.0,0.166805,0.0
exhibition,0.518377,0.0,0.0,0.0,0.001343
east,0.509994,0.0,0.0,0.0,0.007671
presenting,0.40381,0.030699,0.008169,0.141795,0.0
work,0.350358,0.028223,0.0,0.301514,0.0
pm,0.331109,0.001417,0.000734,0.0,0.0


In [134]:
def top_tweets(tweet_topic_matrix_df, topic, n_tweets):
    return (tweet_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_tweets)['raw_tweets']
            .values)

def top_words(word_topic_matrix_df, topic, n_words):
    return (word_topic_matrix_df
            .sort_values(by=topic, ascending=False)
            .head(n_words))[topic]

In [135]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_1', 5):
    print(tweet)
    print()

Official NeurIPS 2019 Paper Awards まとめ
https://t.co/gt6UuYNLVo?
#NeurIPS2019

#NeurIPS2019 via NodeXL https://t.co/DJaLPOimkE
@celestekidd
@neuripsconf
@math_rachel
@ceobillionaire
@msftresearch
@wimlworkshop
@googleai
@fmfrancoise
@facebookai
@pytorch

Top hashtags:
#neurips2019
#ai
#neurips
#machinelearning
#wiml2019
#ml
#latinxinai

New research from #intelAI illuminates how speech patterns are processed inside neural networks based on theoretical research by @s_y_chung. Learn more in this paper presented at #NeurIPS2019: https://t.co/Fw9gQbZTT3 https://t.co/d8OfqAksJz

getting ready for the #ml4health workshop at #NeurIPS2019 (west ballroom A!) https://t.co/2PkIzDNVZD

Packed.
We are underway at the #London Machine Learning NeurIPS meetup #NeurIPS2019 #ReinforcementLearning 
Thanks to @GRESEARCHjobs for sponsoring and helping make this event possible #MachineLearning https://t.co/aFXYuxwsj2



Topic 1 seems to refer to Yoshua Bengio's focus on Reinforcement Learning.

In [136]:
top_words(word_topic_matrix_df, 'topic_1', 10)

learning         1.405254
deep             1.042448
yoshua           0.484860
bengio           0.464138
machine          0.404232
bayesian         0.238597
reinforcement    0.193185
workshop         0.172264
talk             0.117270
tutorial         0.100932
Name: topic_1, dtype: float64

In [137]:
for tweet in top_tweets(tweet_topic_matrix_df, 'topic_2', 5):
    print(tweet)
    print()

Trouble with covariance in variational inference for Bayesian neural networks? Try depth instead!

Hear our talk at Bayesian Deep Learning workshop @ 2:15 Friday in West Exhibition Hall C with coauthors @yaringal and Lewis Smith. @OATML_Oxford #NeurIPS2019 https://t.co/fWWobwB3Fk https://t.co/Sug1IrepW6

If you look carefully, you can see @jerrychou82, the whiteboard poster guy, in the art department! #NeurIPS2019 https://t.co/GV7ySm4uX3

If you are at #NeurIPS2019 and curious to know the relation between distributional reinforcement learning and Fourier space, come to my poster today (Tue) at 10:45 AM, East Exhibition Hall B + C, poster No. 207.

#NeurIPS2019 Top participating universities: @mcgillu, @KTHuniversity, @BrownCSDept, @iitroorkee

Found a nice place to do some work at #NeurIPS2019 and saw this attractive fixed point in the water... someone needs to rescale their data before using a normalizing flow... https://t.co/AZWOy4NBxh



In [138]:
top_words(word_topic_matrix_df, 'topic_2', 10)

trials         0.829937
rigorous       0.829617
clinical       0.805712
stakes         0.757064
best           0.715294
big            0.696425
need           0.674251
researchers    0.658735
vaccine        0.144281
tens           0.144281
Name: topic_2, dtype: float64

This topic is a discussion of the mathematical rigor of NeurIPS submissions.

### Option 2: Clean with spaCy

In [139]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [146]:
spacy_tweet = df_2019.iloc[10]['tweet']
doc = nlp(spacy_tweet)

In [147]:
for token in doc:
    print(token.text, token.pos_, token.lemma_, token.is_stop)

Tomorrow NOUN tomorrow False
is AUX be True
the DET the True
day NOUN day False
! PUNCT ! False
Join PROPN Join False
Hyperscience PROPN Hyperscience False
's PART 's True
own ADJ own True
Daniel PROPN Daniel False
Balchev PROPN Balchev False
for ADP for True
an DET an True
# SYM # False
AIClub PROPN AIClub False
Bulgaria PROPN Bulgaria False
webinar NOUN webinar False
on ADP on True
# SYM # False
ML PROPN ML False
as SCONJ as True
a DET a True
maturing VERB mature False
tech NOUN tech False
. PUNCT . False
Discover VERB discover False
ideas NOUN idea False
& CCONJ & False
amp NOUN amp False
; PUNCT ; False
conference NOUN conference False
articles NOUN article False
from ADP from True
# SYM # False
NeurIPS2019 ADV neurips2019 False
related VERB relate False
to ADP to True
improving VERB improve False
the DET the True
reproducibility NOUN reproducibility False
of ADP of True
ML PROPN ML False
articles NOUN article False
in ADP in True
the DET the True
field NOUN field False
. PUNCT . F

In [148]:
doc.text

"Tomorrow is the day! Join Hyperscience's own Daniel Balchev for an #AIClub Bulgaria webinar on #ML as a maturing tech. Discover ideas &amp; conference articles from #NeurIPS2019 related to improving the reproducibility of ML articles in the field. Learn more https://t.co/xohtbkm5BI https://t.co/uLDInTta93"

In [149]:
for sent in doc.sents:
    print(sent)

Tomorrow is the day!
Join Hyperscience's own Daniel Balchev for an #AIClub Bulgaria webinar on #ML as a maturing tech.
Discover ideas &amp; conference articles from #NeurIPS2019 related to improving the reproducibility of ML articles in the field.
Learn more https://t.co/xohtbkm5BI https://t.co/uLDInTta93


In [150]:
from spacy import displacy

In [151]:
displacy.render(doc, style='dep', options={'distance': 80})

In [152]:
df_2019['spacy_tweet'] = list(nlp.pipe(df_2019.tweet))

In [156]:
adj = [token.text.lower() for doc in df_2019.spacy_tweet for token in doc if token.pos_=='ADJ']
noun = [token.text.lower() for doc in df_2019.spacy_tweet for token in doc if token.pos_=='NOUN']

In [157]:
from collections import Counter

In [155]:
Counter(adj).most_common(10)

[('neurips2019', 2491),
 ('great', 310),
 ('more', 251),
 ('deep', 195),
 ('new', 166),
 ('#', 165),
 ('many', 137),
 ('neural', 112),
 ('best', 110),
 ('first', 110)]

In [158]:
Counter(noun).most_common(10)

[('#', 849),
 ('neurips2019', 785),
 ('poster', 540),
 ('workshop', 534),
 ('work', 440),
 ('paper', 379),
 ('talk', 376),
 ('learning', 361),
 ('today', 300),
 ('research', 255)]

In [None]:
spacy.displacy.render(doc, style='ent')