In [None]:
!pip install Mastodon.py bs4 scikit-learn plotly

# 1. Register the Mastodon app

The following cell has to be run only once to register the Mastodon app (you can keep it commented out afterwards). The parameters passed to the `create_app` function are:

- the name of your app ("mastimeline" in this case)
- `api_base_url` (the URL of the mastodon server where you have your account)
- the file where you want to store the app credentials

In [None]:
# One-time setup: uncomment the lines below and run this cell once to register
# the app and store its credentials in 'mastimeline_clientcred.secret', then
# comment them out again.
# from mastodon import Mastodon


# Mastodon.create_app(
#      'mastimeline',
#      api_base_url = 'https://fosstodon.org',
#      to_file = 'mastimeline_clientcred.secret'
# )


# 2. Log in

The code in the following cell is used to log into your Mastodon server:

- `client_id` is the name of the file containing the credentials that were generated on the previous step
- login and password are loaded from the file `mastimeline_auth.secret`, a simple plaintext file holding login (email) in the first line and password in the second one
- finally, the client logs in and stores the access token into the file `mastimeline_usercred.secret`

In [None]:
from mastodon import Mastodon

# Client bound to the app credentials written by the registration step.
mastodon = Mastodon(
    client_id='mastimeline_clientcred.secret',
)

# The auth file holds the login (email) on its first line and the password on
# its second one. `splitlines()` copes with a trailing newline (which most
# editors add) and with Windows line endings; a plain split("\n") would yield
# an extra empty element and make the two-name unpacking raise ValueError.
with open("mastimeline_auth.secret", "rt") as f:
    login, pw = f.read().splitlines()[:2]

# Log in and store the resulting access token in 'mastimeline_usercred.secret'.
mastodon.log_in(
    login,
    pw,
    to_file='mastimeline_usercred.secret',
)

# 3. Create an actual API instance

The following code instantiates a Mastodon API client using the user credentials generated in the previous step. The commented line posts a toot from your account, feel free to uncomment it for testing purposes :-)

In [None]:
# Build the API client from the user access token stored in
# 'mastimeline_usercred.secret'.
mastodon = Mastodon(access_token='mastimeline_usercred.secret')

# Optional smoke test: uncomment to post a toot from your account.
# mastodon.toot('Tooting from Python using #mastodonpy, mwahahahah! >:-)')

# 4. Get the reverse-chronological ("revchron") home timeline

In [None]:
# Fetch the first page of the home timeline (it contains the most recent toots).
tl = mastodon.timeline_home()

# 4.1 Print toots' contents

As toots' content is HTML, the following code uses BeautifulSoup to decode it and print it as plain text. Note that in the case of boosts the `toot.content` field will be empty.

In [None]:
from bs4 import BeautifulSoup

# Print the plain-text content of the toots in the timeline.
for toot in tl:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one machine to another.
    soup = BeautifulSoup(toot.content, "html.parser")
    text = soup.get_text()
    # Some toots look empty because the "content" field does not work for
    # boosts; skip those.
    if text != "":
        print(text)

In [None]:
# Print the content - but better :-)
# The content of a boost appears in `toot.reblog.content`, so for boosts both
# the id and the content are read from the boosted toot.
for toot in tl:
    source = toot.reblog if toot.reblog else toot
    # `toot_id` rather than `id`, so the builtin id() is not shadowed in the
    # notebook namespace.
    toot_id = source.id
    # Explicit parser: keeps the output independent of which optional HTML
    # parsers are installed and avoids BeautifulSoup's parser warning.
    soup = BeautifulSoup(source.content, "html.parser")
    print(f"{toot_id}: {soup.get_text()}")

# 5. Understand pagination

The `tl` object has two attributes, `_pagination_prev` and `_pagination_next`, which provide pagination information:
- `_pagination_prev` has a `min_id` value, which marks the newest end of the current data chunk (its largest toot id) and is used to ask for newer toots
- `_pagination_next` has a `max_id` value, which marks the oldest end of the current data chunk (its smallest toot id) and is used to ask for older toots

Starting from the current `tl` content (which contains the most recent toots), we can paginate back in time by taking the current `max_id` (from `_pagination_next`) and asking for the data chunk that ends right before it.

In [None]:
# Display the `_pagination_prev` info attached to the current timeline page.
tl._pagination_prev

In [None]:
# Display the `_pagination_next` info attached to the current timeline page.
tl._pagination_next

In [None]:
# Download the home timeline by walking backwards in time, one page per
# request, until the server returns an empty page or no further page exists.
tl = mastodon.timeline_home()
# Accumulate into a separate list so the first page object is not mutated.
data = list(tl)
i = 0
while len(tl) > 0:
    # A page may carry no "next" pagination info at all (presumably the last
    # one). Without a `max_id` the request below would be the same as the very
    # first one and the loop would never end, so stop here instead.
    max_id = (getattr(tl, "_pagination_next", None) or {}).get('max_id')
    if max_id is None:
        break
    i += 1
    print(f"{i}: {max_id}")
    tl = mastodon.timeline_home(max_id=max_id)
    data.extend(tl)

# 6. Analyze downloaded data

In [None]:
# Collect the id of every downloaded toot and report how many there are.
toot_ids = [toot['id'] for toot in data]
print(f"I downloaded {len(toot_ids)} toots")

In [None]:
# Parallel lists: ids[k] is the id of the toot whose plain text is text[k].
ids = []
text = []

for toot in data:
    # For boosts, id and content are taken from the boosted toot
    # (`toot.reblog`); otherwise from the toot itself.
    source = toot.reblog if toot.reblog else toot
    ids.append(source.id)
    # Explicit parser: keeps the extracted text independent of which optional
    # HTML parsers are installed and avoids BeautifulSoup's parser warning.
    text.append(BeautifulSoup(source.content, "html.parser").get_text())

In [None]:
# Peek at the first five extracted texts.
text[:5]

## 6.1 Calculate text embeddings

In [None]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import expit

    
# Hugging Face model id, shared by the tokenizer and the classifier.
MODEL = "cardiffnlp/tweet-topic-21-multi"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PyTorch model. Hidden states are requested so that `output.hidden_states`
# is available to the cells that build mean embeddings.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    output_hidden_states=True,
)

# Lookup from class index to its label, taken from the model config.
class_mapping = model.config.id2label


In [None]:
import time

import torch

descr = []    # raw model scores, one row per toot
expits = []   # the same scores mapped through expit (logistic sigmoid)
lbls = []     # index of the highest-scoring class of each toot

tt = time.time()
# Inference only: no_grad() avoids building an autograd graph for every toot.
with torch.no_grad():
    # Count from 1 inside this cell instead of relying on an `i` left over
    # from an earlier cell (which fails on a fresh kernel and skews the dots).
    for i, t in enumerate(text, start=1):
        # progress indicator: one dot every 10 toots
        if not (i % 10):
            print(".", end="")
        # Truncate over-long toots so they cannot exceed the model's input
        # limit (assumes the usual 512-token limit of RoBERTa-base models —
        # TODO confirm for MODEL).
        tokens = tokenizer(t, return_tensors='pt', truncation=True, max_length=512)
        output = model(**tokens)

        scores = output[0][0].detach().numpy()
        descr.append(scores)
        scores = expit(scores)
        expits.append(scores)

        lbls.append(np.argmax(scores))


descr = np.array(descr)
lbls = np.array(lbls)
expits = np.array(expits)

# this is the time it takes to calculate 800 embeddings on my
# 4-cores 2016 Macbook Pro... 3' is not great, but maybe will
# be much smaller on recent hardware (also note that if this
# works nicely we can look for other lighter models too!)
print(time.time() - tt)

## 6.2 Use KDTree to calculate K-nearest neighbors of a given status

In [None]:
from scipy import spatial

# KD-tree over the raw class scores, used for nearest-neighbour lookups.
tree = spatial.KDTree(descr)

# Print the 5 nearest neighbours of descr[10]. query() returns a
# (distances, indices) pair; only the indices are needed here.
distances, idx = tree.query(descr[10], k=5)
for i in idx:
    print(f"{ids[i]}:{text[i]}")

## 6.3 Use TSNE to plot the statuses in a 2D space

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# note you can play with the perplexity parameter to have more or less crisp clusters
# (smaller values of perplexity tends to have tighter, more sparse clusters, while
# larger values return larger, more globular and possibly overlapping ones)
#
# t-SNE requires the perplexity to be smaller than the number of samples, so
# the default of 30 is capped for short timelines.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30, len(descr) - 1),
)
projections = tsne.fit_transform(descr)

fig = px.scatter(
    projections, x=0, y=1,
    # when hovering, the matching class is shown
    hover_name=[class_mapping[lbl] for lbl in lbls],
    color=lbls,
    # a title lets this figure be told apart from the other projections
    title="t-SNE projection of the raw class scores (descr)",
)
fig.show()


Note that what we plotted above are the descriptors *before* they are mapped with the `scipy.special.expit` function.
For this reason, we also try plotting `expits` directly to see whether that plot is more (or less) meaningful to us.

In [None]:
# Same projection as for `descr`, but on the expit-mapped scores. This reuses
# the `tsne` estimator created in the previous plotting cell.
projections = tsne.fit_transform(np.asarray(expits))

fig = px.scatter(
    projections, x=0, y=1,
    # when hovering, the matching class is shown
    hover_name=[class_mapping[lbl] for lbl in lbls],
    color=lbls,
    # a title lets this figure be told apart from the plot of the raw scores
    title="t-SNE projection of the expit-mapped class scores (expits)",
)
fig.show()

In [None]:
### look at the difference between the plain descriptor and its expit-mapped
### version for one status (index 10)
print(descr[10])
print(expits[10])

# Tests with mean embeddings

We now try to build an embedding for the whole sentence which is the mean of
the embeddings calculated for each token.
Note that other pre-trained sentence-based approaches (e.g. SBERT https://www.sbert.net/) are available and probably way better than this :-) 

In [None]:
# Worked example on a single sentence: build one vector for it by averaging
# the embeddings calculated for its tokens.
t = "Hello world"
tokens = tokenizer(t, return_tensors='pt')

### NOTE: calling the "roberta" submodule directly (commented out below) can
### be skipped in favor of enabling hidden states on the model and taking
### the last set of embeddings (hidden_states[12])
# bebe = model.get_submodule("roberta")
# encoder_outputs = bebe(**tokens)
# sequence_output = encoder_outputs[0]
# sequence_output

output = model(**tokens)
# output.hidden_states[-1][:,0].shape
last_hidden = output.hidden_states[-1]
# Average over axis 1 (presumably the token axis — TODO confirm), then keep
# the first entry, since only one sentence was tokenized.
mean_embedding = last_hidden.mean(axis=1)[0].detach().numpy()
mean_embedding.shape

In [None]:
# Imported here so the cell does not depend on another cell having imported
# `time` first.
import time

import torch

embeddings = []   # one mean-pooled vector per toot

tt = time.time()
# Inference only: no_grad() avoids building an autograd graph for every toot.
with torch.no_grad():
    # Count from 1 inside this cell instead of relying on an `i` left over
    # from an earlier cell.
    for i, t in enumerate(text, start=1):
        # progress indicator: one dot every 10 toots
        if not (i % 10):
            print(".", end="")
        # Truncate over-long toots so they cannot exceed the model's input
        # limit (assumes the usual 512-token limit of RoBERTa-base models —
        # TODO confirm for MODEL).
        tokens = tokenizer(t, return_tensors='pt', truncation=True, max_length=512)
        output = model(**tokens)
        # Mean of the last set of hidden states over axis 1, first (only)
        # batch entry.
        mean_embedding = output.hidden_states[-1].mean(axis=1)[0].detach().numpy()
        embeddings.append(mean_embedding)


embeddings = np.array(embeddings)
print(time.time() - tt)

In [None]:
from scipy import spatial

# KD-tree over the mean embeddings, used for nearest-neighbour lookups.
tree2 = spatial.KDTree(embeddings)

# Print the 5 statuses closest to status #404 in mean-embedding space.
# Element 1 of the query result holds the neighbour indices.
neighbours = tree2.query(embeddings[404], k=5)
idx = neighbours[1]
for i in idx:
    print(f"{ids[i]}:{text[i]}")

In [None]:
# t-SNE requires the perplexity to be smaller than the number of samples, so
# the default of 30 is capped for short timelines.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30, len(embeddings) - 1),
)
projections = tsne.fit_transform(embeddings)

fig = px.scatter(
    projections, x=0, y=1,
    # here the status index is prepended to the class when hovering,
    # allowing one to check out its neighbors using the code in the
    # previous cell (or the next one, if you want to compare mean
    # "embeddings" with default "descr")
    hover_name=[f"{idx} - {class_mapping[lbl]}" for idx, lbl in enumerate(lbls)],
    color=lbls,
    # a title lets this figure be told apart from the other projections
    title="t-SNE projection of the mean token embeddings (embeddings)",
)
fig.show()

In [None]:
from scipy import spatial

# Rebuild the KD-tree on the default `descr` scores, so the neighbours of
# status #404 can be compared with those found through the mean embeddings.
tree = spatial.KDTree(descr)

# Element 1 of the query result holds the indices of the 5 nearest statuses.
query_result = tree.query(descr[404], k=5)
idx = query_result[1]
for i in idx:
    print(f"{ids[i]}:{text[i]}")