In [2]:
import time
import glob
import json
import os.path
import requests
import pandas as pd
from  datetime import date

In [3]:
today = date.today()

In [4]:
files = glob.glob('../data/users_followers_*_2023-05-02.csv')
all_users = []
for file in files:
    users = pd.read_csv(file, sep='\t', names=['user', 'instance', 'followers', 'field'], encoding_errors='replace', on_bad_lines='warn')
    all_users.append(users)

all_users = pd.concat(all_users)
print(all_users.head())

Skipping line 50: expected 4 fields, saw 5



               user            instance   
0           rheinze        assemblag.es  \
1          BrianJAP      mastodon.world   
2       joelglasman     fediscience.org   
3  brenton_peterson     sciences.social   
4            keremd  social.anoxinon.de   

                                           followers            field  
0  [109417147220906349, 109243458234695495, 10924...  African Studies  
1  [1302832, 110023631275744313, 1099309840163259...  African Studies  
2                                                 []  African Studies  
3  [107192220324249637, 109354941450990518, 10951...  African Studies  
4  [1302832, 109212672053987101, 1092459114371518...  African Studies  


In [5]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)


def get_userid(user_name, user_server, token_access):
    try:
        url = 'https://{}/api/v1/accounts/lookup/'.format(user_server)
        headers = {
            'Authorization' : 'Bearer {}'.format(token_access)
        }
        params = {
            'acct' : '{}@{}'.format(user_name, user_server)
        }
        r = session.get(url, headers=headers, params=params)

        user = json.loads(r.text)
        if 'id' in user:
            return user["id"]
        elif 'user' in user:
            return user["user"]
    except Exception as error:
        print(error)
        return None

In [8]:
# mastodon_app = "acmbrito_app_python_secret"
# client_id, client_secret, access_token, api_base_url = open(mastodon_app).read().split()

In [6]:
date_limit = date(2022, 9, 1)

In [7]:
import util
def get_instance_token(instance):
    return util.get_instance_token(instance)

In [13]:
# toots

# source to collect user toots: https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/
# API code: https://mastodonpy.readthedocs.io/en/1.8.1/_modules/mastodon/timeline.html?highlight=users%20in%20a%20given%20list
# https://docs.joinmastodon.org/methods/search/#v1

def user_toots(instance, user_id, access_token):
    URL = 'https://{}/api/v1/accounts/{}/statuses'.format(instance, user_id)
    user_toots = []
    if access_token:
        headers = {
            'Authorization' : 'Bearer {}'.format(access_token)
        }
        
        user_toots = []
        params = {}
        while True:
            #time.sleep(1)
            r = requests.get(URL, headers=headers, params=params, timeout=60)
            toots = json.loads(r.text)
            user_toots += toots
            if len(toots) == 0:
                break
            if 'error' in toots:
                print(user_id, toots)
                break
            max_id = toots[-1]['id']
            params = {'max_id' : max_id}
            
            date_str = toots[-1]['created_at'].split('T')[0]
            last_date = date.fromisoformat(date_str)
            if last_date < date_limit:
                break

    return user_toots

In [8]:
valid_fields = ['id',
'created_at',
'in_reply_to_account_id',
'uri',
'replies_count',
'reblogs_count',
'favourites_count',
'content'
]

In [9]:
files = glob.glob('temp/users_toots_2023-0*.tsv')
users_already_collected = []
invalid_users = set()
for file in files:
    invalid_users |= set(pd.read_csv(file, sep='\t')['user_id'])

print(len(invalid_users))

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
outputs = []
size = 0
user_id_map = pd.read_csv('user_id_list.tsv', sep='\t')
user_id_map = user_id_map.drop_duplicates(subset=['user', 'instance'])
user_id_map = user_id_map.dropna()

for _, user in all_users.iterrows():
    
    user_id = user_id_map[(user_id_map['user'] == user['user']) & (user_id_map['instance'] == user['instance'])]
    if len(user_id) >= 1:
        user_id = user_id['user_id'].values[0]
    
    else:
        continue

    if not user_id:
        print(user)
        continue

    if not user_id.isnumeric():
        continue

    if int(user_id) in invalid_users:
        continue
    
    instance_token = get_instance_token(user['instance'])
    if not instance_token:
        continue

    toots = user_toots(user['instance'], user_id, instance_token)
    if len(toots) > 0:
        toots_pd = pd.json_normalize(toots)
        toots_pd = toots_pd.loc[:, valid_fields]
        toots_pd['user_id'] = user_id
        size += len(toots_pd)
        outputs.append(toots_pd)

    if size > 10000:
        data_toots = pd.concat(outputs)
        data_toots.to_csv('temp/users_toots_{}_{}.tsv'.format(today, user_id), sep='\t')
        outputs = []
        size = 0

data_toots = pd.concat(outputs)
data_toots.to_csv('temp/users_toots_{}_{}.tsv'.format(today, user_id), sep='\t')

In [None]:
#TODO to select instances related to scholars/science
#questions: search for scholars on mastodon.social? (the "main" instance)

# o tamanho de um subconjunto para selecionar?????
# fediscience (onde todos os usuarios sao academicos) como primeiro subset
# como expandir isso para o mais instancias?
# 

In [None]:
ids = []
i = 0
'''
for _, user in all_users.iterrows():
    instance_token = get_instance_token(user['instance'])
    user_id = get_userid(user['user'], user['instance'], instance_token)
    ids.append((user['user'], user['instance'], user_id))
    i += 1
    if i % 300 == 0:
        print(i, end='\r')
        pd.DataFrame(ids, columns=['user', 'instance', 'user_id']).to_csv('user_id_list.tsv', sep='\t')
'''


In [10]:
filename = '../data/mastodon_users_wOpenAlex.csv'
openalex_masto = pd.read_csv(filename, sep=',')
print(openalex_masto.columns)
openalex_masto = openalex_masto.drop(columns='Unnamed: 0').drop_duplicates()
print(openalex_masto.size)
openalex_masto.head()

Index(['Unnamed: 0', 'mastodon_name'], dtype='object')
4677


Unnamed: 0,mastodon_name
0,@HelmutBuergmann@mstdn.science
1,@boshek@fosstodon.org
2,@DenisDuboule@mas.to
3,@paulgkeil@mastodon.world
4,@rider_jon@zirk.us


In [11]:
ids = []
i = 0

for _, user in openalex_masto.iterrows():
    mastodon_name = user['mastodon_name'].split("@")[1:]
    user_name = mastodon_name[0]
    instance_name = mastodon_name[1]
    instance_token = get_instance_token(instance_name)
    user_id = get_userid(user_name, instance_name, instance_token)
    ids.append((user_name, instance_name, user_id))
    i += 1
    if i % 100 == 0:
        print(i, end='\r')
        pd.DataFrame(ids, columns=['user', 'instance', 'user_id']).to_csv('user111_id_list_kunilist_{}.tsv'.format(today), sep='\t')


HTTPSConnectionPool(host='fediscience.com', port=443): Max retries exceeded with url: /api/v1/accounts/lookup/?acct=crisvanhout%40fediscience.com (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000263F3840150>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
HTTPSConnectionPool(host='mastodon.soc', port=443): Max retries exceeded with url: /api/v1/accounts/lookup/?acct=siruiwan%40mastodon.soc (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000263F37C6DD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
HTTPSConnectionPool(host='mstdn.scien', port=443): Max retries exceeded with url: /api/v1/accounts/lookup/?acct=KeyesDiscovery%40mstdn.scien (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000263F384F150>: Failed to establish a new connection: [Errno 11001

In [14]:
outputs = []
size = 0
user_id_map = pd.read_csv('user111_id_list_kunilist_2023-06-30.tsv', sep='\t', dtype=object)
user_id_map = user_id_map.drop_duplicates(subset=['user', 'instance'])
user_id_map = user_id_map.dropna()
#user_id_map = user_id_map.astype({'user_id':'int'})
print(user_id_map.head())

for _, user in user_id_map.iterrows():
    '''print(user)
    item = user['mastodon_name'].split('@')
    
    user = {
        'user': item[1],
        'instance': item[2]
    }'''

    user_id = user_id_map[(user_id_map['user'] == user['user']) & (user_id_map['instance'] == user['instance'])]
    if len(user_id) >= 1:
        user_id = user_id['user_id'].values[0]
    else:
        continue

    if not user_id:
        print(user)
        continue

    if not user_id.isnumeric():
        continue

    # if int(user_id) in invalid_users:
    #    continue
    
    instance_token = get_instance_token(user['instance'])
    if not instance_token:
        continue

    toots = user_toots(user['instance'], user_id, instance_token)
    if len(toots) > 0:
        toots_pd = pd.json_normalize(toots)
        toots_pd = toots_pd.loc[:, valid_fields]
        toots_pd['mastodon_name'] = "{}@{}".format(user['user'], user['instance']) 
        size += len(toots_pd)
        outputs.append(toots_pd)

    if size > 10000:
        data_toots = pd.concat(outputs)
        data_toots.to_csv('temp/users_toots_{}_{}.tsv'.format(today, user_id), sep='\t')
        outputs = []
        size = 0

data_toots = pd.concat(outputs)
data_toots.to_csv('temp/users_toots_openalex_{}_{}.tsv'.format(today, user_id), sep='\t')

  Unnamed: 0             user        instance             user_id
0          0  HelmutBuergmann   mstdn.science  109290116551935157
1          1           boshek   fosstodon.org  109314764090379220
2          2     DenisDuboule          mas.to  109258593858933399
3          3        paulgkeil  mastodon.world  109500238977329432
4          4        rider_jon         zirk.us  109365280591812662
108231873383230758 {'error': 'Too many requests'}
109252294337269013 {'error': 'Too many requests'}
108194402925670785 {'error': 'Too many requests'}
312200 {'error': 'Too many requests'}
109219602453681496 {'error': 'Too many requests'}
109394770510287303 {'error': 'Too many requests'}
109244312739228432 {'error': 'Too many requests'}
109242396345555512 {'error': 'Too many requests'}
108147588127310893 {'error': 'Too many requests'}
109248233331530003 {'error': 'Too many requests'}
109246123143382012 {'error': 'Too many requests'}
109253527096086364 {'error': 'Too many requests'}
1093724552656990

In [18]:
data_toots.to_csv('temp/users_toots_openalex_{}_{}.tsv'.format(today, 'last'), sep='\t')

In [17]:
user_id

Unnamed: 0.1,Unnamed: 0,user,instance,user_id


In [None]:
access_token = get_instance_token('home.social')
user_id = get_userid('fasbrock', 'home.social', access_token)
user_toots('home.social', user_id, access_token)

In [None]:
print(user_id)