In [75]:
import openai
import numpy as np
import igraph as ig
import pandas as pd
import xnetwork as xnet
from datetime import date
from bs4 import BeautifulSoup

In [137]:
# Read the tab-separated table of mentioned profiles and list its columns.
profiles = pd.read_table('mentioned_profiles_kuni_2023-08-07 01:20:19.437081.tsv')
print(profiles.columns)

Index(['Unnamed: 0', 'id', 'username', 'acct', 'display_name', 'locked', 'bot',
       'discoverable', 'group', 'created_at', 'note', 'url', 'avatar',
       'avatar_static', 'header', 'header_static', 'followers_count',
       'following_count', 'statuses_count', 'last_status_at', 'noindex',
       'emojis', 'roles', 'fields', 'uri', 'moved', 'error', 'limited',
       'suspended'],
      dtype='object')


In [138]:
# Tag every profile with the OpenAlex concepts (levels 0 and 1) whose
# normalized name occurs in the text of the profile's "note" field.
notes = profiles[['username', 'note']].fillna('')  # missing note -> empty string
concepts = pd.read_csv('OpenAlex concepts in use (17 August 2022) - concepts.tsv', sep='\t')
concepts_lvl = concepts[concepts['level'] < 2]

concepts_lvl = concepts_lvl['normalized_name'].dropna().values
mentions_concepts = []
for note in notes['note']:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted text could
    # differ between machines.
    description = BeautifulSoup(note, 'html.parser').text.lower()
    # NOTE(review): this is a plain substring test, so a short concept name
    # also matches inside longer words (e.g. "art" inside "department").
    scholar_interests = [concept for concept in concepts_lvl if concept in description]
    mentions_concepts.append(scholar_interests)

# Number of profiles with at least one matched concept.
count = sum(1 for interests in mentions_concepts if interests)
print(count, len(mentions_concepts))

1112 2959


In [139]:
# Map each concept name (levels 0 and 1) to the list of names it is replaced by.
concepts_lvl = concepts[concepts['level'] < 2]

concepts_map = {}
for _, concept_row in concepts_lvl.iterrows():
    concept_name = concept_row['normalized_name']
    if concept_row['level'] != 1:
        # Any other level (level 0 after the filter above) maps to itself.
        concepts_map[concept_name] = [concept_name]
        continue
    # Level-1 concepts are replaced by their lower-cased parent display names.
    concepts_map[concept_name] = concept_row['parent_display_names'].lower().split(', ')

In [140]:
# Collapse each profile's matched concepts into a single label.
new_name_concepts = []
for items in mentions_concepts:
    # Expand every matched concept through concepts_map; duplicates are kept
    # so that they can be counted below.
    new_name = []
    for item in items:
        new_name += concepts_map[item]
    if new_name:
        # Pick the most frequent name. A bare max(new_name) would return the
        # alphabetically last string and ignore how often each name occurs.
        # dict.fromkeys preserves first-seen order, so ties are resolved
        # deterministically in favour of the name seen first.
        new_name_concepts.append(max(dict.fromkeys(new_name), key=new_name.count))
    else:
        new_name_concepts.append('unknown')

In [141]:
# Attach the per-profile label list as a new column (one entry per row, in row order).
profiles = profiles.assign(x_concepts=new_name_concepts)

In [144]:
# Narrow the table to the identity, follower-count, URL and label columns.
kept_columns = ['username', 'display_name', 'followers_count', 'following_count', 'url', 'x_concepts']
profiles = profiles[kept_columns]

In [145]:
profiles.head()

Unnamed: 0,username,display_name,followers_count,following_count,url,x_concepts
0,mickeykats,Mikhail Kats,321.0,97.0,https://mathstodon.xyz/@mickeykats,physics
1,josh,Josh Wells,1871.0,786.0,https://fediscience.org/@josh,sociology
2,GuyBirkin,Guy Birkin,626.0,358.0,https://post.lurk.org/@GuyBirkin,art
3,Maurice_vTgln,Maurice van Tiggelen,183.0,42.0,https://fediscience.org/@Maurice_vTgln,physics
4,bnlawrence,Bryan Lawrence,577.0,472.0,https://mastodon.nz/@bnlawrence,unknown


In [146]:
# Drop rows without a username. `subset` is given as a list so the call does
# not rely on dropna accepting a single label, and the result is copied so
# that later column assignments operate on an independent frame instead of a
# slice of the column selection above.
profiles = profiles.dropna(subset=['username']).copy()

In [147]:
def get_instance(row):
    """Return the host part of the profile URL stored in ``row['url']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'url'`` (a DataFrame row or a dict).

    Returns
    -------
    str
        The network location of the URL, e.g. ``'fediscience.org'`` for
        ``'https://fediscience.org/@josh'``.

    Raises
    ------
    TypeError
        If ``row['url']`` is not a string (for example a missing value).
    """
    from urllib.parse import urlsplit

    url = row['url']
    if not isinstance(url, str):
        raise TypeError("row['url'] must be a string, got {!r}".format(url))

    # Parse the URL instead of slicing off a fixed-length "https://" prefix,
    # which returned garbage for any other scheme.
    host = urlsplit(url).netloc
    if host:
        return host
    # No "scheme://" prefix: treat everything before the first "/" as the host.
    return url.split('/')[0]
    
def get_mastodon_name(row):
    """Build the ``username@instance`` handle for a profile row.

    The instance comes from ``get_instance(row)`` and the user part from
    ``row['username']``.
    """
    host = get_instance(row)
    user = row['username']
    return f"{user}@{host}"

# Derive the full handle and the instance for every profile row. The columns
# are added in this order: mastodon_name first, then instance.
profiles = profiles.assign(
    mastodon_name=lambda frame: frame.apply(get_mastodon_name, axis=1),
    instance=lambda frame: frame.apply(get_instance, axis=1),
)

In [148]:
profiles.head()

Unnamed: 0,username,display_name,followers_count,following_count,url,x_concepts,mastodon_name,instance
0,mickeykats,Mikhail Kats,321.0,97.0,https://mathstodon.xyz/@mickeykats,physics,mickeykats@mathstodon.xyz,mathstodon.xyz
1,josh,Josh Wells,1871.0,786.0,https://fediscience.org/@josh,sociology,josh@fediscience.org,fediscience.org
2,GuyBirkin,Guy Birkin,626.0,358.0,https://post.lurk.org/@GuyBirkin,art,GuyBirkin@post.lurk.org,post.lurk.org
3,Maurice_vTgln,Maurice van Tiggelen,183.0,42.0,https://fediscience.org/@Maurice_vTgln,physics,Maurice_vTgln@fediscience.org,fediscience.org
4,bnlawrence,Bryan Lawrence,577.0,472.0,https://mastodon.nz/@bnlawrence,unknown,bnlawrence@mastodon.nz,mastodon.nz


In [149]:
# Read the tab-separated mentions table.
mentions = pd.read_table('kuni_users_mentions.tsv')

In [150]:
# One row per mention: the toot's author and the mentioned account.
pairs = mentions.loc[:, ['toot_author', 'mentioned_user_instance']]
pairs.values

array([['macaskillaf@fediscience.org',
        'nicoleacrowley@sciencemastodon.com'],
       ['macaskillaf@fediscience.org', 'neuralengine@mastodon.world'],
       ['macaskillaf@fediscience.org', 'NicoleCRust@mastodon.social'],
       ...,
       ['mer__edith@mastodon.world', 'chadloder@kolektiva.social'],
       ['mer__edith@mastodon.world', 'matthew_d_green@ioc.exchange'],
       ['mer__edith@mastodon.world', 'matthew_d_green@ioc.exchange']],
      dtype=object)

In [151]:
# Sorted distinct values found in either column of `pairs`.
# NOTE(review): `unique` is not referenced again in the visible cells.
unique = np.unique(pairs.values)

In [208]:
# Load the table pairing Mastodon handles with OpenAlex accounts, keeping only
# the handle, the OpenAlex account and the raw concept column; a missing
# concept entry becomes the sentinel 'unknown'.
openalex_columns = ['mastodon_name', 'OpenAlex_account', 'x_concepts']
fields_of_study = pd.read_csv('mastodon_users_wOpenAlex2.csv')[openalex_columns]
fields_of_study = fields_of_study.fillna({'x_concepts': 'unknown'})


def edit_mastodon_name(row):
    """Return ``row['mastodon_name']`` with one leading "@" removed, if present."""
    handle = row['mastodon_name']
    return handle[1:] if handle.startswith("@") else handle


# Normalise every handle by stripping a leading "@".
fields_of_study = fields_of_study.assign(
    mastodon_name=fields_of_study.apply(edit_mastodon_name, axis=1)
)

In [209]:
fields_of_study.head()

Unnamed: 0,mastodon_name,OpenAlex_account,x_concepts
0,HelmutBuergmann@mstdn.science,https://openalex.org/A2076598371,"[{'id': 'https://openalex.org/C86803240', 'wik..."
1,boshek@fosstodon.org,https://openalex.org/A2114692991,"[{'id': 'https://openalex.org/C86803240', 'wik..."
2,DenisDuboule@mas.to,https://openalex.org/A295410851,"[{'id': 'https://openalex.org/C54355233', 'wik..."
3,paulgkeil@mastodon.world,https://openalex.org/A2061685470,"[{'id': 'https://openalex.org/C15744967', 'wik..."
4,rider_jon@zirk.us,https://openalex.org/A2646935175,"[{'id': 'https://openalex.org/C142362112', 'wi..."


In [210]:
from ast import literal_eval

def get_concept(row):
    """Return the display name of the first level-0 concept listed for ``row``.

    ``row['x_concepts']`` is either the sentinel ``'unknown'`` or the string
    representation of a list of concept dicts (each read for its ``'level'``
    and ``'display_name'`` keys), which is parsed with ``literal_eval``.

    Returns
    -------
    str
        The ``'display_name'`` of the first concept whose ``'level'`` is 0, or
        ``'unknown'`` for the sentinel, for an empty list, and for a list that
        contains no level-0 concept.
    """
    raw_concepts = row['x_concepts']
    if raw_concepts == 'unknown':
        return 'unknown'
    for concept in literal_eval(raw_concepts):
        if concept['level'] == 0:
            return concept['display_name']
    # Empty list or no level-0 entry: return the sentinel explicitly rather
    # than falling off the end of the function and yielding None.
    return 'unknown'

In [211]:
# Replace the raw concept list in each row with the value returned by get_concept.
# NOTE(review): this overwrites its own input column, so running the cell a
# second time would pass already-converted values back into get_concept.
fields_of_study['x_concepts'] = fields_of_study.apply(get_concept, axis=1)

In [212]:
# The instance is the part of the handle after the first "@".
fields_of_study['instance'] = [
    handle.split("@")[1] for handle in fields_of_study['mastodon_name']
]
fields_of_study.head()

Unnamed: 0,mastodon_name,OpenAlex_account,x_concepts,instance
0,HelmutBuergmann@mstdn.science,https://openalex.org/A2076598371,Biology,mstdn.science
1,boshek@fosstodon.org,https://openalex.org/A2114692991,Biology,fosstodon.org
2,DenisDuboule@mas.to,https://openalex.org/A295410851,Biology,mas.to
3,paulgkeil@mastodon.world,https://openalex.org/A2061685470,Psychology,mastodon.world
4,rider_jon@zirk.us,https://openalex.org/A2646935175,Art,zirk.us


In [213]:
# Stack both tables; columns present in only one of them are filled with NaN
# for the other's rows. fields_of_study rows come first, and the original row
# indices of both frames are kept (no ignore_index).
user_profiles = pd.concat([fields_of_study, profiles])

In [214]:
user_profiles.head(30)

Unnamed: 0,mastodon_name,OpenAlex_account,x_concepts,instance,username,display_name,followers_count,following_count,url
0,HelmutBuergmann@mstdn.science,https://openalex.org/A2076598371,Biology,mstdn.science,,,,,
1,boshek@fosstodon.org,https://openalex.org/A2114692991,Biology,fosstodon.org,,,,,
2,DenisDuboule@mas.to,https://openalex.org/A295410851,Biology,mas.to,,,,,
3,paulgkeil@mastodon.world,https://openalex.org/A2061685470,Psychology,mastodon.world,,,,,
4,rider_jon@zirk.us,https://openalex.org/A2646935175,Art,zirk.us,,,,,
5,magnuspalmblad@fediscience.org,https://openalex.org/A2058370087,Biology,fediscience.org,,,,,
6,belovedblackram@ecoevo.social,https://openalex.org/A2656009222,Biology,ecoevo.social,,,,,
7,AxelVisel@genomic.social,https://openalex.org/A217167259,Biology,genomic.social,,,,,
8,ohara@hcommons.social,https://openalex.org/A2284742525,Philosophy,hcommons.social,,,,,
9,tylermorgan@mathstodon.xyz,https://openalex.org/A2293578381,Psychology,mathstodon.xyz,,,,,


In [215]:
# Report the row count before and after keeping the first row per handle.
print(user_profiles.shape[0])
user_profiles = user_profiles.drop_duplicates(subset='mastodon_name', keep='first')
print(user_profiles.shape[0])

7605
5958


In [216]:
# Build a directed graph with one vertex per row of user_profiles, named by handle.
g_mentions = ig.Graph(directed=True)
g_mentions.add_vertices(len(user_profiles))
g_mentions.vs['name'] = user_profiles['mastodon_name'].values

# Keep only the mentions whose author and target are both vertices.
# The names are materialised once into a set: reading g_mentions.vs['name']
# builds a fresh list on every access, and scanning it twice per mention made
# this loop quadratic.
known_names = set(g_mentions.vs['name'])
# Plain (author, mentioned) tuples rather than pandas rows, so the list can be
# passed on as an edge list as-is. Repeated mentions are kept.
valid_pairs = [
    (author, mentioned)
    for author, mentioned in pairs.itertuples(index=False, name=None)
    if author in known_names and mentioned in known_names
]

In [217]:
# Add one directed edge per entry of valid_pairs (author -> mentioned account).
# The endpoints are handles, so this presumably relies on igraph resolving
# string vertex ids through the 'name' attribute — TODO confirm.
g_mentions.add_edges(valid_pairs)

In [218]:
# Store each account's instance on its vertex (same row order as the vertex names).
g_mentions.vs['instances'] = user_profiles['instance'].to_numpy()

In [219]:
# Store each account's field label on its vertex. This must be the per-user
# column of user_profiles (same row order as the vertex names), not
# `concepts`, which is the OpenAlex concepts table.
# NOTE(review): labels from the two sources differ in casing in the displayed
# samples (e.g. 'Biology' vs 'physics'); normalise if they should be grouped.
g_mentions.vs['concepts'] = user_profiles['x_concepts'].values

In [220]:
# NOTE(review): the output saved with this cell (4621) differs from the value
# printed where `count` is computed (1112), and no visible cell reassigns it
# in between — presumably stale kernel state; confirm with Restart & Run All.
count

4621

In [221]:
# Hand the graph to xnetwork's igraph2xnet with the target path 'g_mentions.xnet'
# (presumably writes the graph, including vertex attributes, to that file — confirm).
xnet.igraph2xnet(g_mentions, 'g_mentions.xnet')