# Comprehensive functions used for collecting the dataset

### The code below collects information about Tumblr users and their posts

First, import all necessary libraries, sign in, and connect to the API.

In [None]:
# imports
!pip install pytumblr

from collector import new_oauth
from future import standard_library
standard_library.install_aliases()
from builtins import input
import pytumblr
import yaml
import os
from requests_oauthlib import OAuth1Session
import csv


Set up the Tumblr API client.

In [None]:
# Locate the stored OAuth credentials: ~/.tumblr first, ~/.pytumblr as the alternate.
yaml_path = os.path.expanduser('~') + '/.tumblr'
alternate_path = os.path.expanduser('~') + '/.pytumblr'

# Use the alternate path when ~/.tumblr is a directory or ~/.pytumblr already exists.
if os.path.isdir(yaml_path) or os.path.exists(alternate_path):
    yaml_path = alternate_path

if not os.path.exists(yaml_path):
    # No credentials file at the chosen path: get tokens from new_oauth
    # (presumably runs the OAuth flow and saves the result there -- confirm in collector).
    tokens = new_oauth(yaml_path)
else:
    # The context manager closes the file even when parsing raises.
    with open(yaml_path, "r") as yaml_file:
        tokens = yaml.safe_load(yaml_file)

# Shared API client used by every function below.
client = pytumblr.TumblrRestClient(
    tokens['consumer_key'],
    tokens['consumer_secret'],
    tokens['oauth_token'],
    tokens['oauth_token_secret']
)

Define the helper functions.

In [None]:
# takes a single user name
# output fields to be recorded
def write_each_row(user_name, writer):
    """Look up one blog with client.blog_info and write its info as a CSV row.

    Args:
        user_name: name of the blog to look up.
        writer: csv.DictWriter whose fieldnames include the keys written below.

    Returns:
        None. Nothing is written when the lookup reports errors or carries
        no 'blog' entry.
    """
    response = client.blog_info(user_name)
    # Check the response itself before indexing into it: a failed lookup is
    # expected to carry 'errors' and no 'blog' entry, so indexing first would
    # raise KeyError instead of skipping the user.
    if 'errors' in response or 'blog' not in response:
        return
    current_user = response['blog']
    if 'errors' in current_user:
        return
    writer.writerow({'user_name': user_name, # username
                     'name': current_user['title'], # user's customized name
                     'avatar': current_user['avatar'][1]['url'], # avatar url for downloading (second entry of the avatar list)
                     'total_posts': current_user['total_posts'], # number of posts
                     'description': current_user['description'], # user self description
                     'allow_question': current_user['ask'], # blog's 'ask' setting
                     'allow_question_anon': current_user['ask_anon'] # blog's 'ask_anon' setting
                     })

# parsing names
# choose one post
def parsing_names(blog, post_id, accessed):
    """Return `blog` followed by the distinct blog names found in a post's notes.

    Args:
        blog: name of the blog that owns the post.
        post_id: id of the post whose notes are fetched with client.notes.
        accessed: container of names that were already handled; these are
            left out of the result.

    Returns:
        A list that starts with `blog`, followed by each new note author in
        the order the notes were returned. Just `[blog]` when the response
        has no 'notes' entry (e.g. a failed lookup).
    """
    response = client.notes(blog, post_id)
    name_list = [blog]
    # A set mirrors name_list so the duplicate check does not rescan the list.
    collected = {blog}
    for each in response.get('notes', []):
        note_author = each['blog_name']
        if note_author in collected or note_author in accessed:
            continue
        collected.add(note_author)
        name_list.append(note_author)
    return name_list

# add user info to the given file
def add_to_file(csv_file, name_list, max_rounds=1000):
    """Crawl outward from `name_list` and append newly found users to a CSV.

    Each round writes the not-yet-recorded names of the current batch, then
    takes the last name of the batch, picks that blog's second returned post
    and uses the blogs in its notes as the next batch.

    Args:
        csv_file: path of the CSV file to append to (no header is written).
        name_list: seed blog names. They are treated as already recorded and
            are never written here (presumably the caller wrote them
            beforehand -- confirm). The list is not modified.
        max_rounds: upper bound on the number of crawl rounds.

    Returns:
        None. The crawl stops when a batch holds 2 names or fewer, when the
        round limit is reached, or when the chosen blog does not return at
        least two posts.
    """
    fieldnames = ['user_name', 'name', 'avatar', 'total_posts', 'description', 'allow_question', 'allow_question_anon']
    # Snapshot the seeds instead of aliasing the caller's list, so the caller's
    # data is never mutated and membership tests are O(1).
    already_recorded = set(name_list)
    accessed = set()
    added_name = list(name_list)
    with open(csv_file, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        n = 0
        while n < max_rounds and len(added_name) > 2:
            for user in added_name:
                if user not in accessed and user not in already_recorded:
                    write_each_row(user, writer)
                    accessed.add(user)
            # Continue the crawl from the last blog of the current batch.
            blog = added_name[-1]
            blog_posts = client.posts(blog).get('posts', [])
            if len(blog_posts) < 2:
                # No second post to read notes from (or the lookup failed).
                break
            post_id = blog_posts[1]['id']
            added_name = parsing_names(blog, post_id, accessed)
            n += 1


Fill in the output file name and the user names.

In [None]:
# Create the user-info CSV (mode 'w' truncates an existing file) and write the header row.
# csv_file: output path; the empty string is a placeholder to fill in.
# names: blog names whose info is written right after the header.
csv_file = ""
names = []
with open(csv_file, 'w', newline='') as csvfile:
    # Same columns as the keys written by write_each_row.
    fieldnames = ['user_name', 'name', 'avatar', 'total_posts', 'description', 'allow_question', 'allow_question_anon']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for user in names:
        write_each_row(user, writer)

In [None]:
# Enter the initial users to collect information from. The crawl loop in
# add_to_file only runs while the current list holds more than 2 names.
add_to_file(csv_file, []) # replace with customized input

# The code below generates the post-information files

In [None]:
# Writes post info rows for one blog.
# Takes a blog name and records every post returned by a single client.posts call.
def write_each_row_posts(client_name, writer):
    """Write one CSV row per post returned by client.posts(client_name).

    Args:
        client_name: name of the blog whose posts are fetched.
        writer: csv.DictWriter whose fieldnames include the keys written below.

    Returns:
        None. Nothing is written when the posts response contains 'errors'.

    Note:
        client.posts is called once with no extra arguments, so only the
        posts contained in that one response are recorded.
    """
    response = client.posts(client_name)
    if 'errors' in response:
        return

    for post in response['posts']:
        # One extra request per post: notes fetched in 'likes' mode, whose
        # 'total_notes' value is recorded as the like count.
        likes_response = client.notes(client_name, post['id'], mode='likes')
        like_total = likes_response['total_notes']

        row = {
            'blog_name': client_name,  # user who posts
            'id': post['id_string'],  # unique id for reference
            'date': post['date'],  # date posted
            'type': post['type'],  # blog type
            'is_blocks_post_format': post['is_blocks_post_format'],
            'tags': post['tags'],  # tags of this post
            'note_count': post['note_count'],  # number of notes
            'total_likes': like_total,  # number of likes
            # reblogs are taken as the notes that are not likes
            'total_reblogs': post['note_count'] - like_total,
            'summary': post['summary'],  # summary of the post
            'slug': post['slug'],
            'interactability_reblog': post['interactability_reblog'],
            'interactability_blaze': post['interactability_blaze'],
        }
        writer.writerow(row)


In [None]:
# Blog names whose posts should be collected; fill this in before writing the posts CSV.
names_for_posts = []

In [None]:
# create csv file for demo purpose
csv_file = 'post_output.csv'
with open(csv_file, 'w', newline='') as csvfile:
    fieldnames = ['blog_name', 'id', 'date', 'type', 'is_blocks_post_format', 'tags', 'note_count', 'total_likes', 'total_reblogs', 'summary',
            'slug', 'interactability_reblog', 'interactability_blaze']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Iterate over a snapshot: removing items from the list that is being
    # iterated makes the loop skip every other blog.
    for user in list(names_for_posts):
        write_each_row_posts(user, writer)
        # Drop finished blogs so names_for_posts only holds what is still pending.
        names_for_posts.remove(user)

In [None]:
# optionally get original authors for a list of blogs.
# input posts as a list of [blog, id]
def get_original_author(posts):
    """Look up the original author of each given post.

    Args:
        posts: list of [blog, id] entries, one per post to look up.

    Returns:
        A list aligned with `posts`. Each item is the blog name of the first
        entry in the post's 'trail', or '' when the lookup fails, no post is
        returned, or the trail is empty.
    """
    oriauthor = [] #output list
    for i, entry in enumerate(posts):
        # Read the i-th entry (the original read posts[0]/posts[1], i.e. the
        # first two entries of the whole list, on every iteration).
        blog = entry[0]
        post_id = entry[1]
        # Default keeps the output aligned with the input on every path.
        author = ''
        # Pass the id by keyword: pytumblr's second positional parameter of
        # posts() is the post type, not the id.
        single_post = client.posts(blog, id=post_id)
        if 'errors' in single_post:
            # A 404 is recorded silently as ''; other failures are reported.
            if single_post.get('meta', {}).get('status') != 404:
                print('error at index: ',i)
        else:
            found = single_post.get('posts') or []
            trail = found[0].get('trail') or [] if found else []
            if trail:
                author = trail[0]['blog']['name']
            print(author)

        oriauthor.append(author)
    return oriauthor