In [1]:
# https://github.com/boilingpenguin/tumblr-scraper
# https://github.com/tumblr/docs/blob/master/api.md

In [2]:
import pytumblr
import calendar
import time
import pandas as pd
from tqdm import tqdm

In [3]:
# Read the Tumblr API consumer key from a local file kept outside the notebook.
# readline() keeps the trailing newline, which would otherwise be sent as part
# of the key, so surrounding whitespace is stripped here.
with open('../api_key.keys') as f:
	api_key = f.readline().strip()

# Fail early with a clear message instead of a confusing API error later on.
if not api_key:
	raise ValueError("No API key found on the first line of '../api_key.keys'")

In [4]:
# Load the tone-tag vocabulary: one "name: alias,alias,..." entry per line.
# The result maps each tone-tag name to the list of aliases to search for.
file_path = "../tonetags.txt"

tone_tags = {}

with open(file_path, 'r', encoding='utf-8') as file:
	for line in file:
		line = line.strip()
		if not line:
			# Tolerate blank lines (e.g. a trailing newline at the end of the file).
			continue
		# Split on the first colon only, so a colon inside the alias list does not
		# break the unpacking.
		key, separator, value = line.partition(':')
		if not separator:
			raise ValueError(f"Malformed tone tag line (expected 'name: alias,alias'): {line!r}")
		aliases = [item.strip() for item in value.split(',')]
		# Drop empty aliases and any alias containing '/'.
		tone_tags[key.strip()] = [item for item in aliases if item and '/' not in item]

for key, values in tone_tags.items():
	print(f"{key}: {values}")

affectionate: ['affectionate']
copypasta: ['copypasta', 'copypaste', 'repeated text']
clickbait: ['clickbait']
fake: ['fake']
genuine: ['genuine']
genuine question: ['genuine question']
half joking: ['half joking', 'half-joking', 'half joke', 'half-joke', 'halfjoke']
hyperbole: ['hyperbole']
inside joke: ['inside joke', 'insidejoke', 'insidejoking', 'inside-joke', 'inside-joking']
joking: ['joking', 'joke']
lyrics: ['lyrics']
light-hearted: ['light-hearted', 'light hearted', 'lighthearted']
literal: ['literal', 'literally']
little upset: ['little upset', 'littleupset', 'little-upset']
metaphorical: ['metaphorical']
not a vent: ['not a vent', 'notavent', 'not-a-vent']
nobody here: ['nobodyhere', 'nobody-here']
negative connotation: ['negative connotation', 'negetiveconnotation', 'negative-connotation']
neutral connotation: ['neutral connotation', 'neutralconnotation', 'neutral-connotation']
not forced: ['not forced', 'not-forced', 'notforced']
not mad: ['not mad', 'not-mad', 'notmad']
n

In [5]:
# Scrape posts for every tone-tag alias through pytumblr's `tagged` call,
# paging backwards in time, and write a CSV snapshot of everything collected
# so far after each alias (and before every rate-limit pause).
tumblr = pytumblr.TumblrRestClient(api_key)
tumblr_filter = 'text'
start_time_search = calendar.timegm(time.gmtime())

output_path = '../datasets/data_from_tumblr/'
file_path = output_path + 'TumblrSearch'
result_columns = ['timestamp', 'URL', 'blogName', 'title', 'tags', 'text']
tumblr_df = pd.DataFrame(columns=result_columns)
page_size = 20  # a page shorter than this is treated as the last page of a tag
rate_limit_status = 429  # NOTE(review): assumed HTTP status for "limit exceeded" - confirm against the Tumblr API docs
rate_limit_sleep_seconds = 3600
num_of_query = 1
api_limit_reached = False
all_results = []  # every row collected so far, across all tags


def _post_to_row(post):
	"""Convert one post dict into a row tuple ordered like `result_columns`.

	`blog_name`, `date` and `post_url` are required keys; `title`, `tags` and
	`body` fall back to a placeholder string when the post does not have them.
	"""
	return (
		post['date'],
		post['post_url'],
		post['blog_name'],
		post.get('title', "Couldn't Get Title"),
		post.get('tags', "Couldn't Get tags"),
		post.get('body', "Couldn't Get Post Body"),
	)


def _save_snapshot(rows):
	"""Write all `rows` to a new timestamped CSV and return them as a DataFrame."""
	snapshot_df = pd.DataFrame(rows, columns=result_columns)
	# to_csv() returns None when given a path, so its result must not be
	# assigned back to the accumulated frame.
	snapshot_df.to_csv(file_path + '-' + str(time.time()) + '.csv')
	return snapshot_df


for aliases in tone_tags.values():
	for tone_tag in aliases:
		# Every alias is searched from "now" backwards; carrying the cursor over
		# from the previous alias would skip all of its newer posts.
		oldest_time = start_time_search
		results = []
		print(f"Start of parsing {tone_tag}\n")
		while True:
			searchResults = tumblr.tagged(tone_tag, filter=tumblr_filter, before=oldest_time)

			if num_of_query % 10 == 0:
				print(f"Number of query: {num_of_query}")
			num_of_query += 1

			# A dict (with a 'meta' key) instead of a list of posts is taken to be
			# an error reply - presumably how pytumblr reports failures; verify.
			if isinstance(searchResults, dict):
				meta = searchResults.get('meta') or {}
				if results:
					# Persist what has been collected before pausing or aborting.
					all_results.extend(results)
					results = []
					tumblr_df = _save_snapshot(all_results)
				if meta.get('status') != rate_limit_status:
					# Waiting will not fix e.g. a rejected API key, so stop here.
					raise RuntimeError(f"Tumblr API error while parsing {tone_tag!r}: {meta}")
				api_limit_reached = True
				print("API calls limit reached")
				print(f"Current tag: {tone_tag}")
				print(f"Sleep for {rate_limit_sleep_seconds} seconds to try again")
				for _ in tqdm(range(rate_limit_sleep_seconds)):
					time.sleep(1)
				api_limit_reached = False
				# Retry the same page rather than treating the error as end of tag.
				continue

			for post in searchResults:
				results.append(_post_to_row(post))
				# Posts are assumed to arrive newest first, so the last one seen is
				# the paging cursor for the next request.
				oldest_time = post['timestamp']

			if len(searchResults) < page_size:
				print(f"End of tag {tone_tag}\n")
				break

		all_results.extend(results)
		tumblr_df = _save_snapshot(all_results)

Start of parsing affectionate
API calls limit reached
Current tag: affectionate
Sleep for 3600 seconds to try again


  0%|          | 12/3601 [00:13<1:04:52,  1.08s/it]


KeyboardInterrupt: 