In [1]:
# https://github.com/boilingpenguin/tumblr-scraper
# https://github.com/tumblr/docs/blob/master/api.md

In [1]:
import pytumblr
import calendar
import time
import pandas as pd
from tqdm import tqdm
import copy

In [2]:
# Read the Tumblr API consumer key from the first line of a local file that
# is kept outside the notebook, so the key itself is never stored in a cell.
with open('../api_key.keys') as f:
	# readline() keeps the trailing newline; strip it so the key is sent
	# to the API exactly as issued.
	api_key = f.readline().strip()

In [3]:
# Load the tone-tag vocabulary.
# Each non-blank line of the file has the form
#     <tone tag name>: <spelling>, <spelling>, ...
# The result maps each name to the list of spellings to search for.
# Spellings containing '/' are dropped.
file_path = "../tonetags.txt"

tone_tags = {}

with open(file_path, 'r', encoding='utf-8') as file:
	for line in file:
		line = line.strip()
		if not line:
			# Tolerate blank lines (e.g. a trailing newline at end of file).
			continue
		# Split on the first colon only, so a colon inside the spelling list
		# does not break the unpacking.
		key, value = line.split(':', 1)
		# Strip every spelling individually: whitespace after a comma would
		# otherwise become part of the tag that is later sent to the API.
		spellings = (item.strip() for item in value.split(','))
		tone_tags[key.strip()] = [item for item in spellings if item and '/' not in item]

for key, values in tone_tags.items():
	print(f"{key}: {values}")

affectionate: ['affectionate']
copypasta: ['copypasta', 'copypaste', 'repeated text']
clickbait: ['clickbait']
fake: ['fake']
genuine: ['genuine']
genuine question: ['genuine question']
half joking: ['half joking', 'half-joking', 'half joke', 'half-joke', 'halfjoke']
hyperbole: ['hyperbole']
inside joke: ['inside joke', 'insidejoke', 'insidejoking', 'inside-joke', 'inside-joking']
joking: ['joking', 'joke']
lyrics: ['lyrics']
light-hearted: ['light-hearted', 'light hearted', 'lighthearted']
literal: ['literal', 'literally']
little upset: ['little upset', 'littleupset', 'little-upset']
metaphorical: ['metaphorical']
not a vent: ['not a vent', 'notavent', 'not-a-vent']
nobody here: ['nobodyhere', 'nobody-here']
negative connotation: ['negative connotation', 'negetiveconnotation', 'negative-connotation']
neutral connotation: ['neutral connotation', 'neutralconnotation', 'neutral-connotation']
not forced: ['not forced', 'not-forced', 'notforced']
not mad: ['not mad', 'not-mad', 'notmad']
n

In [4]:
# Walk the tone tags in dictionary order and discard every spelling that
# comes before the first occurrence of "reference". Lists are edited in
# place; everything from "reference" onwards (and all later keys) is kept.
reached_resume_point = False

for spellings in tone_tags.values():
	if reached_resume_point:
		break
	# Iterate over a snapshot because the underlying list shrinks as we go.
	for candidate in list(spellings):
		if candidate == "reference":
			reached_resume_point = True
			break
		spellings.remove(candidate)

for name, spellings in tone_tags.items():
	print(f"{name}: {spellings}")

affectionate: []
copypasta: []
clickbait: []
fake: []
genuine: []
genuine question: []
half joking: []
hyperbole: []
inside joke: []
joking: []
lyrics: []
light-hearted: []
literal: []
little upset: []
metaphorical: []
not a vent: []
nobody here: []
negative connotation: []
neutral connotation: []
not forced: []
not mad: []
not passive aggressive: []
not subtweeting: []
non-serious: []
non-sexual intent: []
platonic: []
passive aggressive: []
positive connotation: []
quote: []
romantic: []
reference: ['reference']
rhetorical: ['rhetorical', 'rhetorical question', 'rhetorical-question', 'rhetoricalquestion']
sarcastic: ['sarcastic', 'sarcasm']
serious: ['serious']
sexual intent: ['sexual intent', ' sexualintent', 'sexual-intent']
teasing: ['teasing']
threat: ['threat']
ironic: ['ironic']
not at you: ['not at you', 'not-at-you', 'notatyou']
nothing personal: ['nothing personal', 'nothing-personal', 'nothingpersonal']


In [5]:
# Scrape Tumblr for every tone-tag spelling in `tone_tags`.
#
# For each spelling the cell pages backwards in time through the `tagged`
# endpoint, passing the timestamp of the oldest post seen so far as `before`,
# until a page comes back with fewer than PAGE_SIZE posts. When the API
# answers with an error envelope instead of a list of posts (treated here as
# "rate limit reached"), everything collected so far is saved, the cell
# sleeps for an hour and the same query is retried.
#
# Every save writes the full cumulative table to a new, timestamped CSV.
tumblr = pytumblr.TumblrRestClient(api_key)
tumblr_filter = 'text'
start_time_search = calendar.timegm(time.gmtime())

output_path = '../datasets/data_from_tumblr/'
file_path = output_path + 'TumblrSearch'
columns = ['timestamp', 'URL', 'blogName', 'title', 'tags', 'text']
tumblr_df = pd.DataFrame(columns=columns)
num_of_query = 1
api_limit_reached = False

# A page shorter than this is taken to be the last page for a tag.
# NOTE(review): 20 is assumed to be the endpoint's default page size - confirm.
PAGE_SIZE = 20

# Fixed `before` timestamp for the very first query only.
# NOTE(review): presumably the point where an earlier, interrupted run
# stopped - update or set to None when starting a fresh scrape.
resume_before = 1700085687


def _append_and_save(all_posts, rows):
	"""Append `rows` to `all_posts`, write the combined table to a new CSV, and return it."""
	combined = pd.concat([all_posts, pd.DataFrame(rows, columns=columns)])
	combined.to_csv(file_path + '-' + str(time.time()) + '.csv', index=False)
	return combined


for spellings in tone_tags.values():
	for tone_tag in spellings:
		# Every spelling is a separate Tumblr tag, so each one starts paging
		# from "now". Only the first spelling actually queried uses the
		# resume timestamp, so keys whose list is empty cannot use it up.
		if resume_before is not None:
			oldest_time = resume_before
			resume_before = None
		else:
			oldest_time = start_time_search

		results = []
		print(f"Start of parsing {tone_tag}\n")
		while True:
			searchResults = tumblr.tagged(tone_tag, filter=tumblr_filter, before=oldest_time)
			for post in searchResults:
				# On failure the client returns a dict rather than a list of
				# posts; iterating it yields its keys, the first being 'meta'.
				if post == 'meta':
					print("API calls limit reached")
					print(f"Current tag: {tone_tag}")
					print(f"Current oldest time: {oldest_time}")
					api_limit_reached = True
					break
				blog_name = post['blog_name']
				date = post['date']
				url = post['post_url']
				# Optional fields: fall back to a placeholder when absent.
				title = post.get('title', "Couldn't Get Title")
				post_tags = post.get('tags', "Couldn't Get tags")
				body = post.get('body', "Couldn't Get Post Body")
				results.append((date, url, blog_name, title, post_tags, body))
				# The next request continues from this post's timestamp.
				oldest_time = post['timestamp']

			if num_of_query % 10 == 0:
				print(f"Number of query: {num_of_query}")

			num_of_query += 1

			if api_limit_reached:
				# Persist what we have before the long wait, then retry the
				# same tag from the same `oldest_time`.
				tumblr_df = _append_and_save(tumblr_df, results)
				results = []
				print("Sleep for 3600 seconds to try again")
				for _ in tqdm(range(60)):
					time.sleep(60)
				api_limit_reached = False
			elif len(searchResults) < PAGE_SIZE:
				print(f"End of tag {tone_tag}\n")
				break

		tumblr_df = _append_and_save(tumblr_df, results)

Start of parsing reference
Number of query: 10
Number of query: 20
Number of query: 30
Number of query: 40
Number of query: 50
Number of query: 60
Number of query: 70
Number of query: 80
Number of query: 90
Number of query: 100
Number of query: 110
Number of query: 120
Number of query: 130
Number of query: 140
Number of query: 150
Number of query: 160
Number of query: 170
Number of query: 180
Number of query: 190
Number of query: 200
Number of query: 210
Number of query: 220
Number of query: 230
Number of query: 240
Number of query: 250
Number of query: 260
Number of query: 270
Number of query: 280
Number of query: 290
Number of query: 300
Number of query: 310
Number of query: 320
Number of query: 330
Number of query: 340
Number of query: 350
Number of query: 360
Number of query: 370
Number of query: 380
Number of query: 390
Number of query: 400
Number of query: 410
Number of query: 420
Number of query: 430
Number of query: 440
Number of query: 450
Number of query: 460
Number of query:

100%|██████████| 60/60 [1:00:00<00:00, 60.00s/it]


Number of query: 1010
Number of query: 1020
Number of query: 1030
Number of query: 1040
Number of query: 1050
Number of query: 1060
Number of query: 1070
Number of query: 1080
Number of query: 1090
Number of query: 1100
Number of query: 1110
Number of query: 1120
Number of query: 1130
Number of query: 1140
Number of query: 1150
Number of query: 1160
Number of query: 1170
Number of query: 1180
Number of query: 1190
Number of query: 1200
Number of query: 1210
Number of query: 1220
Number of query: 1230
Number of query: 1240
Number of query: 1250
Number of query: 1260
Number of query: 1270
Number of query: 1280
Number of query: 1290
Number of query: 1300
Number of query: 1310
Number of query: 1320
Number of query: 1330
Number of query: 1340
Number of query: 1350
Number of query: 1360
Number of query: 1370
Number of query: 1380
Number of query: 1390
Number of query: 1400
Number of query: 1410
Number of query: 1420
Number of query: 1430
Number of query: 1440
Number of query: 1450
Number of 

100%|██████████| 60/60 [1:00:00<00:00, 60.00s/it]


Number of query: 2010
Number of query: 2020
Number of query: 2030
Number of query: 2040
Number of query: 2050
Number of query: 2060
End of tag rhetorical question
Start of parsing rhetorical-question
Number of query: 2070
End of tag rhetorical-question
Start of parsing rhetoricalquestion
End of tag rhetoricalquestion
Start of parsing sarcastic
End of tag sarcastic
Start of parsing sarcasm
Number of query: 2080
Number of query: 2090
Number of query: 2100
Number of query: 2110
Number of query: 2120
Number of query: 2130
Number of query: 2140
Number of query: 2150
Number of query: 2160
Number of query: 2170
Number of query: 2180
Number of query: 2190
Number of query: 2200
Number of query: 2210
Number of query: 2220
Number of query: 2230
End of tag sarcasm
Start of parsing serious
Number of query: 2240
Number of query: 2250
Number of query: 2260
Number of query: 2270
Number of query: 2280
Number of query: 2290
Number of query: 2300
Number of query: 2310
Number of query: 2320
Number of quer

100%|██████████| 60/60 [1:00:02<00:00, 60.04s/it]


Number of query: 3010
Number of query: 3020
Number of query: 3030
Number of query: 3040
Number of query: 3050
Number of query: 3060
Number of query: 3070
Number of query: 3080
Number of query: 3090
Number of query: 3100
End of tag nothing personal
Start of parsing nothing-personal
End of tag nothing-personal
Start of parsing nothingpersonal
End of tag nothingpersonal
