In [1]:
import pandas as pd
import numpy as np
import copy
import re
import string
from langdetect import detect
%matplotlib inline

from tqdm.auto import tqdm

tqdm.pandas()

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Me\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
path_dataset = "../datasets/tonetags_dataset_tumblr.csv"

In [3]:
def str_to_list(value):
    list_values = value.strip('[]').split(', ')
    cleaned_list_values = [item[1:-1] for item in list_values]
    return cleaned_list_values

In [4]:
df1 = pd.read_csv(path_dataset, index_col=0, converters={"tags": str_to_list})
# df = pd.read_csv(path_dataset, index_col=0)

In [5]:
df1.shape

(379191, 7)

In [6]:
df1.head()

Unnamed: 0.1,Unnamed: 0,timestamp,URL,blogName,title,tags,text
0,0.0,2024-03-31 14:49:00 GMT,https://two-hearts-beat.tumblr.com/post/746477...,two-hearts-beat,,"[🙈, genuine question]",we’ve all talked about the ‘gifted child’ to s...
1,1.0,2024-03-31 14:12:40 GMT,https://miniatureketchupbottles.tumblr.com/pos...,miniatureketchupbottles,,"[genuine question, like actually where do peop...",Is the concept of straight girls religiously w...
2,2.0,2024-03-31 09:08:46 GMT,https://www.tumblr.com/blog/view/writtenkiss/7...,writtenkiss,,"[꒰͡ 𝐒𝐇𝐈𝐓𝐏𝐎𝐒𝐓𝐈𝐍𝐆 — piofiore. 🌱, genuine questio...",I am sincerely curious. Has some of you ever w...
3,3.0,2024-03-31 05:06:36 GMT,https://silly-abro.tumblr.com/post/74644098511...,silly-abro,,"[Scene, emo, rawr x3, genuine question, I gues...",Bro idk how people look so fucking good in the...
4,4.0,2024-03-31 03:18:52 GMT,https://www.tumblr.com/blog/view/bet-h/7464342...,bet-h,,"[why do tumblr talk in tags, like why, genuine...",Ur mom


In [7]:
df2 = df1[['tags', 'text']]

In [8]:
df2.head()

Unnamed: 0,tags,text
0,"[🙈, genuine question]",we’ve all talked about the ‘gifted child’ to s...
1,"[genuine question, like actually where do peop...",Is the concept of straight girls religiously w...
2,"[꒰͡ 𝐒𝐇𝐈𝐓𝐏𝐎𝐒𝐓𝐈𝐍𝐆 — piofiore. 🌱, genuine questio...",I am sincerely curious. Has some of you ever w...
3,"[Scene, emo, rawr x3, genuine question, I gues...",Bro idk how people look so fucking good in the...
4,"[why do tumblr talk in tags, like why, genuine...",Ur mom


In [9]:
file_path = "../tonetags.txt"

tone_tags = {}

with open(file_path, 'r', encoding='utf-8') as file:
	for line in file:
		key, value = line.strip().split(':')
		tone_tags[key.strip()] = value.strip().split(',')

for key, values in tone_tags.items():
	print(f"{key}: {values}")

affectionate: ['affectionate', '/a']
copypasta: ['copypasta', 'copypaste', 'repeated text', '/c']
clickbait: ['clickbait', '/cb']
fake: ['fake', '/f']
genuine: ['genuine', '/g', '/gen']
genuine question: ['genuine question', '/genq']
half joking: ['half joking', 'half-joking', 'half joke', 'half-joke', 'halfjoke', '/hf']
hyperbole: ['hyperbole', '/hyp']
inside joke: ['inside joke', 'insidejoke', 'insidejoking', 'inside-joke', 'inside-joking', '/ij']
joking: ['joking', 'joke', '/j']
lyrics: ['lyrics', '/l', '/ly']
light-hearted: ['light-hearted', 'light hearted', 'lighthearted', '/lh']
literal: ['literal', 'literally', '/li']
little upset: ['little upset', 'littleupset', 'little-upset', '/lu-a', '/lu']
metaphorical: ['metaphorical', '/m']
not a vent: ['not a vent', 'notavent', 'not-a-vent', '/nav']
nobody here: ['nobodyhere', 'nobody-here', '/nbh']
negative connotation: ['negative connotation', 'negetiveconnotation', 'negative-connotation', '/neg', '/ng']
neutral connotation: ['neutral 

In [10]:
df3 = df2.copy()

In [11]:
def replace_values(tags_list):
    return [key for key, value in tone_tags.items() if any(tag.lower() in value for tag in tags_list)]

In [12]:
df3['tags'] = df2['tags'].apply(replace_values)

In [13]:
df3.head()

Unnamed: 0,tags,text
0,[genuine question],we’ve all talked about the ‘gifted child’ to s...
1,[genuine question],Is the concept of straight girls religiously w...
2,[genuine question],I am sincerely curious. Has some of you ever w...
3,[genuine question],Bro idk how people look so fucking good in the...
4,[genuine question],Ur mom


In [14]:
list_with_empty_tags = df3[df3['tags'].apply(lambda x: len(x) == 0)].index.tolist()
list_with_empty_tags

[1333,
 21152,
 91564,
 1385548,
 1700697,
 1710109,
 1710110,
 1710111,
 1710112,
 1710113,
 1710114,
 1710115,
 1710116,
 1710117,
 1710118,
 1710119,
 1710120,
 1710121,
 1710122,
 1710123,
 1710124,
 1710125,
 1710126,
 1710127,
 1710128,
 1710129,
 1710130,
 1710131,
 1710132,
 1710133,
 1710134,
 1710135,
 1710136]

In [15]:
for id in list_with_empty_tags:
    print(id, df1.loc[id]['tags'])

1333 ["i'm tweaking"]
21152 ["i'm tweaking"]
91564 ["i'm tweaking"]
1385548 ['']
1700697 ["yeah alex it's called seasonal depression", 'this isnt a vent its just a weird babble about this', 'alex babbling']
1710109 ['bored', 'nobody here', 'boyfriend', 'boyfriend hoodie', 'hoodie', 'wine', 'sad times']
1710110 ['welp', 'nobody here', 'gonna go back to watching digimon']
1710111 ['nobody here', '=', 'offline']
1710112 ['sunset corp', 'nobody here']
1710113 ['Nobody here', 'irl stuff', 'just so angry']
1710114 ['me', 'personal', 'work', 'SO SLOW', 'nobody here', 'writing tutor', 'writing workshop', 'writing studio', 'should be', 'making money', 'working']
1710115 ['nobody here', 'chris de burgh', 'radiovalerie', 'late night', 'whimsy']
1710116 ['blu mar ten', 'nobody here', 'kastle', 'remix', 'could you want me', 'purple']
1710117 ['nobody here', 'lady in red', 'graphics', 'chris deburgh']
1710118 ['NOBODY HERE', 'audio']
1710119 ['Blue Mar Ten', 'Nobody here', 'Kastle']
1710120 ['nobody

In [16]:
df4 = df3[df3['tags'].apply(lambda x: len(x) != 0)]

In [17]:
df5 = df4.explode('tags')

In [18]:
df5.head()

Unnamed: 0,tags,text
0,genuine question,we’ve all talked about the ‘gifted child’ to s...
1,genuine question,Is the concept of straight girls religiously w...
2,genuine question,I am sincerely curious. Has some of you ever w...
3,genuine question,Bro idk how people look so fucking good in the...
4,genuine question,Ur mom


In [19]:
df6 = df5.drop_duplicates()

In [20]:
df6.head()

Unnamed: 0,tags,text
0,genuine question,we’ve all talked about the ‘gifted child’ to s...
1,genuine question,Is the concept of straight girls religiously w...
2,genuine question,I am sincerely curious. Has some of you ever w...
3,genuine question,Bro idk how people look so fucking good in the...
4,genuine question,Ur mom


In [21]:
df7 = df6.dropna(subset=['text'])

In [22]:
df7.head()

Unnamed: 0,tags,text
0,genuine question,we’ve all talked about the ‘gifted child’ to s...
1,genuine question,Is the concept of straight girls religiously w...
2,genuine question,I am sincerely curious. Has some of you ever w...
3,genuine question,Bro idk how people look so fucking good in the...
4,genuine question,Ur mom


In [23]:
tag_size = {}

for tag in df7.tags.unique().tolist():
    tag_size[tag] = df7[df7['tags'] == tag].shape[0]

In [24]:
dict(sorted(tag_size.items(), key=lambda x: x[1]))

{'neutral connotation': 1,
 'not subtweeting': 6,
 'nobody here': 10,
 'negative connotation': 29,
 'little upset': 40,
 'not passive aggressive': 43,
 'not at you': 43,
 'teasing': 48,
 'positive connotation': 53,
 'not forced': 61,
 'metaphorical': 91,
 'non-serious': 209,
 'threat': 244,
 'hyperbole': 315,
 'fake': 322,
 'light-hearted': 452,
 'affectionate': 843,
 'not mad': 1400,
 'ironic': 1441,
 'not a vent': 1761,
 'genuine': 1849,
 'clickbait': 2289,
 'rhetorical': 2299,
 'half joking': 2522,
 'sarcastic': 2735,
 'copypasta': 2773,
 'serious': 4430,
 'passive aggressive': 4562,
 'nothing personal': 4857,
 'romantic': 5228,
 'inside joke': 5599,
 'genuine question': 7180,
 'platonic': 10105,
 'reference': 13706,
 'joking': 13998,
 'lyrics': 18911,
 'quote': 35245,
 'literal': 61888}

In [25]:
printable = set(string.printable)

df8 = df7.copy()
df8["text"] = df8.text.apply(lambda y: ''.join(filter(lambda x: x in printable, y)))
df8.head()

Unnamed: 0,tags,text
0,genuine question,weve all talked about the gifted child to stru...
1,genuine question,Is the concept of straight girls religiously w...
2,genuine question,I am sincerely curious. Has some of you ever w...
3,genuine question,Bro idk how people look so fucking good in the...
4,genuine question,Ur mom


In [26]:
def langdetect(text):
    try:
        lang = detect(text)
        if lang != "en":
            return False
        else:
            return True
    except:
        return False

In [27]:
df9 = df8[df8.text.apply(langdetect)]

In [28]:
df9.head()

Unnamed: 0,tags,text
0,genuine question,weve all talked about the gifted child to stru...
1,genuine question,Is the concept of straight girls religiously w...
2,genuine question,I am sincerely curious. Has some of you ever w...
3,genuine question,Bro idk how people look so fucking good in the...
5,genuine question,Advice for a beginner witch who has no supplie...


In [39]:
df9.tags.value_counts() / len(df9) * 100

tags
literal                   30.592835
quote                     16.093070
lyrics                     8.853589
joking                     6.667662
reference                  6.580716
platonic                   5.039153
genuine question           3.699219
inside joke                2.463301
nothing personal           2.403026
passive aggressive         2.322480
serious                    2.284074
romantic                   2.110715
copypasta                  1.390074
sarcastic                  1.371405
half joking                1.294060
rhetorical                 1.178309
clickbait                  1.114833
genuine                    0.919071
not a vent                 0.865729
ironic                     0.700371
not mad                    0.694504
affectionate               0.400060
light-hearted              0.235235
hyperbole                  0.162691
fake                       0.137087
threat                     0.117351
non-serious                0.103482
metaphorical           

In [29]:
df9_4 = df9[(df9["tags"] != "quote") & (df9["tags"] != "literal")]

In [30]:
from autocorrect import Speller
spell = Speller()

def correct_text(text):
    print("\n"+text)
    return spell(text)

In [31]:
df9_5 = df9_4.copy().reset_index(drop=True)
df9_5['text'] = df9_5["text"].progress_apply(lambda x: correct_text(x))

  0%|          | 0/99949 [00:00<?, ?it/s]


weve all talked about the gifted child to struggling adult pipeline, but is there also an 'old soul to immature-seeming adult pipeline? (or is that just being on the spectrum?)

Is the concept of straight girls religiously watching gay shows really weird or am I just a horrible person

I am sincerely curious. Has some of you ever written a piofiore fanfiction? Because. I need content of this game. Please drop the title if you did sOBS

Bro idk how people look so fucking good in the fake nerd glasses

Advice for a beginner witch who has no supplies for what you can do to deal with period related issues? Its looking like my blockers arent as effective this month and well dysphoria, mood swings and cramps look to be in my future

I am trans masc if that affects the magic you know of

How do I grow enough at my apartment complex to reduce my dependence on grocery stores

what am I doing tonight 

get drunk and give myself a haircut

get high and do chores

get crossed and watch one piece


KeyboardInterrupt: 

In [32]:
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [w for w in word_tokens if not w in stop_words]

In [33]:
df10 = df9_5.copy()
df10["text"] = df10.text.apply(remove_stopwords)

In [34]:
df10.head()

Unnamed: 0,tags,text
0,genuine question,"[weve, talked, gifted, child, struggling, adul..."
1,genuine question,"[Is, concept, straight, girls, religiously, wa..."
2,genuine question,"[I, sincerely, curious, ., Has, ever, written,..."
3,genuine question,"[Bro, idk, people, look, fucking, good, fake, ..."
5,genuine question,"[Advice, beginner, witch, supplies, deal, peri..."


In [35]:
def remove(tokens):
    if len(tokens) <= 5:
        return False
    else:
        total = 0
        total_chars = 0
        for token in tokens:
            total += len(token)
            for char in token:
                if char.isalpha():
                    total_chars += 1
        if total_chars / total >= 0.8:
            return True
        else:
            return False

In [36]:
df11 = df10.copy()
df11 = df11[df11["text"].apply(remove)]

In [37]:
df11.shape

(87465, 2)

In [38]:
tag_size = {}

for tag in df11.tags.unique().tolist():
    tag_size[tag] = df11[df11['tags'] == tag].shape[0]

tag_size = dict(sorted(tag_size.items(), key=lambda x: x[1]))
tag_size

df11.tags.value_counts() / len(df11) * 100

tags
lyrics                    16.447722
reference                 12.079117
joking                    11.789859
platonic                  10.080604
genuine question           6.878180
passive aggressive         4.686446
nothing personal           4.605271
serious                    4.601841
inside joke                4.104499
romantic                   3.787801
copypasta                  2.845710
sarcastic                  2.676499
half joking                2.519865
rhetorical                 2.183731
clickbait                  2.128852
genuine                    1.788144
not a vent                 1.604070
ironic                     1.358258
not mad                    1.334248
affectionate               0.661979
light-hearted              0.473332
hyperbole                  0.328131
fake                       0.259532
threat                     0.201223
non-serious                0.196650
metaphorical               0.086892
not forced                 0.056022
positive connotation   

In [None]:
df13 = df11.copy()

for key, value in tag_size.items():
    if value < 1000:
        df13 = df13[df13["tags"] != key]

In [None]:
tag_size = {}

for tag in df13.tags.unique().tolist():
    tag_size[tag] = df13[df13['tags'] == tag].shape[0]

tag_size = dict(sorted(tag_size.items(), key=lambda x: x[1]))
tag_size

In [None]:
df13.shape

In [None]:
df13.tags

In [None]:
df13.tags.value_counts() / len(df13) * 100

In [None]:
df13.to_csv("../datasets/tonetags_dataset_tumblr_clean_corrected_text.csv", index=False)