In [1]:
import pandas as pd
from nltk import word_tokenize
from gensim.models import Word2Vec



In [2]:
# Load the GoEmotions CSV, keep only the rows that are not flagged as
# "very unclear", and rename the text column to processed_tweet_text so the
# schema lines up with the other data file.
df = (
    pd.read_csv("go_emotions_dataset.csv")
    # .ne(True) keeps every row whose flag is anything other than True,
    # i.e. exactly the rows the `== True` filter would not drop.
    .loc[lambda frame: frame["example_very_unclear"].ne(True)]
    .rename(columns={"text": "processed_tweet_text"})
)

df.head()

Unnamed: 0,id,processed_tweet_text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,eespn2i,Right? Considering it’s such an important docu...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Group the emotion columns by whether they are treated as a possible signal
# of cyberbullying.
#
# non_neg_cols: emotions NOT treated as a cyberbullying signal. Besides the
# positive emotions this also holds negative ones such as fear, grief and
# sadness. The list is not used by the labelling below; it is kept as a
# reference, with each column listed exactly once.
non_neg_cols = [
    "admiration",
    "amusement",
    "approval",
    "caring",
    "desire",
    "excitement",
    "gratitude",
    "joy",
    "love",
    "optimism",
    "pride",
    "fear",
    "grief",
    "embarrassment",
    "nervousness",
    "remorse",
    "sadness",
]

# neg_cols: emotions treated as a cyberbullying signal. Any emotion column
# that is not listed here has no influence on the label.
neg_cols = ["anger", "annoyance", "disgust"]

# Sanity checks: no column repeated within a group, and no column in both.
assert len(set(non_neg_cols)) == len(non_neg_cols), "duplicate column in non_neg_cols"
assert not set(non_neg_cols) & set(neg_cols), "column listed in both groups"

# Label a row 1 when at least one of the neg_cols emotions is set, else 0.
# Derived in a single assignment so re-running the cell always yields the
# same column.
has_negative_emotion = df[neg_cols].sum(axis=1) > 0
df['cyberbullying_type'] = has_negative_emotion.astype('int64')

# Summary of the classification of the data set: absolute counts, then
# percentages per class.
classification_summary = df['cyberbullying_type'].value_counts(normalize=True) * 100

print(df['cyberbullying_type'].value_counts())
print(classification_summary)

# Example of a row classified as not cyberbullying (0)
not_cyberbullying_example = df[df['cyberbullying_type'] == 0].iloc[0]

print(not_cyberbullying_example)

# Example of a row classified as cyberbullying (1)
cyberbullying_example = df[df['cyberbullying_type'] == 1].iloc[0]

print(cyberbullying_example)

cyberbullying_type
0    183139
1     24675
Name: count, dtype: int64
cyberbullying_type
0    88.126401
1    11.873599
Name: proportion, dtype: float64
id                              eew5j0j
processed_tweet_text    That game hurt.
example_very_unclear              False
admiration                            0
amusement                             0
anger                                 0
annoyance                             0
approval                              0
caring                                0
confusion                             0
curiosity                             0
desire                                0
disappointment                        0
disapproval                           0
disgust                               0
embarrassment                         0
excitement                            0
fear                                  0
gratitude                             0
grief                                 0
joy                                   0
love     

In [4]:
# Destination CSV for the labelled data set
FILE = 'google_cyberbullying_dataset.csv'

# Keep only the cyberbullying label and the tweet text (label column first)
google_emotion_df = df.loc[:, ['cyberbullying_type', 'processed_tweet_text']]

# Write the two-column frame to disk without the index column
google_emotion_df.to_csv(path_or_buf=FILE, index=False)